From b09d42f2fcd3bfaf83e3bc88db54ca0b7c3f4210 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Wed, 1 Feb 2023 09:34:47 -0500
Subject: [PATCH 01/81] Adding support for sse2neon
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                              |   14 +
 ext/sse2neon/CMakeLists.txt                 |    9 +
 ext/sse2neon/src/include/sse2neon.h         | 9079 +++++++++++++++++++
 share/cmake/utils/CheckSupportARMNeon.cmake |   21 +
 share/cmake/utils/CheckSupportSSE2.cmake    |   35 +-
 share/cmake/utils/CompilerFlags.cmake       |   30 +-
 src/OpenColorIO/CMakeLists.txt              |   24 +-
 src/OpenColorIO/SSE.h                       |    8 +-
 tests/cpu/CMakeLists.txt                    |   17 +
 9 files changed, 9204 insertions(+), 33 deletions(-)
 create mode 100644 ext/sse2neon/CMakeLists.txt
 create mode 100644 ext/sse2neon/src/include/sse2neon.h
 create mode 100644 share/cmake/utils/CheckSupportARMNeon.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3911a154ce..190afe7426 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -158,6 +158,20 @@ option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ociocon
 include(CheckSupportGL)
 
 
+###############################################################################
+# Check for ARM neon intrinsics (armv8)
+
+include(CheckSupportARMNeon)
+
+
+###############################################################################
+# Add sse2neon to the build since CompilerFlags needs to know if SSE2 is supported.
+
+if(HAVE_NEON)
+    add_subdirectory(ext/sse2neon)
+endif()
+
+
 ###############################################################################
 # Define compilation and link flags
 
diff --git a/ext/sse2neon/CMakeLists.txt b/ext/sse2neon/CMakeLists.txt
new file mode 100644
index 0000000000..7fd5fcb302
--- /dev/null
+++ b/ext/sse2neon/CMakeLists.txt
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+# sse2neon (modified)
+# https://github.com/DLTcollab/sse2neon
+add_library(sse2neon INTERFACE IMPORTED GLOBAL)
+set_target_properties(sse2neon PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/src/include"
+)
diff --git a/ext/sse2neon/src/include/sse2neon.h b/ext/sse2neon/src/include/sse2neon.h
new file mode 100644
index 0000000000..164c6c3387
--- /dev/null
+++ b/ext/sse2neon/src/include/sse2neon.h
@@ -0,0 +1,9079 @@
+#ifndef SSE2NEON_H
+#define SSE2NEON_H
+
+// This header file provides a simple API translation layer
+// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
+//
+// Contributors to this work are:
+//   John W. Ratcliff <jratcliffscarab@gmail.com>
+//   Brandon Rowlett <browlett@nvidia.com>
+//   Ken Fast <kfast@gdeb.com>
+//   Eric van Beurden <evanbeurden@nvidia.com>
+//   Alexander Potylitsin <apotylitsin@nvidia.com>
+//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
+//   Jim Huang <jserv@ccns.ncku.edu.tw>
+//   Mark Cheng <marktwtn@gmail.com>
+//   Malcolm James MacLeod <malcolm@gulden.com>
+//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
+//   Sebastian Pop <spop@amazon.com>
+//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
+//   Danila Kutenin <danilak@google.com>
+//   François Turban (JishinMaster) <francois.turban@gmail.com>
+//   Pei-Hsuan Hung <afcidk@gmail.com>
+//   Yang-Hao Yuan <yuanyanghau@gmail.com>
+//   Syoyo Fujita <syoyo@lighttransport.com>
+//   Brecht Van Lommel <brecht@blender.org>
+//   Jonathan Hue <jhue@adobe.com>
+//   Cuda Chen <clh960524@gmail.com>
+//   Aymen Qader <aymen.qader@arm.com>
+
+/*
+ * sse2neon is freely redistributable under the MIT License.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Tunable configurations */
+
+/* Enable precise implementation of math operations
+ * This would slow down the computation a bit, but gives consistent result with
+ * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
+ */
+/* _mm_min|max_ps|ss|pd|sd */
+#ifndef SSE2NEON_PRECISE_MINMAX
+#define SSE2NEON_PRECISE_MINMAX (0)
+#endif
+/* _mm_rcp_ps and _mm_div_ps */
+#ifndef SSE2NEON_PRECISE_DIV
+#define SSE2NEON_PRECISE_DIV (0)
+#endif
+/* _mm_sqrt_ps and _mm_rsqrt_ps */
+#ifndef SSE2NEON_PRECISE_SQRT
+#define SSE2NEON_PRECISE_SQRT (0)
+#endif
+/* _mm_dp_pd */
+#ifndef SSE2NEON_PRECISE_DP
+#define SSE2NEON_PRECISE_DP (0)
+#endif
+
+/* compiler specific definitions */
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
+#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
+#else /* non-GNU / non-clang compilers */
+#warning "Macro name collisions may happen with unsupported compiler."
+#ifndef FORCE_INLINE
+#define FORCE_INLINE static inline
+#endif
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#define _sse2neon_likely(x) (x)
+#define _sse2neon_unlikely(x) (x)
+#endif
+
+/* C language does not allow initializing a variable with a function call. */
+#ifdef __cplusplus
+#define _sse2neon_const static const
+#else
+#define _sse2neon_const const
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#if defined(_WIN32)
+/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
+ * from both MinGW-w64 and MSVC.
+ */
+#define SSE2NEON_ALLOC_DEFINED
+#endif
+
+/* If using MSVC */
+#ifdef _MSC_VER
+#include <intrin.h>
+#if (defined(_M_AMD64) || defined(__x86_64__)) || \
+    (defined(_M_ARM) || defined(__arm__))
+#define SSE2NEON_HAS_BITSCAN64
+#endif
+#endif
+
+/* Compiler barrier */
+#define SSE2NEON_BARRIER()                     \
+    do {                                       \
+        __asm__ __volatile__("" ::: "memory"); \
+        (void) 0;                              \
+    } while (0)
+
+/* Memory barriers
+ * __atomic_thread_fence does not include a compiler barrier; instead,
+ * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
+ * semantics.
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+#include <stdatomic.h>
+#endif
+
+FORCE_INLINE void _sse2neon_smp_mb(void)
+{
+    SSE2NEON_BARRIER();
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
+    !defined(__STDC_NO_ATOMICS__)
+    atomic_thread_fence(memory_order_seq_cst);
+#elif defined(__GNUC__) || defined(__clang__)
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+#else
+    /* FIXME: MSVC support */
+#endif
+}
+
+/* Architecture-specific build options */
+/* FIXME: #pragma GCC push_options is only available on GCC */
+#if defined(__GNUC__)
+#if defined(__arm__) && __ARM_ARCH == 7
+/* According to ARM C Language Extensions Architecture specification,
+ * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
+ * architecture supported.
+ */
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("fpu=neon")
+#endif
+#elif defined(__aarch64__)
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("+simd")
+#endif
+#elif __ARM_ARCH == 8
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error \
+    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#endif
+#else
+#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#endif
+#endif
+
+#include <arm_neon.h>
+#if !defined(__aarch64__) && (__ARM_ARCH == 8)
+#if defined __has_include && __has_include(<arm_acle.h>)
+#include <arm_acle.h>
+#endif
+#endif
+
+/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD
+ * and other Arm microarchtectures use.
+ * From sysctl -a on Apple M1:
+ * hw.cachelinesize: 128
+ */
+#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
+#define SSE2NEON_CACHELINE_SIZE 128
+#else
+#define SSE2NEON_CACHELINE_SIZE 64
+#endif
+
+/* Rounding functions require either Aarch64 instructions or libm failback */
+#if !defined(__aarch64__)
+#include <math.h>
+#endif
+
+/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
+ * or even not accessible in user mode.
+ * To write or access to these registers in user mode,
+ * we have to perform syscall instead.
+ */
+#if !defined(__aarch64__)
+#include <sys/time.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if defined(__GNUC__) && (__GNUC__ <= 9)
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
+
+// __builtin_shuffle introduced in GCC 4.7.0
+#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
+#define HAS__builtin_shuffle 1
+#else
+#define HAS__builtin_shuffle 0
+#endif
+
+#define HAS__builtin_shufflevector 0
+#define HAS__builtin_nontemporal_store 0
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+#if __has_builtin(__builtin_shufflevector)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __builtin_shufflevector(a, b, __VA_ARGS__)
+#elif __has_builtin(__builtin_shuffle)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __extension__({                        \
+        type tmp = {__VA_ARGS__};          \
+        __builtin_shuffle(a, b, tmp);      \
+    })
+#endif
+
+#ifdef _sse2neon_shuffle
+#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
+#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
+#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
+#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
+#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
+#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
+#endif
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+#define _MM_FROUND_NO_EXC 0x08
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+#define _MM_ROUND_NEAREST 0x0000
+#define _MM_ROUND_DOWN 0x2000
+#define _MM_ROUND_UP 0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+/* Flush zero mode macros. */
+#define _MM_FLUSH_ZERO_MASK 0x8000
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_FLUSH_ZERO_OFF 0x0000
+/* Denormals are zeros mode macros. */
+#define _MM_DENORMALS_ZERO_MASK 0x0040
+#define _MM_DENORMALS_ZERO_ON 0x0040
+#define _MM_DENORMALS_ZERO_OFF 0x0000
+
+/* indicate immediate constant argument in a given range */
+#define __constrange(a, b) const
+
+/* A few intrinsics accept traditional data types like ints or floats, but
+ * most operate on data types that are specific to SSE.
+ * If a vector type ends in d, it contains doubles, and if it does not have
+ * a suffix, it contains floats. An integer vector type can contain any type
+ * of integer, from chars to shorts to unsigned long longs.
+ */
+typedef int64x1_t __m64;
+typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
+// On ARM 32-bit architecture, the float64x2_t is not supported.
+// The data type __m128d should be represented in a different way for related
+// intrinsic conversion.
+#if defined(__aarch64__)
+typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
+#else
+typedef float32x4_t __m128d;
+#endif
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
+
+// __int64 is defined in the Intrinsics Guide which maps to different datatype
+// in different data model
+#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
+#if (defined(__x86_64__) || defined(__i386__))
+#define __int64 long long
+#else
+#define __int64 int64_t
+#endif
+#endif
+
+/* type-safe casting between types */
+
+#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
+#define vreinterpretq_m128_f32(x) (x)
+#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
+
+#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
+#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
+#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
+#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
+#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
+#define vreinterpretq_f32_m128(x) (x)
+#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
+
+#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
+#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
+#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
+#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
+#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
+#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_m128i_s64(x) (x)
+
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
+
+#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
+#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
+#define vreinterpretq_s64_m128i(x) (x)
+
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
+
+#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
+#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
+#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
+#define vreinterpret_m64_s64(x) (x)
+
+#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
+#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
+#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
+#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
+
+#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
+#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
+#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
+
+#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
+#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
+#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
+#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
+
+#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
+#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
+#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
+#define vreinterpret_s64_m64(x) (x)
+
+#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
+
+#if defined(__aarch64__)
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
+
+#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
+#define vreinterpretq_m128d_f64(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
+
+#define vreinterpretq_f64_m128d(x) (x)
+#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
+#else
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128d_f32(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_f32_m128d(x) (x)
+#endif
+
+// A struct is defined in this header file called 'SIMDVec' which can be used
+// by applications which attempt to access the contents of an __m128 struct
+// directly.  It is important to note that accessing the __m128 struct directly
+// is bad coding practice by Microsoft: @see:
+// https://learn.microsoft.com/en-us/cpp/cpp/m128
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly so the developer can use the SIMDVec as an alias for it.  Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base NEON data type for intrinsic operations.
+//
+// union intended to allow direct access to an __m128 variable using the names
+// that the MSVC compiler provides.  This union should really only be used when
+// trying to access the members of the vector as integer values.  GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause
+// a performance hit.  If it really is needed however, the original __m128
+// variable can be aliased with a pointer to this union and used to access
+// individual components.  The use of this union should be hidden behind a macro
+// that is used throughout the codebase to access the members instead of always
+// declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
+    int8_t m128_i8[16];    // as signed 8-bit integers.
+    int16_t m128_i16[8];   // as signed 16-bit integers.
+    int32_t m128_i32[4];   // as signed 32-bit integers.
+    int64_t m128_i64[2];   // as signed 64-bit integers.
+    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
+    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
+    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
+    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
+
+/* SSE macros */
+#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
+#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
+#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
+#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
+
+// Function declaration
+// SSE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
+FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
+FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
+FORCE_INLINE __m128 _mm_set_ps1(float);
+FORCE_INLINE __m128 _mm_setzero_ps(void);
+// SSE2
+FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_castps_si128(__m128);
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
+FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
+FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
+FORCE_INLINE __m128d _mm_set_pd(double, double);
+FORCE_INLINE __m128i _mm_set1_epi32(int);
+FORCE_INLINE __m128i _mm_setzero_si128();
+// SSE4.1
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
+FORCE_INLINE __m128 _mm_ceil_ps(__m128);
+FORCE_INLINE __m128d _mm_floor_pd(__m128d);
+FORCE_INLINE __m128 _mm_floor_ps(__m128);
+FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
+FORCE_INLINE __m128 _mm_round_ps(__m128, int);
+// SSE4.2
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
+
+/* Backwards compatibility for compilers with lack of specific type support */
+
+// Older gcc does not define vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__) &&                        \
+    ((__GNUC__ <= 12 && defined(__arm__)) ||                           \
+     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
+     (__GNUC__ <= 9 && defined(__aarch64__)))
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    uint8x16x4_t ret;
+    ret.val[0] = vld1q_u8(p + 0);
+    ret.val[1] = vld1q_u8(p + 16);
+    ret.val[2] = vld1q_u8(p + 32);
+    ret.val[3] = vld1q_u8(p + 48);
+    return ret;
+}
+#else
+// Wraps vld1q_u8_x4
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    return vld1q_u8_x4(p);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddv u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
+    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
+}
+#else
+// Wraps vaddv_u8
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    return vaddv_u8(v8);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddvq u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+    uint8_t res = 0;
+    for (int i = 0; i < 8; ++i)
+        res += tmp[i];
+    return res;
+}
+#else
+// Wraps vaddvq_u8
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    return vaddvq_u8(a);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddvq u16 variant */
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    uint32x4_t m = vpaddlq_u16(a);
+    uint64x2_t n = vpaddlq_u32(m);
+    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
+
+    return vget_lane_u32((uint32x2_t) o, 0);
+}
+#else
+// Wraps vaddvq_u16
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    return vaddvq_u16(a);
+}
+#endif
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<data_type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <data_type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <data_type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors cantain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *                            signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *                            unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *                      than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ *   // Set packed 8-bit integers
+ *   // 128 bits, 16 chars, per 8 bits
+ *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
+ *   // Shuffle packed 8-bit integers
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ */
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint {
+    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
+    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
+    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
+    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
+};
+
+// The bit field mapping to the FPCR(floating-point control register)
+typedef struct {
+    uint16_t res0;
+    uint8_t res1 : 6;
+    uint8_t bit22 : 1;
+    uint8_t bit23 : 1;
+    uint8_t bit24 : 1;
+    uint8_t res2 : 7;
+#if defined(__aarch64__)
+    uint32_t res3;
+#endif
+} fpcr_bitfield;
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of b and places it into the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in high
+// end of result takes the higher two 32 bit values from b and swaps them and
+// places in low end of result.
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
+{
+    float32x2_t a21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+{
+    float32x2_t a03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
+}
+
+// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
+// high
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
+{
+    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
+{
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
+{
+    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
+{
+    float32x2_t a33 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
+}
+
+// Kahan summation for accurate summation of floating-point numbers.
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
+FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
+{
+    y -= *c;
+    float t = *sum + y;
+    *c = (t - *sum) - y;
+    *sum = t;
+}
+
+#if defined(__ARM_FEATURE_CRYPTO) && \
+    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
+// Wraps vmull_p64
+FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
+    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
+    return vreinterpretq_u64_p128(vmull_p64(a, b));
+}
+#else  // ARMv7 polyfill
+// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
+//
+// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
+// 64-bit->128-bit polynomial multiply.
+//
+// It needs some work and is somewhat slow, but it is still faster than all
+// known scalar methods.
+//
+// Algorithm adapted to C from
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
+// from "Fast Software Polynomial Multiplication on ARM Processors Using the
+// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+// (https://hal.inria.fr/hal-01506572)
+static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly8x8_t a = vreinterpret_p8_u64(_a);
+    poly8x8_t b = vreinterpret_p8_u64(_b);
+
+    // Masks
+    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
+                                    vcreate_u8(0x00000000ffffffff));
+    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
+                                    vcreate_u8(0x0000000000000000));
+
+    // Do the multiplies, rotating with vext to get all combinations
+    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
+    uint8x16_t e =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
+    uint8x16_t f =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
+    uint8x16_t g =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
+    uint8x16_t h =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
+    uint8x16_t i =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
+    uint8x16_t j =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
+    uint8x16_t k =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
+
+    // Add cross products
+    uint8x16_t l = veorq_u8(e, f);  // L = E + F
+    uint8x16_t m = veorq_u8(g, h);  // M = G + H
+    uint8x16_t n = veorq_u8(i, j);  // N = I + J
+
+    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
+    // instructions.
+#if defined(__aarch64__)
+    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+#else
+    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
+    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
+    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
+    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
+#endif
+    // t0 = (L) (P0 + P1) << 8
+    // t1 = (M) (P2 + P3) << 16
+    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
+    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
+    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
+
+    // t2 = (N) (P4 + P5) << 24
+    // t3 = (K) (P6 + P7) << 32
+    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
+    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
+    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
+
+    // De-interleave
+#if defined(__aarch64__)
+    uint8x16_t t0 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t1 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t2 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+    uint8x16_t t3 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+#else
+    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
+    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
+    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
+    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
+#endif
+    // Shift the cross products
+    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
+    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
+    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
+    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
+
+    // Accumulate the products
+    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
+    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
+    uint8x16_t mix = veorq_u8(d, cross1);
+    uint8x16_t r = veorq_u8(mix, cross2);
+    return vreinterpretq_u64_u8(r);
+}
+#endif  // ARMv7 polyfill
+
+// C equivalent:
+//   __m128i _mm_shuffle_epi32_default(__m128i a,
+//                                     __constrange(0, 255) int imm) {
+//       __m128i ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+#define _mm_shuffle_epi32_default(a, imm)                                   \
+    __extension__({                                                         \
+        int32x4_t ret;                                                      \
+        ret = vmovq_n_s32(                                                  \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                        \
+        vreinterpretq_m128i_s32(ret);                                       \
+    })
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of a and places it into the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in low end
+// of result takes the higher two 32 bit values from a and swaps them and places
+// in high end of result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+}
+
+// rotates the least significant 32 bits into the most significant 32 bits, and
+// shifts the rest down
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
+
+// rotates the most significant 32 bits into the least significant 32 bits, and
+// shifts the rest up
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
+
+// gets the lower 64 bits of a, and places it in the upper 64 bits
+// gets the lower 64 bits of a and places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+{
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
+// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
+// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
+// places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+{
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
+{
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+}
+
+// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
+// int imm)
+#if defined(__aarch64__)
+#define _mm_shuffle_epi32_splat(a, imm)                          \
+    __extension__({                                              \
+        vreinterpretq_m128i_s32(                                 \
+            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
+    })
+#else
+#define _mm_shuffle_epi32_splat(a, imm)                                      \
+    __extension__({                                                          \
+        vreinterpretq_m128i_s32(                                             \
+            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
+    })
+#endif
+
+// NEON does not support a general purpose permute intrinsic.
+// Shuffle single-precision (32-bit) floating-point elements in a using the
+// control in imm8, and store the results in dst.
+//
+// C equivalent:
+//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+//                                 __constrange(0, 255) int imm) {
+//       __m128 ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+//
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
+#define _mm_shuffle_ps_default(a, b, imm)                                  \
+    __extension__({                                                        \
+        float32x4_t ret;                                                   \
+        ret = vmovq_n_f32(                                                 \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                       \
+        vreinterpretq_m128_f32(ret);                                       \
+    })
+
+// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
+// Store the results in the low 64 bits of dst, with the high 64 bits being
+// copied from from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
+#define _mm_shufflelo_epi16_function(a, imm)                                  \
+    __extension__({                                                           \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
+        int16x4_t lowBits = vget_low_s16(ret);                                \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3);                                              \
+        vreinterpretq_m128i_s16(ret);                                         \
+    })
+
+// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
+// Store the results in the high 64 bits of dst, with the low 64 bits being
+// copied from from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
+#define _mm_shufflehi_epi16_function(a, imm)                                   \
+    __extension__({                                                            \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
+        int16x4_t highBits = vget_high_s16(ret);                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+                             5);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+                             6);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+                             7);                                               \
+        vreinterpretq_m128i_s16(ret);                                          \
+    })
+
+/* MMX */
+
+//_mm_empty is a no-op on arm
+FORCE_INLINE void _mm_empty(void) {}
+
+/* SSE */
+
+// Add packed single-precision (32-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Add the lower single-precision (32-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of <a>.
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
+}
+
+// Compute the bitwise AND of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vbicq_s32(vreinterpretq_s32_m128(b),
+                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(
+        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
+FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for greater-than-or-equal, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
+FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmple_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
+FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
+FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
+FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
+FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-greater-than, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
+FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
+FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-less-than-or-equal, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
+FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
+FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-less-than, store the result in the lower element of dst, and copy
+// the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
+FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// to see if neither is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
+//
+// See also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
+{
+    // Note: NEON does not have ordered compare builtin
+    // Need to compare a eq a and b eq b to check for NaN
+    // Do AND of results to get final
+    uint32x4_t ceqaa =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t ceqbb =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b to see if neither is NaN, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
+FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// to see if either is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
+FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
+{
+    uint32x4_t f32a =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t f32b =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b to see if either is NaN, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
+FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for equality, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
+FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_eq_b =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_eq_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for greater-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
+FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_ge_b =
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_ge_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for greater-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
+FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_gt_b =
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_gt_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for less-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
+FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_le_b =
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_le_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for less-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_lt_b =
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_lt_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for not-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
+FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+{
+    return !_mm_comieq_ss(a, b);
+}
+
+// Convert packed signed 32-bit integers in b to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, and copy the upper 2 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
+FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
+FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
+#else
+    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
+#endif
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
+FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
+FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
+                          0);
+#else
+    float32_t data = vgetq_lane_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
+    return (int32_t) data;
+#endif
+}
+
+// Convert packed 16-bit integers in a to packed single-precision (32-bit)
+// floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
+FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
+}
+
+// Convert packed 32-bit integers in b to packed single-precision (32-bit)
+// floating-point elements, store the results in the lower 2 elements of dst,
+// and copy the upper 2 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
+FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, then convert the packed signed 32-bit integers in b to
+// single-precision (32-bit) floating-point element, and store the results in
+// the upper 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
+FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
+}
+
+// Convert the lower packed 8-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
+FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 16-bit integers, and store the results in dst. Note: this intrinsic
+// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
+// 0x7FFFFFFF.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
+FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
+{
+    return vreinterpret_m64_s16(
+        vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
+#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 8-bit integers, and store the results in lower 4 elements of dst.
+// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
+// between 0x7F and 0x7FFFFFFF.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
+FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
+{
+    return vreinterpret_m64_s8(vqmovn_s16(
+        vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
+}
+
+// Convert packed unsigned 16-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
+FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
+}
+
+// Convert the lower packed unsigned 8-bit integers in a to packed
+// single-precision (32-bit) floating-point elements, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
+FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_u32(
+        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
+#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
+
+// Convert the signed 64-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
+FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Copy the lower single-precision (32-bit) floating-point element of a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
+FORCE_INLINE float _mm_cvtss_f32(__m128 a)
+{
+    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
+#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
+FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
+#else
+    float32_t data = vgetq_lane_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
+    return (int64_t) data;
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
+FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
+{
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
+FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
+{
+    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
+#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
+#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
+FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
+{
+    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Divide packed single-precision (32-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
+    return vreinterpretq_m128_f32(
+        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#endif
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+#endif
+}
+
+// Divide the lower single-precision (32-bit) floating-point element in a by the
+// lower single-precision (32-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper 3 packed elements from a to
+// the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
+#define _mm_extract_pi16(a, imm) \
+    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
+
+// Free aligned memory that was allocated with _mm_malloc.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
+#if !defined(SSE2NEON_ALLOC_DEFINED)
+FORCE_INLINE void _mm_free(void *addr)
+{
+    free(addr);
+}
+#endif
+
+// Macro: Get the flush zero bits from the MXCSR control and status register.
+// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
+// _MM_FLUSH_ZERO_OFF
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
+FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
+}
+
+// Macro: Get the rounding mode bits from the MXCSR control and status register.
+// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
+// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    if (r.field.bit22) {
+        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
+    } else {
+        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
+    }
+}
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
+#define _mm_insert_pi16(a, b, imm)                               \
+    __extension__({                                              \
+        vreinterpret_m64_s16(                                    \
+            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
+    })
+
+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
+FORCE_INLINE __m128 _mm_load_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+//
+//   dst[31:0] := MEM[mem_addr+31:mem_addr]
+//   dst[63:32] := MEM[mem_addr+31:mem_addr]
+//   dst[95:64] := MEM[mem_addr+31:mem_addr]
+//   dst[127:96] := MEM[mem_addr+31:mem_addr]
+//
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
+#define _mm_load_ps1 _mm_load1_ps
+
+// Load a single-precision (32-bit) floating-point element from memory into the
+// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
+// aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
+FORCE_INLINE __m128 _mm_load_ss(const float *p)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
+FORCE_INLINE __m128 _mm_load1_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
+}
+
+// Load 2 single-precision (32-bit) floating-point elements from memory into the
+// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
+// mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
+FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
+}
+
+// Load 2 single-precision (32-bit) floating-point elements from memory into the
+// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
+// mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
+FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
+}
+
+// Load 4 single-precision (32-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
+FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
+{
+    float32x4_t v = vrev64q_f32(vld1q_f32(p));
+    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
+}
+
+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from memory into dst. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
+FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
+{
+    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
+    // equivalent for neon
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load unaligned 16-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
+FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
+{
+    return vreinterpretq_m128i_s16(
+        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
+}
+
+// Load unaligned 64-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
+FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+}
+
+// Allocate size bytes of memory, aligned to the alignment specified in align,
+// and return a pointer to the allocated memory. _mm_free should be used to free
+// memory that is allocated with _mm_malloc.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
+#if !defined(SSE2NEON_ALLOC_DEFINED)
+FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
+{
+    void *ptr;
+    if (align == 1)
+        return malloc(size);
+    if (align == 2 || (sizeof(void *) == 8 && align == 4))
+        align = sizeof(void *);
+    if (!posix_memalign(&ptr, align, size))
+        return ptr;
+    return NULL;
+}
+#endif
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
+FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
+{
+    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
+    __m128 b = _mm_load_ps((const float *) mem_addr);
+    int8x8_t masked =
+        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
+                vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
+    vst1_s8((int8_t *) mem_addr, masked);
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
+#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
+FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b,
+// and store packed maximum values in dst. dst does not follow the IEEE Standard
+// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
+// signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128_f32(
+        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
+FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b, store the maximum value in the lower element of dst, and copy the upper 3
+// packed elements from a to the upper element of dst. dst does not follow the
+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
+// inputs are NaN or signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
+FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
+{
+    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
+FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b,
+// and store packed minimum values in dst. dst does not follow the IEEE Standard
+// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
+// signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
+FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128_f32(
+        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
+FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b, store the minimum value in the lower element of dst, and copy the upper 3
+// packed elements from a to the upper element of dst. dst does not follow the
+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
+// inputs are NaN or signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
+FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+{
+    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Move the lower single-precision (32-bit) floating-point element from b to the
+// lower element of dst, and copy the upper 3 packed elements from a to the
+// upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
+FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
+                       vreinterpretq_f32_m128(a), 0));
+}
+
+// Move the upper 2 single-precision (32-bit) floating-point elements from b to
+// the lower 2 elements of dst, and copy the upper 2 elements from a to the
+// upper 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
+FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
+{
+#if defined(aarch64__)
+    return vreinterpretq_m128_u64(
+        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
+#else
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
+#endif
+}
+
+// Move the lower 2 single-precision (32-bit) floating-point elements from b to
+// the upper 2 elements of dst, and copy the lower 2 elements from a to the
+// lower 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
+FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
+FORCE_INLINE int _mm_movemask_pi8(__m64 a)
+{
+    uint8x8_t input = vreinterpret_u8_m64(a);
+#if defined(__aarch64__)
+    static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint8x8_t tmp = vshr_n_u8(input, 7);
+    return vaddv_u8(vshl_u8(tmp, shift));
+#else
+    // Refer the implementation of `_mm_movemask_epi8`
+    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
+    uint32x2_t paired16 =
+        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
+    uint8x8_t paired32 =
+        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
+    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
+#endif
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed single-precision (32-bit) floating-point element in a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__)
+    static const int32x4_t shift = {0, 1, 2, 3};
+    uint32x4_t tmp = vshrq_n_u32(input, 31);
+    return vaddvq_u32(vshlq_u32(tmp, shift));
+#else
+    // Uses the exact same method as _mm_movemask_epi8, see that for details.
+    // Shift out everything but the sign bits with a 32-bit unsigned shift
+    // right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired =
+        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
+}
+
+// Multiply packed single-precision (32-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
+FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Multiply the lower single-precision (32-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_mul_ps(a, b));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(vshrn_n_u32(
+        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
+}
+
+// Compute the bitwise OR of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
+FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
+#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
+#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
+#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
+#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
+#define _m_pminsw(a, b) _mm_min_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
+#define _m_pminub(a, b) _mm_min_pu8(a, b)
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
+#define _m_pmovmskb(a) _mm_movemask_pi8(a)
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
+#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
+
+// Fetch the line of data from memory that contains address p to a location in
+// the cache heirarchy specified by the locality hint i.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
+FORCE_INLINE void _mm_prefetch(char const *p, int i)
+{
+    switch (i) {
+    case _MM_HINT_NTA:
+        __builtin_prefetch(p, 0, 0);
+        break;
+    case _MM_HINT_T0:
+        __builtin_prefetch(p, 0, 3);
+        break;
+    case _MM_HINT_T1:
+        __builtin_prefetch(p, 0, 2);
+        break;
+    case _MM_HINT_T2:
+        __builtin_prefetch(p, 0, 1);
+        break;
+    }
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
+#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
+#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
+
+// Compute the approximate reciprocal of packed single-precision (32-bit)
+// floating-point elements in a, and store the results in dst. The maximum
+// relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#endif
+    return vreinterpretq_m128_f32(recip);
+}
+
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
+{
+    return _mm_move_ss(a, _mm_rcp_ps(a));
+}
+
+// Compute the approximate reciprocal square root of packed single-precision
+// (32-bit) floating-point elements in a, and store the results in dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+#if SSE2NEON_PRECISE_SQRT
+    // Additional Netwon-Raphson iteration for accuracy
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#endif
+    return vreinterpretq_m128_f32(out);
+}
+
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
+FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+{
+    uint64x1_t t = vpaddl_u32(vpaddl_u16(
+        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
+    return vreinterpret_m64_u16(
+        vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
+}
+
+// Macro: Set the flush zero bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The flush zero may contain any of the
+// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
+FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
+{
+    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+    // regardless of the value of the FZ bit.
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+// Set packed single-precision (32-bit) floating-point elements in dst with the
+// supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
+FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Broadcast single-precision (32-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
+FORCE_INLINE __m128 _mm_set_ps1(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Macro: Set the rounding mode bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The rounding mode may contain any of
+// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
+// _MM_ROUND_TOWARD_ZERO
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
+FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    switch (rounding) {
+    case _MM_ROUND_TOWARD_ZERO:
+        r.field.bit22 = 1;
+        r.field.bit23 = 1;
+        break;
+    case _MM_ROUND_DOWN:
+        r.field.bit22 = 0;
+        r.field.bit23 = 1;
+        break;
+    case _MM_ROUND_UP:
+        r.field.bit22 = 1;
+        r.field.bit23 = 0;
+        break;
+    default:  //_MM_ROUND_NEAREST
+        r.field.bit22 = 0;
+        r.field.bit23 = 0;
+    }
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+// Copy single-precision (32-bit) floating-point element a to the lower element
+// of dst, and zero the upper 3 elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
+FORCE_INLINE __m128 _mm_set_ss(float a)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
+}
+
+// Broadcast single-precision (32-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
+FORCE_INLINE __m128 _mm_set1_ps(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Set the MXCSR control and status register with the value in unsigned 32-bit
+// integer a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
+// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
+FORCE_INLINE void _mm_setcsr(unsigned int a)
+{
+    _MM_SET_ROUNDING_MODE(a);
+}
+
+// Get the unsigned 32-bit value of the MXCSR control and status register.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
+// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
+FORCE_INLINE unsigned int _mm_getcsr()
+{
+    return _MM_GET_ROUNDING_MODE();
+}
+
+// Set packed single-precision (32-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Return vector of type __m128 with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
+FORCE_INLINE __m128 _mm_setzero_ps(void)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(0));
+}
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_pi16(a, imm)                                           \
+    __extension__({                                                        \
+        vreinterpret_m64_s16(vshuffle_s16(                                 \
+            vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
+            ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)));  \
+    })
+#else
+#define _mm_shuffle_pi16(a, imm)                                               \
+    __extension__({                                                            \
+        int16x4_t ret;                                                         \
+        ret =                                                                  \
+            vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret,   \
+            1);                                                                \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret,   \
+            2);                                                                \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret,   \
+            3);                                                                \
+        vreinterpret_m64_s16(ret);                                             \
+    })
+#endif
+
+// Perform a serializing operation on all store-to-memory instructions that were
+// issued prior to this instruction. Guarantees that every store instruction
+// that precedes, in program order, is globally visible before any store
+// instruction which follows the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
+FORCE_INLINE void _mm_sfence(void)
+{
+    _sse2neon_smp_mb();
+}
+
+// Perform a serializing operation on all load-from-memory and store-to-memory
+// instructions that were issued prior to this instruction. Guarantees that
+// every memory access that precedes, in program order, the memory fence
+// instruction is globally visible before any memory instruction which follows
+// the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
+FORCE_INLINE void _mm_mfence(void)
+{
+    _sse2neon_smp_mb();
+}
+
+// Perform a serializing operation on all load-from-memory instructions that
+// were issued prior to this instruction. Guarantees that every load instruction
+// that precedes, in program order, is globally visible before any load
+// instruction which follows the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
+FORCE_INLINE void _mm_lfence(void)
+{
+    _sse2neon_smp_mb();
+}
+
+// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
+// int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_ps(a, b, imm)                                              \
+    __extension__({                                                            \
+        float32x4_t _input1 = vreinterpretq_f32_m128(a);                       \
+        float32x4_t _input2 = vreinterpretq_f32_m128(b);                       \
+        float32x4_t _shuf =                                                    \
+            vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+                          (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
+        vreinterpretq_m128_f32(_shuf);                                         \
+    })
+#else  // generic
+#define _mm_shuffle_ps(a, b, imm)                          \
+    __extension__({                                        \
+        __m128 ret;                                        \
+        switch (imm) {                                     \
+        case _MM_SHUFFLE(1, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_1032((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 3, 0, 1):                      \
+            ret = _mm_shuffle_ps_2301((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 3, 2, 1):                      \
+            ret = _mm_shuffle_ps_0321((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 1, 0, 3):                      \
+            ret = _mm_shuffle_ps_2103((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 1, 0):                      \
+            ret = _mm_movelh_ps((a), (b));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_1001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 1, 0, 1):                      \
+            ret = _mm_shuffle_ps_0101((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 1, 0):                      \
+            ret = _mm_shuffle_ps_3210((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 1, 1):                      \
+            ret = _mm_shuffle_ps_0011((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 2, 2):                      \
+            ret = _mm_shuffle_ps_0022((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 2, 0, 0):                      \
+            ret = _mm_shuffle_ps_2200((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 0, 2):                      \
+            ret = _mm_shuffle_ps_3202((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 3, 2):                      \
+            ret = _mm_movehl_ps((b), (a));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 1, 3, 3):                      \
+            ret = _mm_shuffle_ps_1133((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 1, 0):                      \
+            ret = _mm_shuffle_ps_2010((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_2001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_2032((a), (b));           \
+            break;                                         \
+        default:                                           \
+            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
+            break;                                         \
+        }                                                  \
+        ret;                                               \
+    })
+#endif
+
+// Compute the square root of packed single-precision (32-bit) floating-point
+// elements in a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+{
+#if SSE2NEON_PRECISE_SQRT
+    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+    // Test for vrsqrteq_f32(0) -> positive infinity case.
+    // Change to zero, so that s * 1/sqrt(s) result is zero too.
+    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+    const uint32x4_t div_by_zero =
+        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
+    recip = vreinterpretq_f32_u32(
+        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
+
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+
+    // sqrt(s) = s * 1/sqrt(s)
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
+#elif defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#else
+    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+    float32x4_t sq = vrecpeq_f32(recipsq);
+    return vreinterpretq_m128_f32(sq);
+#endif
+}
+
+// Compute the square root of the lower single-precision (32-bit) floating-point
+// element in a, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
+FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    vst1q_f32(p, vdupq_n_f32(a0));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
+#define _mm_store1_ps _mm_store_ps1
+
+// Store the upper 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
+FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_high_f32(a));
+}
+
+// Store the lower 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
+FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_low_f32(a));
+}
+
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
+FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
+{
+    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
+    float32x4_t rev = vextq_f32(tmp, tmp, 2);
+    vst1q_f32(p, rev);
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Stores 16-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
+FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
+{
+    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
+}
+
+// Stores 64-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
+FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
+{
+    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
+}
+
+// Store 64-bits of integer data from a into memory using a non-temporal memory
+// hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
+FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
+{
+    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
+// point elements) from a into memory using a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
+FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
+#else
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+#endif
+}
+
+// Subtract packed single-precision (32-bit) floating-point elements in b from
+// packed single-precision (32-bit) floating-point elements in a, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_sub_ps(a, b));
+}
+
+// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
+// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
+// transposed matrix in these vectors (row0 now contains column 0, etc.).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
+    do {                                                  \
+        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
+        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
+        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
+                            vget_low_f32(ROW23.val[0]));  \
+        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
+                            vget_low_f32(ROW23.val[1]));  \
+        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
+                            vget_high_f32(ROW23.val[0])); \
+        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
+                            vget_high_f32(ROW23.val[1])); \
+    } while (0)
+
+// according to the documentation, these intrinsics behave the same as the
+// non-'u' versions.  We'll just alias them here.
+#define _mm_ucomieq_ss _mm_comieq_ss
+#define _mm_ucomige_ss _mm_comige_ss
+#define _mm_ucomigt_ss _mm_comigt_ss
+#define _mm_ucomile_ss _mm_comile_ss
+#define _mm_ucomilt_ss _mm_comilt_ss
+#define _mm_ucomineq_ss _mm_comineq_ss
+
+// Return vector of type __m128i with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
+FORCE_INLINE __m128i _mm_undefined_si128(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128i a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Return vector of type __m128 with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
+FORCE_INLINE __m128 _mm_undefined_ps(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128 a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Unpack and interleave single-precision (32-bit) floating-point elements from
+// the high half a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
+FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave single-precision (32-bit) floating-point elements from
+// the low half of a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
+FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
+FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+/* SSE2 */
+
+// Add packed 16-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Add packed 32-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Add packed 64-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Add packed 8-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
+FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Add packed double-precision (64-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
+FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1] + db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add the lower double-precision (64-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper element from
+// a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
+FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_add_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add 64-bit integers a and b, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
+FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Add packed signed 16-bit integers in a and b using saturation, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
+FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Add packed signed 8-bit integers in a and b using saturation, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
+FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Add packed unsigned 16-bit integers in a and b using saturation, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
+FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Add packed unsigned 8-bit integers in a and b using saturation, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
+FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compute the bitwise AND of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
+FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
+FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
+FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
+{
+    // *NOTE* argument swap
+    return vreinterpretq_m128d_s64(
+        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
+}
+
+// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
+// AND with b, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
+FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vbicq_s32(vreinterpretq_s32_m128i(b),
+                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
+FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
+{
+    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
+                                 vreinterpretq_u16_m128i(b));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
+FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
+#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
+
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
+#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
+
+// Cast vector of type __m128d to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
+FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+{
+    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
+FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
+{
+    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
+FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
+{
+    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
+#else
+    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
+#endif
+}
+
+// Cast vector of type __m128i to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Invalidate and flush the cache line that contains p from all levels of the
+// cache hierarchy.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
+#if defined(__APPLE__)
+#include <libkern/OSCacheControl.h>
+#endif
+FORCE_INLINE void _mm_clflush(void const *p)
+{
+    (void) p;
+
+    /* sys_icache_invalidate is supported since macOS 10.5.
+     * However, it does not work on non-jailbroken iOS devices, although the
+     * compilation is successful.
+     */
+#if defined(__APPLE__)
+    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
+#elif defined(__GNUC__) || defined(__clang__)
+    uintptr_t ptr = (uintptr_t) p;
+    __builtin___clear_cache((char *) ptr,
+                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
+#else
+    /* FIXME: MSVC support */
+#endif
+}
+
+// Compare packed 16-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed 8-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
+FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
+FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
+FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than-or-equal, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
+FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed signed 16-bit integers in a and b for greater-than, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
+FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b for greater-than, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
+FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b for greater-than, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
+FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
+FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
+FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
+FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
+FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmple_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed signed 16-bit integers in a and b for less-than, and store the
+// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
+// order of the operands switched.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
+FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b for less-than, and store the
+// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
+// order of the operands switched.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
+FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b for less-than, and store the
+// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
+// order of the operands switched.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
+FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
+FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
+FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
+FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
+FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
+FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
+FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
+FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
+FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
+FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-less-than-or-equal, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
+FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
+FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-less-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
+FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// to see if neither is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
+FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    // Excluding NaNs, any two floating point numbers can be compared.
+    uint64x2_t not_nan_a =
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
+    uint64x2_t not_nan_b =
+        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
+            (*(double *) &b1) == (*(double *) &b1))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b to see if neither is NaN, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
+FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// to see if either is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
+FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    // Two NaNs are not equal in comparison operation.
+    uint64x2_t not_nan_a =
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
+    uint64x2_t not_nan_b =
+        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_s32(
+        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
+            (*(double *) &b1) == (*(double *) &b1))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b to see if either is NaN, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
+FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for greater-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
+FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 >= *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for greater-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
+FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 > *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for less-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
+FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 <= *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for less-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
+FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 < *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for equality, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
+FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
+#else
+    uint32x4_t a_not_nan =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
+    uint32x4_t b_not_nan =
+        vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_eq_b =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
+                                       vreinterpretq_u64_u32(a_eq_b));
+    return vgetq_lane_u64(and_results, 0) & 0x1;
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for not-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
+FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
+{
+    return !_mm_comieq_sd(a, b);
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
+FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
+#else
+    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
+FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
+{
+// vrnd32xq_f64 not supported on clang
+#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
+    float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
+    int64x2_t integers = vcvtq_s64_f64(rounded);
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double d0 = ((double *) &rnd)[0];
+    double d1 = ((double *) &rnd)[1];
+    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
+#endif
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
+FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
+{
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double d0 = ((double *) &rnd)[0];
+    double d1 = ((double *) &rnd)[1];
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
+    return vreinterpret_m64_s32(vld1_s32(data));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed single-precision (32-bit) floating-point elements, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
+FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+{
+#if defined(__aarch64__)
+    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
+    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+#else
+    float a0 = (float) ((double *) &a)[0];
+    float a1 = (float) ((double *) &a)[1];
+    return _mm_set_ps(0, 0, a1, a0);
+#endif
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
+FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
+#else
+    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
+    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
+// does not support! It is supported on ARMv8-A however.
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
+{
+#if defined(__ARM_FEATURE_FRINT)
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
+#elif defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    switch (_MM_GET_ROUNDING_MODE()) {
+    case _MM_ROUND_NEAREST:
+        return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
+    case _MM_ROUND_DOWN:
+        return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
+    case _MM_ROUND_UP:
+        return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
+    default:  // _MM_ROUND_TOWARD_ZERO
+        return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
+    }
+#else
+    float *f = (float *) &a;
+    switch (_MM_GET_ROUNDING_MODE()) {
+    case _MM_ROUND_NEAREST: {
+        uint32x4_t signmask = vdupq_n_u32(0x80000000);
+        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
+        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+        int32x4_t r_trunc = vcvtq_s32_f32(
+            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+        float32x4_t delta = vsubq_f32(
+            vreinterpretq_f32_m128(a),
+            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+        uint32x4_t is_delta_half =
+            vceqq_f32(delta, half); /* delta == +/- 0.5 */
+        return vreinterpretq_m128i_s32(
+            vbslq_s32(is_delta_half, r_even, r_normal));
+    }
+    case _MM_ROUND_DOWN:
+        return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
+                             floorf(f[0]));
+    case _MM_ROUND_UP:
+        return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
+                             ceilf(f[0]));
+    default:  // _MM_ROUND_TOWARD_ZERO
+        return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
+                             (int32_t) f[0]);
+    }
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed double-precision (64-bit) floating-point elements, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
+FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
+#else
+    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Copy the lower double-precision (64-bit) floating-point element of a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
+FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
+{
+#if defined(__aarch64__)
+    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
+#else
+    return ((double *) &a)[0];
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
+FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
+{
+#if defined(__aarch64__)
+    return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double ret = ((double *) &rnd)[0];
+    return (int32_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
+FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
+{
+#if defined(__aarch64__)
+    return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double ret = ((double *) &rnd)[0];
+    return (int64_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
+#define _mm_cvtsd_si64x _mm_cvtsd_si64
+
+// Convert the lower double-precision (64-bit) floating-point element in b to a
+// single-precision (32-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper 3 packed elements from a to the
+// upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
+FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsetq_lane_f32(
+        vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
+        vreinterpretq_f32_m128(a), 0));
+#else
+    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
+                                                 vreinterpretq_f32_m128(a), 0));
+#endif
+}
+
+// Copy the lower 32-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
+FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
+{
+    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
+FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
+{
+    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Convert the signed 32-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
+FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+    double bf = (double) b;
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Copy 32-bit integer a to the lower elements of dst, and zero the upper
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
+FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+{
+    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
+}
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
+FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+    double bf = (double) b;
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
+FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
+{
+    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
+}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
+#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
+#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
+
+// Convert the lower single-precision (32-bit) floating-point element in b to a
+// double-precision (64-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
+FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
+{
+    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
+FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
+{
+    double a0 = ((double *) &a)[0];
+    double a1 = ((double *) &a)[1];
+    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
+FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
+{
+    double a0 = ((double *) &a)[0];
+    double a1 = ((double *) &a)[1];
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
+    return vreinterpret_m64_s32(vld1_s32(data));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
+FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
+FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
+{
+    double ret = *((double *) &a);
+    return (int32_t) ret;
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
+FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    double ret = *((double *) &a);
+    return (int64_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
+#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
+
+// Divide packed double-precision (64-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
+FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] / db[0];
+    c[1] = da[1] / db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Divide the lower double-precision (64-bit) floating-point element in a by the
+// lower double-precision (64-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper element from a to the upper
+// element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
+FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    float64x2_t tmp =
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
+#else
+    return _mm_move_sd(a, _mm_div_pd(a, b));
+#endif
+}
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
+// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
+#define _mm_extract_epi16(a, imm) \
+    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
+// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
+//                                       __constrange(0,8) int imm)
+#define _mm_insert_epi16(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s16(                                     \
+            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
+    })
+
+// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
+FORCE_INLINE __m128d _mm_load_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64(p));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
+#define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower of dst, and zero the upper element. mem_addr does not need to be
+// aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
+FORCE_INLINE __m128d _mm_load_sd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
+// on a 16-byte boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
+FORCE_INLINE __m128d _mm_load1_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
+FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
+#else
+    return vreinterpretq_m128d_f32(vcombine_f32(
+        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
+#endif
+}
+
+// Load 64-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
+{
+    /* Load the lower 64 bits of the value pointed to by p into the
+     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
+     */
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
+FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
+#else
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vld1_f32((const float *) p),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+#endif
+}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
+{
+#if defined(__aarch64__)
+    float64x2_t v = vld1q_f64(p);
+    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+#else
+    int64x2_t v = vld1q_s64((const int64_t *) p);
+    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+#endif
+}
+
+// Loads two double-precision from unaligned memory, floating-point values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+    return _mm_load_pd(p);
+}
+
+// Load 128-bits of integer data from memory into dst. mem_addr does not need to
+// be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+    return vreinterpretq_m128i_s32(
+        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
+// 32-bit integers, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+#if defined(__aarch64__)
+    int32x4_t high =
+        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
+
+    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
+#else
+    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                               vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+#endif
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint. mem_addr does not need to be aligned
+// on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
+FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
+{
+    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
+    __m128 b = _mm_load_ps((const float *) mem_addr);
+    int8x16_t masked =
+        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
+                 vreinterpretq_s8_m128(b));
+    vst1q_s8((int8_t *) mem_addr, masked);
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
+FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
+FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed maximum values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
+FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+#if SSE2NEON_PRECISE_MINMAX
+    float64x2_t _a = vreinterpretq_f64_m128d(a);
+    float64x2_t _b = vreinterpretq_f64_m128d(b);
+    return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128d_f64(
+        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#endif
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the maximum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
+FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_max_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
+#endif
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
+FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
+FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed minimum values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
+FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+#if SSE2NEON_PRECISE_MINMAX
+    float64x2_t _a = vreinterpretq_f64_m128d(a);
+    float64x2_t _b = vreinterpretq_f64_m128d(b);
+    return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128d_f64(
+        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#endif
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the minimum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
+FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_min_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
+#endif
+}
+
+// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
+// upper element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
+FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
+}
+
+// Move the lower double-precision (64-bit) floating-point element from b to the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
+FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+}
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
+FORCE_INLINE int _mm_movemask_epi8(__m128i a)
+{
+    // Use increasingly wide shifts+adds to collect the sign bits
+    // together.
+    // Since the widening shifts would be rather confusing to follow in little
+    // endian, everything will be illustrated in big endian order instead. This
+    // has a different result - the bits would actually be reversed on a big
+    // endian machine.
+
+    // Starting input (only half the elements are shown):
+    // 89 ff 1d c0 00 10 99 33
+    uint8x16_t input = vreinterpretq_u8_m128i(a);
+
+    // Shift out everything but the sign bits with an unsigned shift right.
+    //
+    // Bytes of the vector::
+    // 89 ff 1d c0 00 10 99 33
+    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
+    //  |  |  |  |  |  |  |  |
+    // 01 01 00 01 00 00 01 00
+    //
+    // Bits of first important lane(s):
+    // 10001001 (89)
+    // \______
+    //        |
+    // 00000001 (01)
+    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+    // Merge the even lanes together with a 16-bit unsigned shift right + add.
+    // 'xx' represents garbage data which will be ignored in the final result.
+    // In the important bytes, the add functions like a binary OR.
+    //
+    // 01 01 00 01 00 00 01 00
+    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
+    //    \|    \|    \|    \|
+    // xx 03 xx 01 xx 00 xx 02
+    //
+    // 00000001 00000001 (01 01)
+    //        \_______ |
+    //                \|
+    // xxxxxxxx xxxxxx11 (xx 03)
+    uint32x4_t paired16 =
+        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+    // Repeat with a wider 32-bit shift + add.
+    // xx 03 xx 01 xx 00 xx 02
+    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
+    //     14))
+    //          \|          \|
+    // xx xx xx 0d xx xx xx 02
+    //
+    // 00000011 00000001 (03 01)
+    //        \\_____ ||
+    //         '----.\||
+    // xxxxxxxx xxxx1101 (xx 0d)
+    uint64x2_t paired32 =
+        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
+    // lanes. xx xx xx 0d xx xx xx 02
+    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
+    //            28))
+    //                      \|
+    // xx xx xx xx xx xx xx d2
+    //
+    // 00001101 00000010 (0d 02)
+    //     \   \___ |  |
+    //      '---.  \|  |
+    // xxxxxxxx 11010010 (xx d2)
+    uint8x16_t paired64 =
+        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+    // xx xx xx xx xx xx xx d2
+    //                      ||  return paired64[0]
+    //                      d2
+    // Note: Little endian would return the correct value 4b (01001011) instead.
+    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed double-precision (64-bit) floating-point element in a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
+FORCE_INLINE int _mm_movemask_pd(__m128d a)
+{
+    uint64x2_t input = vreinterpretq_u64_m128d(a);
+    uint64x2_t high_bits = vshrq_n_u64(input, 63);
+    return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+{
+    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+}
+
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+}
+
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
+// a and b, and store the unsigned 64-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+{
+    // vmull_u32 upcasts instead of masking, so we downcast.
+    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
+    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
+    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
+}
+
+// Multiply packed double-precision (64-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
+FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] * db[0];
+    c[1] = da[1] * db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Multiply the lower double-precision (64-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper element
+// from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
+FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_mul_pd(a, b));
+}
+
+// Multiply the low unsigned 32-bit integers from a and b, and store the
+// unsigned 64-bit result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
+FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u64(vget_low_u64(
+        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
+}
+
+// Multiply the packed signed 16-bit integers in a and b, producing intermediate
+// 32-bit integers, and store the high 16 bits of the intermediate integers in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+    /* FIXME: issue with large values because of result saturation */
+    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
+    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
+    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
+    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
+FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
+{
+    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
+#if defined(__aarch64__)
+    uint32x4_t ab7654 =
+        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
+                              vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r);
+#else
+    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+#endif
+}
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
+// integers, and store the low 16 bits of the intermediate integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
+FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compute the bitwise OR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
+FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
+FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using signed saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
+// using signed saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using unsigned saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Pause the processor. This is typically used in spin-wait loops and depending
+// on the x86 processor typical values are in the 40-100 cycle range. The
+// 'yield' instruction isn't a good fit because it's effectively a nop on most
+// Arm cores. Experience with several databases has shown has shown an 'isb' is
+// a reasonable approximation.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
+FORCE_INLINE void _mm_pause()
+{
+    __asm__ __volatile__("isb\n");
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+{
+    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
+}
+
+// Set packed 16-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+                                   short i6,
+                                   short i5,
+                                   short i4,
+                                   short i3,
+                                   short i2,
+                                   short i1,
+                                   short i0)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    return vreinterpretq_m128i_s16(vld1q_s16(data));
+}
+
+// Set packed 32-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+}
+
+// Set packed 8-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
+FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
+                                  signed char b14,
+                                  signed char b13,
+                                  signed char b12,
+                                  signed char b11,
+                                  signed char b10,
+                                  signed char b9,
+                                  signed char b8,
+                                  signed char b7,
+                                  signed char b6,
+                                  signed char b5,
+                                  signed char b4,
+                                  signed char b3,
+                                  signed char b2,
+                                  signed char b1,
+                                  signed char b0)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
+FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
+{
+    double ALIGN_STRUCT(16) data[2] = {e0, e1};
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
+#else
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
+#endif
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
+#define _mm_set_pd1 _mm_set1_pd
+
+// Copy double-precision (64-bit) floating-point element a to the lower element
+// of dst, and zero the upper element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
+FORCE_INLINE __m128d _mm_set_sd(double a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
+#else
+    return _mm_set_pd(0, a);
+#endif
+}
+
+// Broadcast 16-bit integer a to all all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
+FORCE_INLINE __m128i _mm_set1_epi16(short w)
+{
+    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
+}
+
+// Broadcast 32-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
+}
+
+// Broadcast 64-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
+FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
+}
+
+// Broadcast 64-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
+}
+
+// Broadcast 8-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
+FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
+{
+    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
+FORCE_INLINE __m128d _mm_set1_pd(double d)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vdupq_n_f64(d));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
+#endif
+}
+
+// Set packed 16-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
+FORCE_INLINE __m128i _mm_setr_epi16(short w0,
+                                    short w1,
+                                    short w2,
+                                    short w3,
+                                    short w4,
+                                    short w5,
+                                    short w6,
+                                    short w7)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
+    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
+}
+
+// Set packed 32-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
+FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
+FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
+{
+    return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
+}
+
+// Set packed 8-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
+FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
+                                   signed char b1,
+                                   signed char b2,
+                                   signed char b3,
+                                   signed char b4,
+                                   signed char b5,
+                                   signed char b6,
+                                   signed char b7,
+                                   signed char b8,
+                                   signed char b9,
+                                   signed char b10,
+                                   signed char b11,
+                                   signed char b12,
+                                   signed char b13,
+                                   signed char b14,
+                                   signed char b15)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
+FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
+{
+    return _mm_set_pd(e0, e1);
+}
+
+// Return vector of type __m128d with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
+FORCE_INLINE __m128d _mm_setzero_pd(void)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vdupq_n_f64(0));
+#else
+    return vreinterpretq_m128d_f32(vdupq_n_f32(0));
+#endif
+}
+
+// Return vector of type __m128i with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
+FORCE_INLINE __m128i _mm_setzero_si128(void)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
+}
+
+// Shuffle 32-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
+// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
+//                                        __constrange(0,255) int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_epi32(a, imm)                                            \
+    __extension__({                                                          \
+        int32x4_t _input = vreinterpretq_s32_m128i(a);                       \
+        int32x4_t _shuf =                                                    \
+            vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+                          ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
+        vreinterpretq_m128i_s32(_shuf);                                      \
+    })
+#else  // generic
+#define _mm_shuffle_epi32(a, imm)                        \
+    __extension__({                                      \
+        __m128i ret;                                     \
+        switch (imm) {                                   \
+        case _MM_SHUFFLE(1, 0, 3, 2):                    \
+            ret = _mm_shuffle_epi_1032((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 3, 0, 1):                    \
+            ret = _mm_shuffle_epi_2301((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 3, 2, 1):                    \
+            ret = _mm_shuffle_epi_0321((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 1, 0, 3):                    \
+            ret = _mm_shuffle_epi_2103((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 1, 0):                    \
+            ret = _mm_shuffle_epi_1010((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 0, 1):                    \
+            ret = _mm_shuffle_epi_1001((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 0, 1):                    \
+            ret = _mm_shuffle_epi_0101((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 1, 1):                    \
+            ret = _mm_shuffle_epi_2211((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 2, 2):                    \
+            ret = _mm_shuffle_epi_0122((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 2):                    \
+            ret = _mm_shuffle_epi_3332((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 0, 0, 0):                    \
+            ret = _mm_shuffle_epi32_splat((a), 0);       \
+            break;                                       \
+        case _MM_SHUFFLE(1, 1, 1, 1):                    \
+            ret = _mm_shuffle_epi32_splat((a), 1);       \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 2, 2):                    \
+            ret = _mm_shuffle_epi32_splat((a), 2);       \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 3):                    \
+            ret = _mm_shuffle_epi32_splat((a), 3);       \
+            break;                                       \
+        default:                                         \
+            ret = _mm_shuffle_epi32_default((a), (imm)); \
+            break;                                       \
+        }                                                \
+        ret;                                             \
+    })
+#endif
+
+// Shuffle double-precision (64-bit) floating-point elements using the control
+// in imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_pd(a, b, imm8)                                            \
+    vreinterpretq_m128d_s64(                                                  \
+        vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
+                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
+#else
+#define _mm_shuffle_pd(a, b, imm8)                                     \
+    _mm_castsi128_pd(_mm_set_epi64x(                                   \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
+#endif
+
+// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shufflehi_epi16(a, imm)                                           \
+    __extension__({                                                           \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);                        \
+        int16x8_t _shuf =                                                     \
+            vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
+                          (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
+                          (((imm) >> 6) & 0x3) + 4);                          \
+        vreinterpretq_m128i_s16(_shuf);                                       \
+    })
+#else  // generic
+#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
+#endif
+
+// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shufflelo_epi16(a, imm)                                  \
+    __extension__({                                                  \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
+        int16x8_t _shuf = vshuffleq_s16(                             \
+            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
+            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
+        vreinterpretq_m128i_s16(_shuf);                              \
+    })
+#else  // generic
+#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
+#endif
+
+// Shift packed 16-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
+FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16((int16_t) c);
+    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
+}
+
+// Shift packed 32-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
+FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32((int32_t) c);
+    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
+}
+
+// Shift packed 64-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
+FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~63))
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64((int64_t) c);
+    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
+}
+
+// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
+FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
+{
+    if (_sse2neon_unlikely(imm & ~15))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s16(
+        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
+}
+
+// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
+FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
+{
+    if (_sse2neon_unlikely(imm & ~31))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s32(
+        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
+}
+
+// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
+FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
+{
+    if (_sse2neon_unlikely(imm & ~63))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s64(
+        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
+}
+
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
+#define _mm_slli_si128(a, imm)                                         \
+    __extension__({                                                    \
+        int8x16_t ret;                                                 \
+        if (_sse2neon_unlikely(imm == 0))                              \
+            ret = vreinterpretq_s8_m128i(a);                           \
+        else if (_sse2neon_unlikely((imm) & ~15))                      \
+            ret = vdupq_n_s8(0);                                       \
+        else                                                           \
+            ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a),   \
+                           ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
+        vreinterpretq_m128i_s8(ret);                                   \
+    })
+
+// Compute the square root of packed double-precision (64-bit) floating-point
+// elements in a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
+FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double a0 = sqrt(((double *) &a)[0]);
+    double a1 = sqrt(((double *) &a)[1]);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Compute the square root of the lower double-precision (64-bit) floating-point
+// element in b, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
+FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_sqrt_pd(b));
+#else
+    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
+#endif
+}
+
+// Shift packed 16-bit integers in a right by count while shifting in sign bits,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
+FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_cmplt_epi16(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
+}
+
+// Shift packed 32-bit integers in a right by count while shifting in sign bits,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
+FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_cmplt_epi32(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
+}
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in sign
+// bits, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
+FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
+{
+    const int count = (imm & ~15) ? 15 : imm;
+    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
+}
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
+// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srai_epi32(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) == 0)) {                                \
+            ret = a;                                                         \
+        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) {              \
+            ret = vreinterpretq_m128i_s32(                                   \
+                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_s32(                                   \
+                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));                \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift packed 16-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
+FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
+    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
+}
+
+// Shift packed 32-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
+FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
+    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
+}
+
+// Shift packed 64-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
+FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~63))
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
+    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
+}
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
+#define _mm_srli_epi16(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~15)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u16(                                   \
+                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
+// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_epi32(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~31)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u32(                                   \
+                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
+#define _mm_srli_epi64(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~63)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u64(                                   \
+                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
+#define _mm_srli_si128(a, imm)                                       \
+    __extension__({                                                  \
+        int8x16_t ret;                                               \
+        if (_sse2neon_unlikely((imm) & ~15))                         \
+            ret = vdupq_n_s8(0);                                     \
+        else                                                         \
+            ret = vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), \
+                           (imm > 15 ? 0 : imm));                    \
+        vreinterpretq_m128i_s8(ret);                                 \
+    })
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
+FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
+FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
+    vst1q_f64((float64_t *) mem_addr,
+              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
+#else
+    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
+    vst1q_f32((float32_t *) mem_addr,
+              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
+FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
+#endif
+}
+
+// Store 128-bits of integer data from a into memory. mem_addr must be aligned
+// on a 16-byte boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
+FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
+#define _mm_store1_pd _mm_store_pd1
+
+// Store the upper double-precision (64-bit) floating-point element from a into
+// memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
+FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store 64-bit integer from the first element of a into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
+FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
+{
+    vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
+FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store 2 double-precision (64-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
+FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
+{
+    float32x4_t f = vreinterpretq_f32_m128d(a);
+    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
+FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
+{
+    _mm_store_pd(mem_addr, a);
+}
+
+// Store 128-bits of integer data from a into memory. mem_addr does not need to
+// be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
+FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Store 32-bit integer from the first element of a into memory. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
+FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
+{
+    vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory using a non-temporal memory hint. mem_addr must
+// be aligned on a 16-byte boundary or a general-protection exception may be
+// generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
+FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
+#elif defined(__aarch64__)
+    vst1q_f64(p, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
+#endif
+}
+
+// Store 128-bits of integer data from a into memory using a non-temporal memory
+// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
+// exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
+FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, p);
+#else
+    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
+#endif
+}
+
+// Store 32-bit integer a into memory using a non-temporal hint to minimize
+// cache pollution. If the cache line containing address mem_addr is already in
+// the cache, the cache will be updated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
+FORCE_INLINE void _mm_stream_si32(int *p, int a)
+{
+    vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
+}
+
+// Store 64-bit integer a into memory using a non-temporal hint to minimize
+// cache pollution. If the cache line containing address mem_addr is already in
+// the cache, the cache will be updated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
+FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
+{
+    vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
+}
+
+// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
+FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
+FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
+FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
+FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract packed double-precision (64-bit) floating-point elements in b from
+// packed double-precision (64-bit) floating-point elements in a, and store the
+// results in dst.
+//  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
+FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] - db[0];
+    c[1] = da[1] - db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Subtract the lower double-precision (64-bit) floating-point element in b from
+// the lower double-precision (64-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
+FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_sub_pd(a, b));
+}
+
+// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
+FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
+// using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
+FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
+// using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
+FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
+// integers in a using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
+FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
+// integers in a using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
+FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+#define _mm_ucomieq_sd _mm_comieq_sd
+#define _mm_ucomige_sd _mm_comige_sd
+#define _mm_ucomigt_sd _mm_comigt_sd
+#define _mm_ucomile_sd _mm_comile_sd
+#define _mm_ucomilt_sd _mm_comilt_sd
+#define _mm_ucomineq_sd _mm_comineq_sd
+
+// Return vector of type __m128d with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
+FORCE_INLINE __m128d _mm_undefined_pd(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128d a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Unpack and interleave 16-bit integers from the high half of a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
+FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 32-bit integers from the high half of a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
+FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 64-bit integers from the high half of a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
+FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s64(
+        vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
+#endif
+}
+
+// Unpack and interleave 8-bit integers from the high half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
+FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the high half of a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
+FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    return vreinterpretq_m128d_s64(
+        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
+                     vget_high_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Unpack and interleave 16-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
+FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 32-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
+FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 64-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
+FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s64(
+        vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
+#endif
+}
+
+// Unpack and interleave 8-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
+FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the low half of a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
+FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    return vreinterpretq_m128d_s64(
+        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
+                     vget_low_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
+FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
+FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+/* SSE3 */
+
+// Alternatively add and subtract packed double-precision (64-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
+FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
+{
+    _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
+                                             vreinterpretq_f64_m128d(b),
+                                             vreinterpretq_f64_m128d(mask)));
+#else
+    return _mm_add_pd(_mm_mul_pd(b, mask), a);
+#endif
+}
+
+// Alternatively add and subtract packed single-precision (32-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
+FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
+{
+    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
+#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
+    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
+                                            vreinterpretq_f32_m128(mask),
+                                            vreinterpretq_f32_m128(b)));
+#else
+    return _mm_add_ps(_mm_mul_ps(b, mask), a);
+#endif
+}
+
+// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
+FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[] = {da[0] + da[1], db[0] + db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of double-precision (64-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
+FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
+{
+#if defined(__aarch64__)
+    float64x2_t a = vreinterpretq_f64_m128d(_a);
+    float64x2_t b = vreinterpretq_f64_m128d(_b);
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
+#else
+    double *da = (double *) &_a;
+    double *db = (double *) &_b;
+    double c[] = {da[0] - da[1], db[0] - db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
+FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+{
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
+#else
+    float32x4x2_t c = vuzpq_f32(a, b);
+    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
+#endif
+}
+
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
+#define _mm_loaddup_pd _mm_load1_pd
+
+// Duplicate the low double-precision (64-bit) floating-point element from a,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
+FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_u64(
+        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
+#endif
+}
+
+// Duplicate odd-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
+FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
+#elif defined(_sse2neon_shuffle)
+    return vreinterpretq_m128_f32(vshuffleq_s32(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
+#else
+    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
+    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+// Duplicate even-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
+FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
+#elif defined(_sse2neon_shuffle)
+    return vreinterpretq_m128_f32(vshuffleq_s32(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
+#else
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
+    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+/* SSSE3 */
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
+FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
+{
+    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
+FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
+FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
+{
+    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
+FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
+{
+    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
+FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
+{
+    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
+FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
+{
+    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
+}
+
+// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 16 bytes in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
+#define _mm_alignr_epi8(a, b, imm)                                            \
+    __extension__({                                                           \
+        uint8x16_t _a = vreinterpretq_u8_m128i(a);                            \
+        uint8x16_t _b = vreinterpretq_u8_m128i(b);                            \
+        __m128i ret;                                                          \
+        if (_sse2neon_unlikely((imm) & ~31))                                  \
+            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));                      \
+        else if (imm >= 16)                                                   \
+            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0);                \
+        else                                                                  \
+            ret =                                                             \
+                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
+        ret;                                                                  \
+    })
+
+// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 8 bytes in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
+#define _mm_alignr_pi8(a, b, imm)                                           \
+    __extension__({                                                         \
+        __m64 ret;                                                          \
+        if (_sse2neon_unlikely((imm) >= 16)) {                              \
+            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
+        } else {                                                            \
+            uint8x8_t tmp_low, tmp_high;                                    \
+            if ((imm) >= 8) {                                               \
+                const int idx = (imm) -8;                                   \
+                tmp_low = vreinterpret_u8_m64(a);                           \
+                tmp_high = vdup_n_u8(0);                                    \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            } else {                                                        \
+                const int idx = (imm);                                      \
+                tmp_low = vreinterpret_u8_m64(b);                           \
+                tmp_high = vreinterpret_u8_m64(a);                          \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            }                                                               \
+        }                                                                   \
+        ret;                                                                \
+    })
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
+#else
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
+#endif
+}
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s32(
+        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
+
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_s64_s16(
+        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
+#endif
+}
+
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
+FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t res = vuzp_s16(a, b);
+    return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int16x8x2_t c = vuzpq_s16(a, b);
+    return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
+FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
+#else
+    int32x4x2_t c = vuzpq_s32(a, b);
+    return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
+FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t c = vuzp_s16(a, b);
+    return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
+FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
+#else
+    int32x2x2_t c = vuzp_s32(a, b);
+    return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
+// using saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
+FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int16x8x2_t c = vuzpq_s16(a, b);
+    return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
+// using saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
+FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t c = vuzp_s16(a, b);
+    return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
+// and pack the saturated results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
+FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+                             vmovl_s8(vget_low_s8(b)));
+    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
+                             vmovl_s8(vget_high_s8(b)));
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
+#else
+    // This would be much simpler if x86 would choose to zero extend OR sign
+    // extend, not both. This could probably be optimized better.
+    uint16x8_t a = vreinterpretq_u16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // Zero extend a
+    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
+    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
+    int16x8_t b_odd = vshrq_n_s16(b, 8);
+
+    // multiply
+    int16x8_t prod1 = vmulq_s16(a_even, b_even);
+    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
+#endif
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
+// pack the saturated results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
+FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
+{
+    uint16x4_t a = vreinterpret_u16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // Zero extend a
+    int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
+    int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
+    int16x4_t b_odd = vshr_n_s16(b, 8);
+
+    // multiply
+    int16x4_t prod1 = vmul_s16(a_even, b_even);
+    int16x4_t prod2 = vmul_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
+// the packed 16-bit integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
+FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
+{
+    // Has issues due to saturation
+    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
+
+    // Multiply
+    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    // Rounding narrowing shift right
+    // narrow = (int16_t)((mul + 16384) >> 15);
+    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+    // Join together
+    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Truncate each intermediate integer to the 18 most
+// significant bits, round by adding 1, and store bits [16:1] to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
+FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
+{
+    int32x4_t mul_extend =
+        vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
+
+    // Rounding narrowing shift right
+    return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
+FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
+{
+    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
+    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
+    uint8x16_t idx_masked =
+        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
+#elif defined(__GNUC__)
+    int8x16_t ret;
+    // %e and %f represent the even and odd D registers
+    // respectively.
+    __asm__ __volatile__(
+        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+        : [ret] "=&w"(ret)
+        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+    return vreinterpretq_m128i_s8(ret);
+#else
+    // use this line if testing on aarch64
+    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
+                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
+FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
+{
+    const int8x8_t controlMask =
+        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
+    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
+    return vreinterpret_m64_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
+#else
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
+    // 'a') based on ltMask
+    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x8_t res = vbicq_s16(masked, zeroMask);
+    return vreinterpretq_m128i_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
+    // 'a') based on ltMask
+    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x4_t res = vbicq_s32(masked, zeroMask);
+    return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+{
+    int8x16_t a = vreinterpretq_s8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
+#else
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
+    // based on ltMask
+    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x16_t res = vbicq_s8(masked, zeroMask);
+
+    return vreinterpretq_m128i_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+#else
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
+    // based on ltMask
+    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x4_t res = vbic_s16(masked, zeroMask);
+
+    return vreinterpret_m64_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
+    // based on ltMask
+    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x2_t res = vbic_s32(masked, zeroMask);
+
+    return vreinterpret_m64_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
+// in b is negative, and store the results in dst. Element in dst are zeroed out
+// when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
+{
+    int8x8_t a = vreinterpret_s8_m64(_a);
+    int8x8_t b = vreinterpret_s8_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
+    // based on ltMask
+    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x8_t res = vbic_s8(masked, zeroMask);
+
+    return vreinterpret_m64_s8(res);
+}
+
+/* SSE4.1 */
+
+// Blend packed 16-bit integers from a and b using control mask imm8, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
+// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
+//                                      __constrange(0,255) int imm)
+#define _mm_blend_epi16(a, b, imm)                                            \
+    __extension__({                                                           \
+        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
+        uint16x8_t _mask_vec = vld1q_u16(_mask);                              \
+        uint16x8_t _a = vreinterpretq_u16_m128i(a);                           \
+        uint16x8_t _b = vreinterpretq_u16_m128i(b);                           \
+        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));                \
+    })
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using control mask imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
+#define _mm_blend_pd(a, b, imm)                                \
+    __extension__({                                            \
+        const uint64_t _mask[2] = {                            \
+            ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),   \
+            ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)};  \
+        uint64x2_t _mask_vec = vld1q_u64(_mask);               \
+        uint64x2_t _a = vreinterpretq_u64_m128d(a);            \
+        uint64x2_t _b = vreinterpretq_u64_m128d(b);            \
+        vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
+    })
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
+FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
+{
+    const uint32_t ALIGN_STRUCT(16)
+        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+    uint32x4_t mask = vld1q_u32(data);
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Blend packed 8-bit integers from a and b using mask, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
+FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint8x16_t mask =
+        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    uint8x16_t b = vreinterpretq_u8_m128i(_b);
+    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
+}
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
+FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
+{
+    uint64x2_t mask =
+        vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
+#if defined(__aarch64__)
+    float64x2_t a = vreinterpretq_f64_m128d(_a);
+    float64x2_t b = vreinterpretq_f64_m128d(_b);
+    return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
+#else
+    uint64x2_t a = vreinterpretq_u64_m128d(_a);
+    uint64x2_t b = vreinterpretq_u64_m128d(_b);
+    return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
+#endif
+}
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
+FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint32x4_t mask =
+        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Round the packed double-precision (64-bit) floating-point elements in a up
+// to an integer value, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double *f = (double *) &a;
+    return _mm_set_pd(ceil(f[1]), ceil(f[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a up to
+// an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
+FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
+#else
+    float *f = (float *) &a;
+    return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b up to
+// an integer value, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
+FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_ceil_pd(b));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b up to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
+FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_ceil_ps(b));
+}
+
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+    // ARMv7 lacks vceqq_u64
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
+
+// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_s32(s32x4);
+}
+
+// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
+// integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
+FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_u32(
+        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
+}
+
+// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
+{
+    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
+FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_u64(
+        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
+}
+
+// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
+    return vreinterpretq_m128i_u16(u16x8);
+}
+
+// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
+FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_u32(u32x4);
+}
+
+// Zero extend packed unsigned 8-bit integers in the low 8 byte sof a to packed
+// 64-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Conditionally multiply the packed double-precision (64-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products, and
+// conditionally store the sum in dst using the low 4 bits of imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
+FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
+{
+    // Generate mask value from constant immediate bit value
+    const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
+    const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
+#if !SSE2NEON_PRECISE_DP
+    const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
+    const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
+#endif
+    // Conditional multiplication
+#if !SSE2NEON_PRECISE_DP
+    __m128d mul = _mm_mul_pd(a, b);
+    const __m128d mulMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
+    __m128d tmp = _mm_and_pd(mul, mulMask);
+#else
+#if defined(__aarch64__)
+    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
+                             : 0;
+    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
+                             : 0;
+#else
+    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
+    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
+#endif
+    __m128d tmp = _mm_set_pd(d1, d0);
+#endif
+    // Sum the products
+#if defined(__aarch64__)
+    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
+#else
+    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
+#endif
+    // Conditionally store the sum
+    const __m128d sumMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
+    __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
+    return res;
+}
+
+// Conditionally multiply the packed single-precision (32-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products,
+// and conditionally store the sum in dst using the low 4 bits of imm.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
+FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
+{
+#if defined(__aarch64__)
+    /* shortcuts */
+    if (imm == 0xFF) {
+        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
+    }
+    if (imm == 0x7F) {
+        float32x4_t m = _mm_mul_ps(a, b);
+        m[3] = 0;
+        return _mm_set1_ps(vaddvq_f32(m));
+    }
+#endif
+
+    float s = 0, c = 0;
+    float32x4_t f32a = vreinterpretq_f32_m128(a);
+    float32x4_t f32b = vreinterpretq_f32_m128(b);
+
+    /* To improve the accuracy of floating-point summation, Kahan algorithm
+     * is used for each operation.
+     */
+    if (imm & (1 << 4))
+        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
+    if (imm & (1 << 5))
+        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
+    if (imm & (1 << 6))
+        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
+    if (imm & (1 << 7))
+        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
+    s += c;
+
+    float32x4_t res = {
+        (imm & 0x1) ? s : 0,
+        (imm & 0x2) ? s : 0,
+        (imm & 0x4) ? s : 0,
+        (imm & 0x8) ? s : 0,
+    };
+    return vreinterpretq_m128_f32(res);
+}
+
+// Extract a 32-bit integer from a, selected with imm8, and store the result in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
+// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
+#define _mm_extract_epi32(a, imm) \
+    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
+
+// Extract a 64-bit integer from a, selected with imm8, and store the result in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
+// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
+#define _mm_extract_epi64(a, imm) \
+    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
+
+// Extract an 8-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
+// __constrange(0,16) int imm)
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
+#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
+
+// Extracts the selected single-precision (32-bit) floating-point from a.
+// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
+#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
+
+// Round the packed double-precision (64-bit) floating-point elements in a down
+// to an integer value, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
+FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double *f = (double *) &a;
+    return _mm_set_pd(floor(f[1]), floor(f[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a down
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
+FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
+#else
+    float *f = (float *) &a;
+    return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b down to
+// an integer value, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
+FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_floor_pd(b));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b down to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
+FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_floor_ps(b));
+}
+
+// Copy a to dst, and insert the 32-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
+// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
+//                                       __constrange(0,4) int imm)
+#define _mm_insert_epi32(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s32(                                     \
+            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
+    })
+
+// Copy a to dst, and insert the 64-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
+// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
+//                                       __constrange(0,2) int imm)
+#define _mm_insert_epi64(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s64(                                     \
+            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
+    })
+
+// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
+// location specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
+// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
+//                                      __constrange(0,16) int imm)
+#define _mm_insert_epi8(a, b, imm)                                 \
+    __extension__({                                                \
+        vreinterpretq_m128i_s8(                                    \
+            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
+    })
+
+// Copy a to tmp, then insert a single-precision (32-bit) floating-point
+// element from b into tmp using the control in imm8. Store tmp to dst using
+// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
+#define _mm_insert_ps(a, b, imm8)                                              \
+    __extension__({                                                            \
+        float32x4_t tmp1 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3),               \
+                           vreinterpretq_f32_m128(a), 0);                      \
+        float32x4_t tmp2 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
+                           ((imm8 >> 4) & 0x3));                               \
+        const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 1)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 2)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 3)) ? UINT32_MAX : 0};       \
+        uint32x4_t mask = vld1q_u32(data);                                     \
+        float32x4_t all_zeros = vdupq_n_f32(0);                                \
+                                                                               \
+        vreinterpretq_m128_f32(                                                \
+            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2)));         \
+    })
+
+// Compare packed signed 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
+FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
+FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
+FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
+FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
+// in a, store the minimum and index in dst, and zero the remaining bits in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
+FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
+{
+    __m128i dst;
+    uint16_t min, idx = 0;
+#if defined(__aarch64__)
+    // Find the minimum value
+    min = vminvq_u16(vreinterpretq_u16_m128i(a));
+
+    // Get the index of the minimum value
+    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint16x8_t minv = vdupq_n_u16(min);
+    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
+    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
+#else
+    // Find the minimum value
+    __m64 tmp;
+    tmp = vreinterpret_m64_u16(
+        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
+                 vget_high_u16(vreinterpretq_u16_m128i(a))));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
+    // Get the index of the minimum value
+    int i;
+    for (i = 0; i < 8; i++) {
+        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
+            idx = (uint16_t) i;
+            break;
+        }
+        a = _mm_srli_si128(a, 2);
+    }
+#endif
+    // Generate result
+    dst = _mm_setzero_si128();
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
+    return dst;
+}
+
+// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
+// 8-bit integers in a compared to those in b, and store the 16-bit results in
+// dst. Eight SADs are performed using one quadruplet from b and eight
+// quadruplets from a. One quadruplet is selected from b starting at on the
+// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
+// integers selected from a starting at the offset specified in imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
+FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
+{
+    uint8x16_t _a, _b;
+
+    switch (imm & 0x4) {
+    case 0:
+        // do nothing
+        _a = vreinterpretq_u8_m128i(a);
+        break;
+    case 4:
+        _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
+                                            vreinterpretq_u32_m128i(a), 1));
+        break;
+    default:
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_unreachable();
+#endif
+        break;
+    }
+
+    switch (imm & 0x3) {
+    case 0:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
+        break;
+    case 1:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
+        break;
+    case 2:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
+        break;
+    case 3:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
+        break;
+    default:
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_unreachable();
+#endif
+        break;
+    }
+
+    int16x8_t c04, c15, c26, c37;
+    uint8x8_t low_b = vget_low_u8(_b);
+    c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
+    uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
+    c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
+    uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
+    c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
+    uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
+    c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
+#if defined(__aarch64__)
+    // |0|4|2|6|
+    c04 = vpaddq_s16(c04, c26);
+    // |1|5|3|7|
+    c15 = vpaddq_s16(c15, c37);
+
+    int32x4_t trn1_c =
+        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
+    int32x4_t trn2_c =
+        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
+    return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
+                                              vreinterpretq_s16_s32(trn2_c)));
+#else
+    int16x4_t c01, c23, c45, c67;
+    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
+    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
+    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
+    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
+
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
+#endif
+}
+
+// Multiply the low signed 32-bit integers from each packed 64-bit element in
+// a and b, and store the signed 64-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
+{
+    // vmull_s32 upcasts instead of masking, so we downcast.
+    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
+    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
+}
+
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
+// integers, and store the low 32 bits of the intermediate integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
+FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
+// using unsigned saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Round the packed double-precision (64-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
+FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
+{
+#if defined(__aarch64__)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return _mm_floor_pd(a);
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return _mm_ceil_pd(a);
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
+    }
+#else
+    double *v_double = (double *) &a;
+
+    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
+        (rounding == _MM_FROUND_CUR_DIRECTION &&
+         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
+        double res[2], tmp;
+        for (int i = 0; i < 2; i++) {
+            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
+            double roundDown = floor(tmp);  // Round down value
+            double roundUp = ceil(tmp);     // Round up value
+            double diffDown = tmp - roundDown;
+            double diffUp = roundUp - tmp;
+            if (diffDown < diffUp) {
+                /* If it's closer to the round down value, then use it */
+                res[i] = roundDown;
+            } else if (diffDown > diffUp) {
+                /* If it's closer to the round up value, then use it */
+                res[i] = roundUp;
+            } else {
+                /* If it's equidistant between round up and round down value,
+                 * pick the one which is an even number */
+                double half = roundDown / 2;
+                if (half != floor(half)) {
+                    /* If the round down value is odd, return the round up value
+                     */
+                    res[i] = roundUp;
+                } else {
+                    /* If the round up value is odd, return the round down value
+                     */
+                    res[i] = roundDown;
+                }
+            }
+            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
+        }
+        return _mm_set_pd(res[1], res[0]);
+    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
+        return _mm_floor_pd(a);
+    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
+        return _mm_ceil_pd(a);
+    }
+    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
+                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed single-precision
+// floating-point elements in dst.
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
+FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return _mm_floor_ps(a);
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return _mm_ceil_ps(a);
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
+    }
+#else
+    float *v_float = (float *) &a;
+
+    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
+        (rounding == _MM_FROUND_CUR_DIRECTION &&
+         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
+        uint32x4_t signmask = vdupq_n_u32(0x80000000);
+        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
+        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+        int32x4_t r_trunc = vcvtq_s32_f32(
+            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+        float32x4_t delta = vsubq_f32(
+            vreinterpretq_f32_m128(a),
+            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+        uint32x4_t is_delta_half =
+            vceqq_f32(delta, half); /* delta == +/- 0.5 */
+        return vreinterpretq_m128_f32(
+            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
+    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
+        return _mm_floor_ps(a);
+    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
+        return _mm_ceil_ps(a);
+    }
+    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
+                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
+                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
+                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b using
+// the rounding parameter, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
+FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
+{
+    return _mm_move_sd(a, _mm_round_pd(b, rounding));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b using
+// the rounding parameter, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst. Rounding is done according to the
+// rounding[3:0] parameter, which can be one of:
+//     (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
+//     suppress exceptions
+//     (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and
+//     suppress exceptions
+//     (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress
+//     exceptions
+//     (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress
+//     exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
+//     _MM_SET_ROUNDING_MODE
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
+FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
+{
+    return _mm_move_ss(a, _mm_round_ps(b, rounding));
+}
+
+// Load 128-bits of integer data from memory into dst using a non-temporal
+// memory hint. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
+FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    return __builtin_nontemporal_load(p);
+#else
+    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
+#endif
+}
+
+// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
+// all 1's, and return 1 if the result is zero, otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
+FORCE_INLINE int _mm_test_all_ones(__m128i a)
+{
+    return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
+           ~(uint64_t) 0;
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and return 1 if the result is zero, otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
+FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
+{
+    int64x2_t a_and_mask =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
+    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
+// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
+// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
+// otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
+FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
+{
+    uint64x2_t zf =
+        vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
+    uint64x2_t cf =
+        vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
+    uint64x2_t result = vandq_u64(zf, cf);
+    return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the CF value.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
+FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
+{
+    int64x2_t s64 =
+        vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
+// otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
+#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the ZF value.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
+FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
+{
+    int64x2_t s64 =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+/* SSE4.2 */
+
+const static uint16_t _sse2neon_cmpestr_mask16b[8] ALIGN_STRUCT(16) = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+};
+const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+};
+
+/* specify the source data format */
+#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
+#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
+#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
+#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
+
+/* specify the comparison operation */
+#define _SIDD_CMP_EQUAL_ANY 0x00     /* compare equal any: strchr */
+#define _SIDD_CMP_RANGES 0x04        /* compare ranges */
+#define _SIDD_CMP_EQUAL_EACH 0x08    /* compare equal each: strcmp */
+#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
+
+/* specify the polarity */
+#define _SIDD_POSITIVE_POLARITY 0x00
+#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
+#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
+#define _SIDD_MASKED_NEGATIVE_POLARITY \
+    0x30 /* negate results only before end of string */
+
+/* specify the output selection in _mm_cmpXstri */
+#define _SIDD_LEAST_SIGNIFICANT 0x00
+#define _SIDD_MOST_SIGNIFICANT 0x40
+
+/* specify the output selection in _mm_cmpXstrm */
+#define _SIDD_BIT_MASK 0x00
+#define _SIDD_UNIT_MASK 0x40
+
+/* Pattern Matching for C macros.
+ * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
+ */
+
+/* catenate */
+#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
+#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
+
+#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
+/* run the 2nd parameter */
+#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
+/* run the 1st parameter */
+#define SSE2NEON_IIF_1(t, ...) t
+
+#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
+#define SSE2NEON_COMPL_0 1
+#define SSE2NEON_COMPL_1 0
+
+#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
+#define SSE2NEON_DEC_1 0
+#define SSE2NEON_DEC_2 1
+#define SSE2NEON_DEC_3 2
+#define SSE2NEON_DEC_4 3
+#define SSE2NEON_DEC_5 4
+#define SSE2NEON_DEC_6 5
+#define SSE2NEON_DEC_7 6
+#define SSE2NEON_DEC_8 7
+#define SSE2NEON_DEC_9 8
+#define SSE2NEON_DEC_10 9
+#define SSE2NEON_DEC_11 10
+#define SSE2NEON_DEC_12 11
+#define SSE2NEON_DEC_13 12
+#define SSE2NEON_DEC_14 13
+#define SSE2NEON_DEC_15 14
+#define SSE2NEON_DEC_16 15
+
+/* detection */
+#define SSE2NEON_CHECK_N(x, n, ...) n
+#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
+#define SSE2NEON_PROBE(x) x, 1,
+
+#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
+#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
+
+#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
+#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
+
+#define SSE2NEON_EAT(...)
+#define SSE2NEON_EXPAND(...) __VA_ARGS__
+#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
+
+/* recursion */
+/* deferred expression */
+#define SSE2NEON_EMPTY()
+#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
+#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
+#define SSE2NEON_EXPAND(...) __VA_ARGS__
+
+#define SSE2NEON_EVAL(...) \
+    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
+#define SSE2NEON_EVAL1(...) \
+    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
+#define SSE2NEON_EVAL2(...) \
+    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
+#define SSE2NEON_EVAL3(...) __VA_ARGS__
+
+#define SSE2NEON_REPEAT(count, macro, ...)                         \
+    SSE2NEON_WHEN(count)                                           \
+    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()(                \
+        SSE2NEON_DEC(count), macro,                                \
+        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
+                                              __VA_ARGS__))
+#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
+
+#define SSE2NEON_SIZE_OF_byte 8
+#define SSE2NEON_NUMBER_OF_LANES_byte 16
+#define SSE2NEON_SIZE_OF_word 16
+#define SSE2NEON_NUMBER_OF_LANES_word 8
+
+#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type)                         \
+    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type(                          \
+        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
+        vreinterpretq_##type##_m128i(a)));
+
+#define SSE2NEON_FILL_LANE(i, type) \
+    vec_b[i] =                      \
+        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
+
+#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size,        \
+                       number_of_lanes, byte_or_word)                         \
+    do {                                                                      \
+        SSE2NEON_CAT(                                                         \
+            data_type_prefix,                                                 \
+            SSE2NEON_CAT(size,                                                \
+                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
+        vec_b[number_of_lanes];                                               \
+        __m128i mask = SSE2NEON_IIF(byte_or_word)(                            \
+            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)),                       \
+            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff)));                    \
+        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE,    \
+                                      SSE2NEON_CAT(type_prefix, size)))       \
+        for (int i = 0; i < number_of_lanes; i++) {                           \
+            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u,                      \
+                                  size)(SSE2NEON_CAT(vbslq_u, size)(          \
+                SSE2NEON_CAT(vreinterpretq_u,                                 \
+                             SSE2NEON_CAT(size, _m128i))(mask),               \
+                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))(        \
+                    vec_b[i],                                                 \
+                    SSE2NEON_CAT(                                             \
+                        vreinterpretq_,                                       \
+                        SSE2NEON_CAT(type_prefix,                             \
+                                     SSE2NEON_CAT(size, _m128i(a))))),        \
+                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))(        \
+                    vec_b[i],                                                 \
+                    SSE2NEON_CAT(                                             \
+                        vreinterpretq_,                                       \
+                        SSE2NEON_CAT(type_prefix,                             \
+                                     SSE2NEON_CAT(size, _m128i(a)))))));      \
+        }                                                                     \
+    } while (0)
+
+#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes)                         \
+    do {                                                                     \
+        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes,                       \
+                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
+                                      SSE2NEON_CAT(u, size)))                \
+    } while (0)
+
+#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
+    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
+                                                int lb)                       \
+    {                                                                         \
+        __m128i mtx[16];                                                      \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
+        return SSE2NEON_CAT(                                                  \
+            _sse2neon_aggregate_equal_any_,                                   \
+            SSE2NEON_CAT(                                                     \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
+                                             type))))(la, lb, mtx);           \
+    }
+
+#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
+    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
+                                                 int lb)                       \
+    {                                                                          \
+        __m128i mtx[16];                                                       \
+        PCMPSTR_RANGES(                                                        \
+            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
+        return SSE2NEON_CAT(                                                   \
+            _sse2neon_aggregate_ranges_,                                       \
+            SSE2NEON_CAT(                                                      \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
+                                             type))))(la, lb, mtx);            \
+    }
+
+#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                  \
+    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,         \
+                                                    __m128i b, int lb)         \
+    {                                                                          \
+        __m128i mtx[16];                                                       \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),           \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));             \
+        return SSE2NEON_CAT(                                                   \
+            _sse2neon_aggregate_equal_ordered_,                                \
+            SSE2NEON_CAT(                                                      \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
+                SSE2NEON_CAT(x,                                                \
+                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);       \
+    }
+
+static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u8(
+            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u8(
+            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
+        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
+        res |= (tmp << j);
+    }
+    return res;
+}
+
+static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint16x8_t vec =
+        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u16(
+            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
+        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
+        res |= (tmp << j);
+    }
+    return res;
+}
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
+    prefix##IMPL(byte) \
+    prefix##IMPL(word)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
+
+static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint16x8_t vec =
+        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u16(
+            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
+        __m128i tmp = vreinterpretq_m128i_u32(
+            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
+        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
+                                       vreinterpretq_u32_m128i(tmp));
+#if defined(__aarch64__)
+        int t = vaddvq_u32(vec_res) ? 1 : 0;
+#else
+        uint64x2_t sumh = vpaddlq_u32(vec_res);
+        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+        res |= (t << j);
+    }
+    return res;
+}
+
+static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u8(
+            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u8(
+            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
+        __m128i tmp = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
+        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
+                                       vreinterpretq_u16_m128i(tmp));
+        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
+        res |= (t << j);
+    }
+    return res;
+}
+
+#define SSE2NEON_CMP_RANGES_IS_BYTE 1
+#define SSE2NEON_CMP_RANGES_IS_WORD 0
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_RANGES(prefix)             \
+    prefix##IMPL(byte, uint, u, prefix##IS_BYTE)         \
+    prefix##IMPL(byte, int, s, prefix##IS_BYTE)          \
+    prefix##IMPL(word, uint, u, prefix##IS_WORD)         \
+    prefix##IMPL(word, int, s, prefix##IS_WORD)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
+
+#undef SSE2NEON_CMP_RANGES_IS_BYTE
+#undef SSE2NEON_CMP_RANGES_IS_WORD
+
+static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
+{
+    uint8x16_t mtx =
+        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
+    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    int m1 = 0x10000 - (1 << la);
+    int tb = 0x10000 - (1 << lb);
+    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
+    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
+    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
+    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
+    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
+    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
+    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
+    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
+
+    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
+    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
+    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
+    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
+    res_lo = vand_u8(res_lo, vec_mask);
+    res_hi = vand_u8(res_hi, vec_mask);
+
+    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
+    return res;
+}
+
+static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
+{
+    uint16x8_t mtx =
+        vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    int m1 = 0x100 - (1 << la);
+    int tb = 0x100 - (1 << lb);
+    uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
+    uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
+    uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
+    uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
+    mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
+    mtx = vbslq_u16(vec1, tmp, mtx);
+    mtx = vandq_u16(mtx, vec_mask);
+    return _sse2neon_vaddvq_u16(mtx);
+}
+
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
+
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
+    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
+        int bound, int la, int lb, __m128i mtx[16])                            \
+    {                                                                          \
+        int res = 0;                                                           \
+        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
+        uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
+            vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
+            vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
+        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
+            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
+                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
+            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
+        uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
+        uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
+        for (int j = 0; j < lb; j++) {                                         \
+            mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size(                \
+                vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j])));   \
+        }                                                                      \
+        for (int j = lb; j < bound; j++) {                                     \
+            mtx[j] = vreinterpretq_m128i_u##size(                              \
+                vbslq_u##size(vec1, vec_minusone, vec_zero));                  \
+        }                                                                      \
+        unsigned SSE2NEON_IIF(data_type)(char, short) *ptr =                   \
+            (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx;             \
+        for (int i = 0; i < bound; i++) {                                      \
+            int val = 1;                                                       \
+            for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
+                val &= ptr[k * bound + j];                                     \
+            res += val << i;                                                   \
+        }                                                                      \
+        return res;                                                            \
+    }
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
+    prefix##IMPL(8, 16, prefix##IS_UBYTE)               \
+    prefix##IMPL(16, 8, prefix##IS_UWORD)
+/* clang-format on */
+
+SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
+
+#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
+#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
+    prefix##IMPL(byte)                              \
+    prefix##IMPL(word)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
+
+#define SSE2NEON_CMPESTR_LIST                          \
+    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
+    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any)         \
+    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
+    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any)         \
+    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges)              \
+    _(CMP_UWORD_RANGES, cmp_uword_ranges)              \
+    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges)              \
+    _(CMP_SWORD_RANGES, cmp_sword_ranges)              \
+    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
+    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each)       \
+    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
+    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each)       \
+    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
+    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
+
+enum {
+#define _(name, func_suffix) name,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
+static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
+#define _(name, func_suffix) _sse2neon_##func_suffix,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+
+FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
+{
+    switch (imm8 & 0x30) {
+    case _SIDD_NEGATIVE_POLARITY:
+        res ^= 0xffffffff;
+        break;
+    case _SIDD_MASKED_NEGATIVE_POLARITY:
+        res ^= (1 << lb) - 1;
+        break;
+    default:
+        break;
+    }
+
+    return res & ((bound == 8) ? 0xFF : 0xFFFF);
+}
+
+FORCE_INLINE int _sse2neon_clz(unsigned int x)
+{
+#if _MSC_VER
+    DWORD cnt = 0;
+    if (_BitScanForward(&cnt, x))
+        return cnt;
+    return 32;
+#else
+    return x != 0 ? __builtin_clz(x) : 32;
+#endif
+}
+
+FORCE_INLINE int _sse2neon_ctz(unsigned int x)
+{
+#if _MSC_VER
+    DWORD cnt = 0;
+    if (_BitScanReverse(&cnt, x))
+        return 31 - cnt;
+    return 32;
+#else
+    return x != 0 ? __builtin_ctz(x) : 32;
+#endif
+}
+
+FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
+{
+#if _MSC_VER
+    unsigned long cnt;
+#ifdef defined(SSE2NEON_HAS_BITSCAN64)
+    (defined(_M_AMD64) || defined(__x86_64__))
+        if((_BitScanForward64(&cnt, x))
+            return (int)(cnt);
+#else
+    if (_BitScanForward(&cnt, (unsigned long) (x)))
+        return (int) cnt;
+    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
+        return (int) (cnt + 32);
+#endif
+    return 64;
+#else
+    return x != 0 ? __builtin_ctzll(x) : 64;
+#endif
+}
+
+#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
+
+#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
+    const int var = (imm & 0x01) ? 8 : 16
+
+#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
+    int tmp1 = la ^ (la >> 31);                  \
+    la = tmp1 - (la >> 31);                      \
+    int tmp2 = lb ^ (lb >> 31);                  \
+    lb = tmp2 - (lb >> 31);                      \
+    la = SSE2NEON_MIN(la, bound);                \
+    lb = SSE2NEON_MIN(lb, bound)
+
+// Compare all pairs of character in string a and b,
+// then aggregate the result.
+// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
+// length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
+// string a and b.
+#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
+    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
+    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
+    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
+
+#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
+    return (r2 == 0) ? bound                                     \
+                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
+                                      : _sse2neon_ctz(r2))
+
+#define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
+    __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
+    if (imm8 & 0x40) {                                                         \
+        if (bound == 8) {                                                      \
+            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
+                                       vld1q_u16(_sse2neon_cmpestr_mask16b));  \
+            dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
+                tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
+        } else {                                                               \
+            uint8x16_t vec_r2 =                                                \
+                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
+            uint8x16_t tmp =                                                   \
+                vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
+            dst = vreinterpretq_m128i_u8(                                      \
+                vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst)));   \
+        }                                                                      \
+    } else {                                                                   \
+        if (bound == 16) {                                                     \
+            dst = vreinterpretq_m128i_u16(                                     \
+                vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
+        } else {                                                               \
+            dst = vreinterpretq_m128i_u8(                                      \
+                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
+        }                                                                      \
+    }                                                                          \
+    return dst
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and returns 1 if b did not contain a null character and the
+// resulting mask was zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
+FORCE_INLINE int _mm_cmpestra(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    int lb_cpy = lb;
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return !r2 & (lb_cpy > bound);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
+FORCE_INLINE int _mm_cmpestrc(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return r2 != 0;
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and store the generated index in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
+FORCE_INLINE int _mm_cmpestri(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and store the generated mask in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
+FORCE_INLINE __m128i
+_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns bit 0 of the resulting bit mask.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
+FORCE_INLINE int _mm_cmpestro(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return r2 & 1;
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns 1 if any character in a was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
+FORCE_INLINE int _mm_cmpestrs(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    return la <= (bound - 1);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns 1 if any character in b was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
+FORCE_INLINE int _mm_cmpestrz(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    return lb <= (bound - 1);
+}
+
+#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
+    do {                                                                 \
+        if (imm8 & 0x01) {                                               \
+            uint16x8_t equal_mask_##str =                                \
+                vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
+            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
+            uint64_t matches_##str =                                     \
+                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
+            len = _sse2neon_ctzll(matches_##str) >> 3;                   \
+        } else {                                                         \
+            uint16x8_t equal_mask_##str = vreinterpretq_u16_u8(          \
+                vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0)));   \
+            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
+            uint64_t matches_##str =                                     \
+                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
+            len = _sse2neon_ctzll(matches_##str) >> 2;                   \
+        }                                                                \
+    } while (0)
+
+#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
+    int la, lb;                                  \
+    do {                                         \
+        SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);   \
+        SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);   \
+    } while (0)
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if b did not contain a null character and the resulting
+// mask was zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
+FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return !r2 & (lb >= bound);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
+FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return r2 != 0;
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and store the generated index in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
+FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and store the generated mask in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
+FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns bit 0 of the resulting bit mask.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
+FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return r2 & 1;
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if any character in a was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
+FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    int la;
+    SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
+    return la <= (bound - 1);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if any character in b was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
+FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    int lb;
+    SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
+    return lb <= (bound - 1);
+}
+
+// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
+// in b for greater than.
+FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    return vreinterpretq_m128i_s64(vshrq_n_s64(
+        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
+        63));
+#endif
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 16-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
+FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32ch(crc, v);
+#else
+    crc = _mm_crc32_u8(crc, v & 0xff);
+    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 32-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
+FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32cw(crc, v);
+#else
+    crc = _mm_crc32_u16(crc, v & 0xffff);
+    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 64-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
+FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#else
+    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32cb(crc, v);
+#else
+    crc ^= v;
+    for (int bit = 0; bit < 8; bit++) {
+        if (crc & 1)
+            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
+        else
+            crc = (crc >> 1);
+    }
+#endif
+    return crc;
+}
+
+/* AES */
+
+#if !defined(__ARM_FEATURE_CRYPTO)
+/* clang-format off */
+#define SSE2NEON_AES_SBOX(w)                                           \
+    {                                                                  \
+        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
+        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
+        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
+        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
+        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
+        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
+        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
+        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
+        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
+        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
+        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
+        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
+        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
+        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
+        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
+        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
+        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
+        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
+        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
+        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
+        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
+        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
+        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
+        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
+        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
+        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
+        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
+        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
+        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
+        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
+        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
+        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
+        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
+        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
+        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
+        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
+        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
+    }
+#define SSE2NEON_AES_RSBOX(w)                                          \
+    {                                                                  \
+        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
+        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
+        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
+        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
+        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
+        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
+        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
+        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
+        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
+        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
+        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
+        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
+        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
+        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
+        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
+        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
+        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
+        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
+        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
+        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
+        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
+        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
+        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
+        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
+        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
+        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
+        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
+        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
+        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
+        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
+        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
+        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
+        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
+        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
+        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
+        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
+        w(0x55), w(0x21), w(0x0c), w(0x7d)                             \
+    }
+/* clang-format on */
+
+/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
+#define SSE2NEON_AES_H0(x) (x)
+static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
+static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
+#undef SSE2NEON_AES_H0
+
+/* x_time function and matrix multiply function */
+#if !defined(__aarch64__)
+#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
+#define SSE2NEON_MULTIPLY(x, y)                                  \
+    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
+     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
+     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
+     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
+#endif
+
+// In the absence of crypto extensions, implement aesenc using regular NEON
+// intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
+// for more information.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t shift_rows[] = {
+        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+    };
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    /* shift rows */
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    /* sub bytes */
+    // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
+    // look up each of the table. After each lookup, we load the next table
+    // which locates at the next 64-bytes. In the meantime, the index in the
+    // table would be smaller than it was, so the index parameters of
+    // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
+    // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
+
+    /* mix columns */
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    /* add round key */
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A implementation for a table-based AES */
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
+    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
+     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
+// muliplying 'x' by 2 in GF(2^8)
+#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
+// muliplying 'x' by 3 in GF(2^8)
+#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
+#define SSE2NEON_AES_U0(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
+#define SSE2NEON_AES_U1(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
+#define SSE2NEON_AES_U2(p) \
+    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
+#define SSE2NEON_AES_U3(p) \
+    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
+
+    // this generates a table containing every possible permutation of
+    // shift_rows() and sub_bytes() with mix_columns().
+    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
+    };
+#undef SSE2NEON_AES_B2W
+#undef SSE2NEON_AES_F2
+#undef SSE2NEON_AES_F3
+#undef SSE2NEON_AES_U0
+#undef SSE2NEON_AES_U1
+#undef SSE2NEON_AES_U2
+#undef SSE2NEON_AES_U3
+
+    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
+    uint32_t x1 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
+    uint32_t x2 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
+    uint32_t x3 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]
+
+    // finish the modulo addition step in mix_columns()
+    __m128i out = _mm_set_epi32(
+        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
+         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
+        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
+         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
+        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
+         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
+        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
+         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
+
+    return _mm_xor_si128(out, RoundKey);
+#endif
+}
+
+// Perform one round of an AES decryption flow on data (state) in a using the
+// round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
+FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t inv_shift_rows[] = {
+        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
+        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
+    };
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // inverse shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
+
+    // inverse sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
+
+    // inverse mix columns
+    // muliplying 'v' by 4 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
+    v ^= w;
+    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
+
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
+                                 0x1b);  // muliplying 'v' by 2 in GF(2^8)
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    // add round key
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+    /* FIXME: optimized for NEON */
+    uint8_t i, e, f, g, h, v[4][4];
+    uint8_t *_a = (uint8_t *) &a;
+    for (i = 0; i < 16; ++i) {
+        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
+    }
+
+    // inverse mix columns
+    for (i = 0; i < 4; ++i) {
+        e = v[i][0];
+        f = v[i][1];
+        g = v[i][2];
+        h = v[i][3];
+
+        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
+                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
+        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
+                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
+        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
+                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
+        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
+                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+#endif
+}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t shift_rows[] = {
+        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    // sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
+
+    // add round key
+    return vreinterpretq_m128i_u8(v) ^ RoundKey;
+
+#else /* ARMv7-A implementation */
+    uint8_t v[16] = {
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
+    };
+
+    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
+#endif
+}
+
+// Perform the last round of an AES decryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t inv_shift_rows[] = {
+        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
+        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // inverse shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
+
+    // inverse sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
+
+    // add round key
+    return vreinterpretq_m128i_u8(v) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+    /* FIXME: optimized for NEON */
+    uint8_t v[4][4];
+    uint8_t *_a = (uint8_t *) &a;
+    for (int i = 0; i < 16; ++i) {
+        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+#endif
+}
+
+// Perform the InvMixColumns transformation on a and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
+{
+#if defined(__aarch64__)
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+    uint8x16_t v = vreinterpretq_u8_m128i(a);
+    uint8x16_t w;
+
+    // multiplying 'v' by 4 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
+    v ^= w;
+    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
+
+    // multiplying 'v' by 2 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+    return vreinterpretq_m128i_u8(w);
+
+#else /* ARMv7-A NEON implementation */
+    uint8_t i, e, f, g, h, v[4][4];
+    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
+    for (i = 0; i < 4; ++i) {
+        e = v[i][0];
+        f = v[i][1];
+        g = v[i][2];
+        h = v[i][3];
+
+        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
+                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
+        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
+                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
+        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
+                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
+        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
+                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
+#endif
+}
+
+// Assist in expanding the AES cipher key by computing steps towards generating
+// a round key for encryption cipher using data from a and an 8-bit round
+// constant specified in imm8, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
+//
+// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
+// This instruction generates a round key for AES encryption. See
+// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
+// for details.
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+#if defined(__aarch64__)
+    uint8x16_t _a = vreinterpretq_u8_m128i(a);
+    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
+
+    uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
+    uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
+    uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
+
+    return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
+
+#else /* ARMv7-A NEON implementation */
+    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
+    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
+    for (int i = 0; i < 4; ++i) {
+        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
+        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
+    }
+    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
+                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
+#endif
+}
+#undef SSE2NEON_AES_SBOX
+#undef SSE2NEON_AES_RSBOX
+
+#if defined(__aarch64__)
+#undef SSE2NEON_XT
+#undef SSE2NEON_MULTIPLY
+#endif
+
+#else /* __ARM_FEATURE_CRYPTO */
+// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
+// AESMC and then manually applying the real key as an xor operation. This
+// unfortunately means an additional xor op; the compiler should be able to
+// optimize this away for repeated calls however. See
+// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
+// for more details.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
+        vreinterpretq_u8_m128i(b));
+}
+
+// Perform one round of an AES decryption flow on data (state) in a using the
+// round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
+FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
+{
+    return vreinterpretq_m128i_u8(veorq_u8(
+        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+        vreinterpretq_u8_m128i(RoundKey)));
+}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
+                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+                         RoundKey);
+}
+
+// Perform the last round of an AES decryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
+{
+    return vreinterpretq_m128i_u8(
+        vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^
+        vreinterpretq_u8_m128i(RoundKey));
+}
+
+// Perform the InvMixColumns transformation on a and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
+{
+    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
+}
+
+// Assist in expanding the AES cipher key by computing steps towards generating
+// a round key for encryption cipher using data from a and an 8-bit round
+// constant specified in imm8, and store the result in dst."
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+    // AESE does ShiftRows and SubBytes on A
+    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
+
+    uint8x16_t dest = {
+        // Undo ShiftRows step from AESE and extract X1 and X3
+        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
+        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
+        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
+        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
+    };
+    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
+    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
+}
+#endif
+
+/* Others */
+
+// Perform a carry-less multiplication of two 64-bit integers, selected from a
+// and b according to imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
+FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
+{
+    uint64x2_t a = vreinterpretq_u64_m128i(_a);
+    uint64x2_t b = vreinterpretq_u64_m128i(_b);
+    switch (imm & 0x11) {
+    case 0x00:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
+    case 0x01:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
+    case 0x10:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
+    case 0x11:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
+    default:
+        abort();
+    }
+}
+
+FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
+}
+
+// Count the number of bits set to 1 in unsigned 32-bit integer a, and
+// return that count in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
+FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcount)
+    return __builtin_popcount(a);
+#else
+    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
+#endif
+#else
+    uint32_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+
+    vst1_u32(&count, count32x2_val);
+    return count;
+#endif
+}
+
+// Count the number of bits set to 1 in unsigned 64-bit integer a, and
+// return that count in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
+FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcountll)
+    return __builtin_popcountll(a);
+#else
+    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
+#endif
+#else
+    uint64_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+    uint64x1_t count64x1_val;
+
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+    count64x1_val = vpaddl_u32(count32x2_val);
+    vst1_u64(&count, count64x1_val);
+    return count;
+#endif
+}
+
+FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
+{
+    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+    // regardless of the value of the FZ bit.
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+// Return the current 64-bit value of the processor's time-stamp counter.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
+FORCE_INLINE uint64_t _rdtsc(void)
+{
+#if defined(__aarch64__)
+    uint64_t val;
+
+    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
+     * system counter is at least 56 bits wide; from Armv8.6, the counter
+     * must be 64 bits wide.  So the system counter could be less than 64
+     * bits wide and it is attributed with the flag 'cap_user_time_short'
+     * is true.
+     */
+    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
+
+    return val;
+#else
+    uint32_t pmccntr, pmuseren, pmcntenset;
+    // Read the user mode Performance Monitoring Unit (PMU)
+    // User Enable Register (PMUSERENR) access permissions.
+    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
+    if (pmuseren & 1) {  // Allows reading PMUSERENR for user mode code.
+        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
+        if (pmcntenset & 0x80000000UL) {  // Is it counting?
+            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
+            // The counter is set up to count every 64th cycle
+            return (uint64_t) (pmccntr) << 6;
+        }
+    }
+
+    // Fallback to syscall as we can't enable PMUSERENR in user mode.
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
+#endif
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma pop_macro("ALIGN_STRUCT")
+#pragma pop_macro("FORCE_INLINE")
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
+
+#endif
\ No newline at end of file
diff --git a/share/cmake/utils/CheckSupportARMNeon.cmake b/share/cmake/utils/CheckSupportARMNeon.cmake
new file mode 100644
index 0000000000..1123e75f52
--- /dev/null
+++ b/share/cmake/utils/CheckSupportARMNeon.cmake
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+include(CheckCXXSourceCompiles)
+
+set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
+set(CMAKE_REQUIRED_FLAGS "-march=armv8-a+fp+simd+crypto+crc")
+
+check_cxx_source_compiles ("
+    #include <arm_neon.h>
+    int main()
+    {
+        float32x4_t v = vdupq_n_f32(0);
+        return 0;
+    }"
+    HAVE_NEON)
+
+set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
+unset(_cmake_required_flags_old)
+
+mark_as_advanced(HAVE_NEON)
diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index f30bbb763c..5abd707cdb 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -4,6 +4,7 @@
 include(CheckCXXSourceCompiles)
 
 set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
+set(_cmake_required_libraries_old "${CMAKE_REQUIRED_LIBRARIES}")
 
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger
@@ -16,20 +17,30 @@ if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     endif()
 endif()
 
-check_cxx_source_compiles ("
-    #include <emmintrin.h>
-    int main ()
-    {
-        __m128d a, b;
-        double vals[2] = {0};
-        a = _mm_loadu_pd (vals);
-        b = _mm_add_pd (a,a);
-        _mm_storeu_pd (vals,b);
-        return (0);
-    }"
-    HAVE_SSE2)
+set(_SSE2_HEADER "#include <emmintrin.h>")
+if (HAVE_NEON)
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -march=armv8-a+fp+simd+crypto+crc")
+    set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+    set(_SSE2_HEADER "#include <sse2neon.h>")
+endif()
+
+set(_SSE2_TEST_SOURCE_CODE "
+${_SSE2_HEADER}
+int main ()
+{
+    __m128d a, b;
+    double vals[2] = {0};
+    a = _mm_loadu_pd (vals);
+    b = _mm_add_pd (a,a);
+    _mm_storeu_pd (vals,b);
+    return (0);
+}")
+
+check_cxx_source_compiles ("${_SSE2_TEST_SOURCE_CODE}" HAVE_SSE2)
 
 set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
+set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_old}")
 unset(_cmake_required_flags_old)
+unset(_cmake_required_libraries_old)
 
 mark_as_advanced(HAVE_SSE2)
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index 9e56d55ae8..1ee16596bd 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -7,6 +7,20 @@
 
 set(PLATFORM_COMPILE_FLAGS "")
 
+###############################################################################
+# Define if SSE2 can be used.
+# Check for SSE2 first since some compile flags need to be set on Apple ARM.
+
+include(CheckSupportSSE2)
+
+if(NOT HAVE_SSE2)
+    message(STATUS "Disabling SSE optimizations, as the target doesn't support them")
+    set(OCIO_USE_SSE OFF)
+endif(NOT HAVE_SSE2)
+
+###############################################################################
+# Compile flags
+
 if(USE_MSVC)
 
     set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} /DUSE_MSVC")
@@ -39,7 +53,9 @@ elseif(USE_CLANG)
 
     # Use of 'register' specifier must be removed for C++17 support.
     set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -Wno-deprecated-register")
-
+    if (HAVE_SSE2 AND HAVE_NEON)
+        set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -march=armv8-a+fp+simd+crypto+crc")
+    endif()
 elseif(USE_GCC)
 
     set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -DUSE_GCC")
@@ -82,18 +98,6 @@ set_unless_defined(CMAKE_C_VISIBILITY_PRESET hidden)
 set_unless_defined(CMAKE_CXX_VISIBILITY_PRESET hidden)
 set_unless_defined(CMAKE_VISIBILITY_INLINES_HIDDEN YES)
 
-
-###############################################################################
-# Define if SSE2 can be used.
-
-include(CheckSupportSSE2)
-
-if(NOT HAVE_SSE2)
-    message(STATUS "Disabling SSE optimizations, as the target doesn't support them")
-    set(OCIO_USE_SSE OFF)
-endif(NOT HAVE_SSE2)
-
-
 ###############################################################################
 # Define RPATH.
 
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 1c4d774ddb..43a2ca75fb 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -292,6 +292,13 @@ target_link_libraries(OpenColorIO
         MINIZIP::minizip-ng
 )
 
+if(OCIO_USE_SSE AND HAVE_NEON)
+    target_link_libraries(OpenColorIO
+        PRIVATE
+            "$<BUILD_INTERFACE:sse2neon>"
+    )
+endif()
+
 if(APPLE)
     target_link_libraries(OpenColorIO
         PRIVATE
@@ -327,6 +334,13 @@ if(OCIO_USE_SSE)
         PRIVATE
             USE_SSE
     )
+
+    if(HAVE_NEON)
+        target_compile_definitions(OpenColorIO
+            PRIVATE
+                USE_SSE2NEON
+        )
+    endif()
 endif()
 
 if(MSVC AND BUILD_TYPE_DEBUG AND BUILD_SHARED_LIBS)
@@ -354,11 +368,11 @@ if(WIN32)
 endif()
 
 set_target_properties(OpenColorIO PROPERTIES
-    OUTPUT_NAME   ${PROJECT_NAME}${OCIO_LIBNAME_SUFFIX}
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}"
-    VERSION       ${OpenColorIO_VERSION}
-    SOVERSION     ${SOVERSION}
-    PUBLIC_HEADER "${INSTALL_HEADERS}"
+    OUTPUT_NAME     ${PROJECT_NAME}${OCIO_LIBNAME_SUFFIX}
+    COMPILE_FLAGS   "${PLATFORM_COMPILE_FLAGS}"
+    VERSION         ${OpenColorIO_VERSION}
+    SOVERSION       ${SOVERSION}
+    PUBLIC_HEADER   "${INSTALL_HEADERS}"
 )
 
 if(UNIX AND NOT APPLE)
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index a903120ec7..71b7faf6f9 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -8,11 +8,13 @@
 
 #ifdef USE_SSE
 
-
-#include <emmintrin.h>
+#ifndef USE_SSE2NEON
+    #include <emmintrin.h>
+#else
+    #include <sse2neon.h>
+#endif
 #include <stdio.h>
 
-
 #include <OpenColorIO/OpenColorIO.h>
 
 
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index e78a86bede..102bed48c6 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -27,6 +27,13 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
             xxHash
     )
 
+    if(OCIO_USE_SSE AND HAVE_NEON)
+        target_link_libraries(${TEST_BINARY}
+            PRIVATE
+                sse2neon
+        )
+    endif()
+
     if(APPLE)
         # Frameworks needed to access the ICC monitor profile. 
         target_link_libraries(${TEST_BINARY}
@@ -43,12 +50,21 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
                 "${CMAKE_BINARY_DIR}/generated_include"
         )
     endif(PRIVATE_INCLUDES)
+
     if(OCIO_USE_SSE)
         target_compile_definitions(${TEST_BINARY}
             PRIVATE
                 USE_SSE
         )
+
+        if(HAVE_NEON)
+            target_compile_definitions(${TEST_BINARY}
+                PRIVATE
+                    USE_SSE2NEON
+            )
+        endif()
     endif(OCIO_USE_SSE)
+
     if(WIN32)
         # A windows application linking to eXpat static libraries must
         # have the global macro XML_STATIC defined
@@ -66,6 +82,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
             )
         endif()
     endif(WIN32)
+
     set_target_properties(${TEST_BINARY} PROPERTIES
         COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
 

From 60434ac3700f37e08d5465f00fee092cb81d9afa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 9 Feb 2023 10:48:28 -0500
Subject: [PATCH 02/81] Overwriting sse2neon implementation for SSE2 _mm_max_ps
 and _mm_min_ps to fix issues with NaN handling. Fixing issues in the unit
 tests for some tests. For some tests, the neon implementation is matching the
 C++ implementation instead of the SSE2 implementation.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 src/OpenColorIO/SSE.h                | 29 ++++++++++++++++++++++++++++
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 20 ++++++++++++++-----
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 71b7faf6f9..567c5a1c64 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -22,6 +22,35 @@
 namespace OCIO_NAMESPACE
 {
 
+// Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since
+// it is redefining two of the functions from sse2neon.
+#ifdef USE_SSE2NEON
+    // Overwrite the translation of _mm_max_ps and _mm_min_ps.
+    // Using vmaxnmq_f32 and vminnmq_f32 instead.
+
+    // Compare packed single-precision (32-bit) floating-point elements in a and b,
+    // and store packed maximum values in dst. dst does not follow the IEEE Standard
+    // for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
+    // signed-zero values.
+    // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
+    static inline __m128 _mm_max_ps(__m128 a, __m128 b)
+    {
+        return vreinterpretq_m128_f32(
+            vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    }
+
+    // Compare packed single-precision (32-bit) floating-point elements in a and b,
+    // and store packed minimum values in dst. dst does not follow the IEEE Standard
+    // for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
+    // signed-zero values.
+    // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
+    static inline __m128 _mm_min_ps(__m128 a, __m128 b)
+    {
+        return vreinterpretq_m128_f32(
+            vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    }
+#endif
+
 // Macros for alignment declarations
 #define OCIO_SIMD_BYTES 16
 #if defined( _MSC_VER )
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 79649a23ae..7ac48ed1f6 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -130,7 +130,9 @@ void TestAntiLog(float logBase)
         OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f));
     }
 #ifdef USE_SSE
-    OCIO_CHECK_EQUAL(rgba[8], inf);
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_EQUAL(rgba[8], inf);
+    #endif
 #else
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
 #endif
@@ -138,7 +140,9 @@ void TestAntiLog(float logBase)
     OCIO_CHECK_CLOSE(rgba[12], 1.0f, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 #ifdef USE_SSE
-    OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
+    #endif
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
@@ -269,7 +273,9 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     const float res0 = ComputeLog2LinEval(0.0f, redP);
 
 #ifdef USE_SSE
-    OCIO_CHECK_EQUAL(rgba[8], inf);
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_EQUAL(rgba[8], inf);
+    #endif
 #else
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
 #endif
@@ -280,7 +286,9 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
 #ifdef USE_SSE
-    OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
+    #endif
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
@@ -579,7 +587,9 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO_CHECK_CLOSE(rgba[6],  1.16f, 10.0f * error);
     OCIO_CHECK_EQUAL(rgba[8], -inf);
 #ifdef USE_SSE
-    OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
+    #endif
 #else
     OCIO_CHECK_EQUAL(rgba[9], inf);
 #endif

From 421459e867f23bcb4628b5e573643215659c89f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 10 Feb 2023 14:36:26 -0500
Subject: [PATCH 03/81] Changed how we handle sse2neon to match other
 dependencies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                                |   11 +-
 ext/sse2neon/CMakeLists.txt                   |    9 -
 ext/sse2neon/src/include/sse2neon.h           | 9079 -----------------
 share/cmake/modules/Findsse2neon.cmake        |   58 +
 .../modules/install/Installsse2neon.cmake     |   36 +
 src/OpenColorIO/CMakeLists.txt                |    2 +-
 6 files changed, 105 insertions(+), 9090 deletions(-)
 delete mode 100644 ext/sse2neon/CMakeLists.txt
 delete mode 100644 ext/sse2neon/src/include/sse2neon.h
 create mode 100644 share/cmake/modules/Findsse2neon.cmake
 create mode 100644 share/cmake/modules/install/Installsse2neon.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 190afe7426..7a2c1304a1 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,6 +12,7 @@ set(CMAKE_MODULE_PATH
     ${CMAKE_SOURCE_DIR}/share/cmake/utils
     ${CMAKE_SOURCE_DIR}/share/cmake/macros
     ${CMAKE_SOURCE_DIR}/share/cmake/modules
+    ${CMAKE_SOURCE_DIR}/share/cmake/modules/install
 )
 
 set(CMAKE_WARN_DEPRECATED ON)
@@ -168,7 +169,15 @@ include(CheckSupportARMNeon)
 # Add sse2neon to the build since CompilerFlags needs to know if SSE2 is supported.
 
 if(HAVE_NEON)
-    add_subdirectory(ext/sse2neon)
+    # Install sse2nenon. Please note that sse2neon is downloaded during the configure step as it is
+    # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
+
+    # Could use ocio_handle_dependency once this is rebased from main (once the CMake min and 
+    # recommended version branch is merged).
+    find_package(sse2neon QUIET)
+    if (NOT sse2neon_FOUND)
+        include(Installsse2neon)
+    endif()
 endif()
 
 
diff --git a/ext/sse2neon/CMakeLists.txt b/ext/sse2neon/CMakeLists.txt
deleted file mode 100644
index 7fd5fcb302..0000000000
--- a/ext/sse2neon/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright Contributors to the OpenColorIO Project.
-
-# sse2neon (modified)
-# https://github.com/DLTcollab/sse2neon
-add_library(sse2neon INTERFACE IMPORTED GLOBAL)
-set_target_properties(sse2neon PROPERTIES
-    INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/src/include"
-)
diff --git a/ext/sse2neon/src/include/sse2neon.h b/ext/sse2neon/src/include/sse2neon.h
deleted file mode 100644
index 164c6c3387..0000000000
--- a/ext/sse2neon/src/include/sse2neon.h
+++ /dev/null
@@ -1,9079 +0,0 @@
-#ifndef SSE2NEON_H
-#define SSE2NEON_H
-
-// This header file provides a simple API translation layer
-// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
-//
-// Contributors to this work are:
-//   John W. Ratcliff <jratcliffscarab@gmail.com>
-//   Brandon Rowlett <browlett@nvidia.com>
-//   Ken Fast <kfast@gdeb.com>
-//   Eric van Beurden <evanbeurden@nvidia.com>
-//   Alexander Potylitsin <apotylitsin@nvidia.com>
-//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
-//   Jim Huang <jserv@ccns.ncku.edu.tw>
-//   Mark Cheng <marktwtn@gmail.com>
-//   Malcolm James MacLeod <malcolm@gulden.com>
-//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
-//   Sebastian Pop <spop@amazon.com>
-//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
-//   Danila Kutenin <danilak@google.com>
-//   François Turban (JishinMaster) <francois.turban@gmail.com>
-//   Pei-Hsuan Hung <afcidk@gmail.com>
-//   Yang-Hao Yuan <yuanyanghau@gmail.com>
-//   Syoyo Fujita <syoyo@lighttransport.com>
-//   Brecht Van Lommel <brecht@blender.org>
-//   Jonathan Hue <jhue@adobe.com>
-//   Cuda Chen <clh960524@gmail.com>
-//   Aymen Qader <aymen.qader@arm.com>
-
-/*
- * sse2neon is freely redistributable under the MIT License.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/* Tunable configurations */
-
-/* Enable precise implementation of math operations
- * This would slow down the computation a bit, but gives consistent result with
- * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
- */
-/* _mm_min|max_ps|ss|pd|sd */
-#ifndef SSE2NEON_PRECISE_MINMAX
-#define SSE2NEON_PRECISE_MINMAX (0)
-#endif
-/* _mm_rcp_ps and _mm_div_ps */
-#ifndef SSE2NEON_PRECISE_DIV
-#define SSE2NEON_PRECISE_DIV (0)
-#endif
-/* _mm_sqrt_ps and _mm_rsqrt_ps */
-#ifndef SSE2NEON_PRECISE_SQRT
-#define SSE2NEON_PRECISE_SQRT (0)
-#endif
-/* _mm_dp_pd */
-#ifndef SSE2NEON_PRECISE_DP
-#define SSE2NEON_PRECISE_DP (0)
-#endif
-
-/* compiler specific definitions */
-#if defined(__GNUC__) || defined(__clang__)
-#pragma push_macro("FORCE_INLINE")
-#pragma push_macro("ALIGN_STRUCT")
-#define FORCE_INLINE static inline __attribute__((always_inline))
-#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
-#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
-#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
-#else /* non-GNU / non-clang compilers */
-#warning "Macro name collisions may happen with unsupported compiler."
-#ifndef FORCE_INLINE
-#define FORCE_INLINE static inline
-#endif
-#ifndef ALIGN_STRUCT
-#define ALIGN_STRUCT(x) __declspec(align(x))
-#endif
-#define _sse2neon_likely(x) (x)
-#define _sse2neon_unlikely(x) (x)
-#endif
-
-/* C language does not allow initializing a variable with a function call. */
-#ifdef __cplusplus
-#define _sse2neon_const static const
-#else
-#define _sse2neon_const const
-#endif
-
-#include <stdint.h>
-#include <stdlib.h>
-
-#if defined(_WIN32)
-/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
- * from both MinGW-w64 and MSVC.
- */
-#define SSE2NEON_ALLOC_DEFINED
-#endif
-
-/* If using MSVC */
-#ifdef _MSC_VER
-#include <intrin.h>
-#if (defined(_M_AMD64) || defined(__x86_64__)) || \
-    (defined(_M_ARM) || defined(__arm__))
-#define SSE2NEON_HAS_BITSCAN64
-#endif
-#endif
-
-/* Compiler barrier */
-#define SSE2NEON_BARRIER()                     \
-    do {                                       \
-        __asm__ __volatile__("" ::: "memory"); \
-        (void) 0;                              \
-    } while (0)
-
-/* Memory barriers
- * __atomic_thread_fence does not include a compiler barrier; instead,
- * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
- * semantics.
- */
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
-#include <stdatomic.h>
-#endif
-
-FORCE_INLINE void _sse2neon_smp_mb(void)
-{
-    SSE2NEON_BARRIER();
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
-    !defined(__STDC_NO_ATOMICS__)
-    atomic_thread_fence(memory_order_seq_cst);
-#elif defined(__GNUC__) || defined(__clang__)
-    __atomic_thread_fence(__ATOMIC_SEQ_CST);
-#else
-    /* FIXME: MSVC support */
-#endif
-}
-
-/* Architecture-specific build options */
-/* FIXME: #pragma GCC push_options is only available on GCC */
-#if defined(__GNUC__)
-#if defined(__arm__) && __ARM_ARCH == 7
-/* According to ARM C Language Extensions Architecture specification,
- * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
- * architecture supported.
- */
-#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
-#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
-#endif
-#if !defined(__clang__)
-#pragma GCC push_options
-#pragma GCC target("fpu=neon")
-#endif
-#elif defined(__aarch64__)
-#if !defined(__clang__)
-#pragma GCC push_options
-#pragma GCC target("+simd")
-#endif
-#elif __ARM_ARCH == 8
-#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
-#error \
-    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
-#endif
-#if !defined(__clang__)
-#pragma GCC push_options
-#endif
-#else
-#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
-#endif
-#endif
-
-#include <arm_neon.h>
-#if !defined(__aarch64__) && (__ARM_ARCH == 8)
-#if defined __has_include && __has_include(<arm_acle.h>)
-#include <arm_acle.h>
-#endif
-#endif
-
-/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD
- * and other Arm microarchtectures use.
- * From sysctl -a on Apple M1:
- * hw.cachelinesize: 128
- */
-#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
-#define SSE2NEON_CACHELINE_SIZE 128
-#else
-#define SSE2NEON_CACHELINE_SIZE 64
-#endif
-
-/* Rounding functions require either Aarch64 instructions or libm failback */
-#if !defined(__aarch64__)
-#include <math.h>
-#endif
-
-/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
- * or even not accessible in user mode.
- * To write or access to these registers in user mode,
- * we have to perform syscall instead.
- */
-#if !defined(__aarch64__)
-#include <sys/time.h>
-#endif
-
-/* "__has_builtin" can be used to query support for built-in functions
- * provided by gcc/clang and other compilers that support it.
- */
-#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
-/* Compatibility with gcc <= 9 */
-#if defined(__GNUC__) && (__GNUC__ <= 9)
-#define __has_builtin(x) HAS##x
-#define HAS__builtin_popcount 1
-#define HAS__builtin_popcountll 1
-
-// __builtin_shuffle introduced in GCC 4.7.0
-#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
-#define HAS__builtin_shuffle 1
-#else
-#define HAS__builtin_shuffle 0
-#endif
-
-#define HAS__builtin_shufflevector 0
-#define HAS__builtin_nontemporal_store 0
-#else
-#define __has_builtin(x) 0
-#endif
-#endif
-
-/**
- * MACRO for shuffle parameter for _mm_shuffle_ps().
- * Argument fp3 is a digit[0123] that represents the fp from argument "b"
- * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
- * for fp2 in result. fp1 is a digit[0123] that represents the fp from
- * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
- * fp0 is the same for fp0 of result.
- */
-#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
-    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
-
-#if __has_builtin(__builtin_shufflevector)
-#define _sse2neon_shuffle(type, a, b, ...) \
-    __builtin_shufflevector(a, b, __VA_ARGS__)
-#elif __has_builtin(__builtin_shuffle)
-#define _sse2neon_shuffle(type, a, b, ...) \
-    __extension__({                        \
-        type tmp = {__VA_ARGS__};          \
-        __builtin_shuffle(a, b, tmp);      \
-    })
-#endif
-
-#ifdef _sse2neon_shuffle
-#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
-#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
-#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
-#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
-#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
-#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
-#endif
-
-/* Rounding mode macros. */
-#define _MM_FROUND_TO_NEAREST_INT 0x00
-#define _MM_FROUND_TO_NEG_INF 0x01
-#define _MM_FROUND_TO_POS_INF 0x02
-#define _MM_FROUND_TO_ZERO 0x03
-#define _MM_FROUND_CUR_DIRECTION 0x04
-#define _MM_FROUND_NO_EXC 0x08
-#define _MM_FROUND_RAISE_EXC 0x00
-#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
-#define _MM_ROUND_NEAREST 0x0000
-#define _MM_ROUND_DOWN 0x2000
-#define _MM_ROUND_UP 0x4000
-#define _MM_ROUND_TOWARD_ZERO 0x6000
-/* Flush zero mode macros. */
-#define _MM_FLUSH_ZERO_MASK 0x8000
-#define _MM_FLUSH_ZERO_ON 0x8000
-#define _MM_FLUSH_ZERO_OFF 0x0000
-/* Denormals are zeros mode macros. */
-#define _MM_DENORMALS_ZERO_MASK 0x0040
-#define _MM_DENORMALS_ZERO_ON 0x0040
-#define _MM_DENORMALS_ZERO_OFF 0x0000
-
-/* indicate immediate constant argument in a given range */
-#define __constrange(a, b) const
-
-/* A few intrinsics accept traditional data types like ints or floats, but
- * most operate on data types that are specific to SSE.
- * If a vector type ends in d, it contains doubles, and if it does not have
- * a suffix, it contains floats. An integer vector type can contain any type
- * of integer, from chars to shorts to unsigned long longs.
- */
-typedef int64x1_t __m64;
-typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
-// On ARM 32-bit architecture, the float64x2_t is not supported.
-// The data type __m128d should be represented in a different way for related
-// intrinsic conversion.
-#if defined(__aarch64__)
-typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
-#else
-typedef float32x4_t __m128d;
-#endif
-typedef int64x2_t __m128i; /* 128-bit vector containing integers */
-
-// __int64 is defined in the Intrinsics Guide which maps to different datatype
-// in different data model
-#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
-#if (defined(__x86_64__) || defined(__i386__))
-#define __int64 long long
-#else
-#define __int64 int64_t
-#endif
-#endif
-
-/* type-safe casting between types */
-
-#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
-#define vreinterpretq_m128_f32(x) (x)
-#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
-
-#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
-#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
-#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
-#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
-
-#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
-#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
-#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
-
-#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
-#define vreinterpretq_f32_m128(x) (x)
-#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
-
-#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
-#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
-#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
-#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
-
-#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
-#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
-#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
-#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
-
-#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
-#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
-#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
-#define vreinterpretq_m128i_s64(x) (x)
-
-#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
-#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
-#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
-#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
-
-#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
-#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
-
-#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
-#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
-#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
-#define vreinterpretq_s64_m128i(x) (x)
-
-#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
-#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
-#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
-#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
-
-#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
-#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
-#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
-#define vreinterpret_m64_s64(x) (x)
-
-#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
-#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
-#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
-#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
-
-#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
-#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
-#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
-
-#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
-#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
-#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
-#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
-
-#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
-#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
-#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
-#define vreinterpret_s64_m64(x) (x)
-
-#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
-
-#if defined(__aarch64__)
-#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
-#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
-
-#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
-
-#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
-#define vreinterpretq_m128d_f64(x) (x)
-
-#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
-
-#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
-#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
-
-#define vreinterpretq_f64_m128d(x) (x)
-#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
-#else
-#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
-
-#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
-#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
-
-#define vreinterpretq_m128d_f32(x) (x)
-
-#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
-
-#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
-#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
-
-#define vreinterpretq_f32_m128d(x) (x)
-#endif
-
-// A struct is defined in this header file called 'SIMDVec' which can be used
-// by applications which attempt to access the contents of an __m128 struct
-// directly.  It is important to note that accessing the __m128 struct directly
-// is bad coding practice by Microsoft: @see:
-// https://learn.microsoft.com/en-us/cpp/cpp/m128
-//
-// However, some legacy source code may try to access the contents of an __m128
-// struct directly so the developer can use the SIMDVec as an alias for it.  Any
-// casting must be done manually by the developer, as you cannot cast or
-// otherwise alias the base NEON data type for intrinsic operations.
-//
-// union intended to allow direct access to an __m128 variable using the names
-// that the MSVC compiler provides.  This union should really only be used when
-// trying to access the members of the vector as integer values.  GCC/clang
-// allow native access to the float members through a simple array access
-// operator (in C since 4.6, in C++ since 4.8).
-//
-// Ideally direct accesses to SIMD vectors should not be used since it can cause
-// a performance hit.  If it really is needed however, the original __m128
-// variable can be aliased with a pointer to this union and used to access
-// individual components.  The use of this union should be hidden behind a macro
-// that is used throughout the codebase to access the members instead of always
-// declaring this type of variable.
-typedef union ALIGN_STRUCT(16) SIMDVec {
-    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
-    int8_t m128_i8[16];    // as signed 8-bit integers.
-    int16_t m128_i16[8];   // as signed 16-bit integers.
-    int32_t m128_i32[4];   // as signed 32-bit integers.
-    int64_t m128_i64[2];   // as signed 64-bit integers.
-    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
-    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
-    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
-    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
-} SIMDVec;
-
-// casting using SIMDVec
-#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
-#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
-#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
-
-/* SSE macros */
-#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
-#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
-#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
-#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
-
-// Function declaration
-// SSE
-FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
-FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
-FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
-FORCE_INLINE __m128 _mm_set_ps1(float);
-FORCE_INLINE __m128 _mm_setzero_ps(void);
-// SSE2
-FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
-FORCE_INLINE __m128i _mm_castps_si128(__m128);
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
-FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
-FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
-FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
-FORCE_INLINE __m128d _mm_set_pd(double, double);
-FORCE_INLINE __m128i _mm_set1_epi32(int);
-FORCE_INLINE __m128i _mm_setzero_si128();
-// SSE4.1
-FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
-FORCE_INLINE __m128 _mm_ceil_ps(__m128);
-FORCE_INLINE __m128d _mm_floor_pd(__m128d);
-FORCE_INLINE __m128 _mm_floor_ps(__m128);
-FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
-FORCE_INLINE __m128 _mm_round_ps(__m128, int);
-// SSE4.2
-FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
-
-/* Backwards compatibility for compilers with lack of specific type support */
-
-// Older gcc does not define vld1q_u8_x4 type
-#if defined(__GNUC__) && !defined(__clang__) &&                        \
-    ((__GNUC__ <= 12 && defined(__arm__)) ||                           \
-     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
-     (__GNUC__ <= 9 && defined(__aarch64__)))
-FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
-{
-    uint8x16x4_t ret;
-    ret.val[0] = vld1q_u8(p + 0);
-    ret.val[1] = vld1q_u8(p + 16);
-    ret.val[2] = vld1q_u8(p + 32);
-    ret.val[3] = vld1q_u8(p + 48);
-    return ret;
-}
-#else
-// Wraps vld1q_u8_x4
-FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
-{
-    return vld1q_u8_x4(p);
-}
-#endif
-
-#if !defined(__aarch64__)
-/* emulate vaddv u8 variant */
-FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
-{
-    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
-    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
-}
-#else
-// Wraps vaddv_u8
-FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
-{
-    return vaddv_u8(v8);
-}
-#endif
-
-#if !defined(__aarch64__)
-/* emulate vaddvq u8 variant */
-FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
-{
-    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
-    uint8_t res = 0;
-    for (int i = 0; i < 8; ++i)
-        res += tmp[i];
-    return res;
-}
-#else
-// Wraps vaddvq_u8
-FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
-{
-    return vaddvq_u8(a);
-}
-#endif
-
-#if !defined(__aarch64__)
-/* emulate vaddvq u16 variant */
-FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
-{
-    uint32x4_t m = vpaddlq_u16(a);
-    uint64x2_t n = vpaddlq_u32(m);
-    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
-
-    return vget_lane_u32((uint32x2_t) o, 0);
-}
-#else
-// Wraps vaddvq_u16
-FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
-{
-    return vaddvq_u16(a);
-}
-#endif
-
-/* Function Naming Conventions
- * The naming convention of SSE intrinsics is straightforward. A generic SSE
- * intrinsic function is given as follows:
- *   _mm_<name>_<data_type>
- *
- * The parts of this format are given as follows:
- * 1. <name> describes the operation performed by the intrinsic
- * 2. <data_type> identifies the data type of the function's primary arguments
- *
- * This last part, <data_type>, is a little complicated. It identifies the
- * content of the input values, and can be set to any of the following values:
- * + ps - vectors contain floats (ps stands for packed single-precision)
- * + pd - vectors cantain doubles (pd stands for packed double-precision)
- * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- *                            signed integers
- * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- *                            unsigned integers
- * + si128 - unspecified 128-bit vector or 256-bit vector
- * + m128/m128i/m128d - identifies input vector types when they are different
- *                      than the type of the returned vector
- *
- * For example, _mm_setzero_ps. The _mm implies that the function returns
- * a 128-bit vector. The _ps at the end implies that the argument vectors
- * contain floats.
- *
- * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
- *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
- *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
- *   // Set packed 8-bit integers
- *   // 128 bits, 16 chars, per 8 bits
- *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
- *                                  4, 5, 12, 13, 6, 7, 14, 15);
- *   // Shuffle packed 8-bit integers
- *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
- */
-
-/* Constants for use with _mm_prefetch. */
-enum _mm_hint {
-    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
-    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
-    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
-    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
-};
-
-// The bit field mapping to the FPCR(floating-point control register)
-typedef struct {
-    uint16_t res0;
-    uint8_t res1 : 6;
-    uint8_t bit22 : 1;
-    uint8_t bit23 : 1;
-    uint8_t bit24 : 1;
-    uint8_t res2 : 7;
-#if defined(__aarch64__)
-    uint32_t res3;
-#endif
-} fpcr_bitfield;
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of b and places it into the high end of the result.
-FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
-{
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in high
-// end of result takes the higher two 32 bit values from b and swaps them and
-// places in low end of result.
-FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
-{
-    float32x2_t a21 = vget_high_f32(
-        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b03 = vget_low_f32(
-        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
-{
-    float32x2_t a03 = vget_low_f32(
-        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b21 = vget_high_f32(
-        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
-}
-
-// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
-// high
-FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
-{
-    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
-{
-    float32x2_t a22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
-{
-    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
-{
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32x2_t a22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
-{
-    float32x2_t a33 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
-{
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
-}
-
-// Kahan summation for accurate summation of floating-point numbers.
-// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
-FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
-{
-    y -= *c;
-    float t = *sum + y;
-    *c = (t - *sum) - y;
-    *sum = t;
-}
-
-#if defined(__ARM_FEATURE_CRYPTO) && \
-    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
-// Wraps vmull_p64
-FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
-    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
-    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
-    return vreinterpretq_u64_p128(vmull_p64(a, b));
-}
-#else  // ARMv7 polyfill
-// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
-//
-// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
-// 64-bit->128-bit polynomial multiply.
-//
-// It needs some work and is somewhat slow, but it is still faster than all
-// known scalar methods.
-//
-// Algorithm adapted to C from
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
-// from "Fast Software Polynomial Multiplication on ARM Processors Using the
-// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
-// (https://hal.inria.fr/hal-01506572)
-static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
-    poly8x8_t a = vreinterpret_p8_u64(_a);
-    poly8x8_t b = vreinterpret_p8_u64(_b);
-
-    // Masks
-    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
-                                    vcreate_u8(0x00000000ffffffff));
-    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
-                                    vcreate_u8(0x0000000000000000));
-
-    // Do the multiplies, rotating with vext to get all combinations
-    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
-    uint8x16_t e =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
-    uint8x16_t f =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
-    uint8x16_t g =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
-    uint8x16_t h =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
-    uint8x16_t i =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
-    uint8x16_t j =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
-    uint8x16_t k =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
-
-    // Add cross products
-    uint8x16_t l = veorq_u8(e, f);  // L = E + F
-    uint8x16_t m = veorq_u8(g, h);  // M = G + H
-    uint8x16_t n = veorq_u8(i, j);  // N = I + J
-
-    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
-    // instructions.
-#if defined(__aarch64__)
-    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
-        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
-        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
-        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
-        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-#else
-    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
-    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
-    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
-    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
-#endif
-    // t0 = (L) (P0 + P1) << 8
-    // t1 = (M) (P2 + P3) << 16
-    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
-    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
-    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
-
-    // t2 = (N) (P4 + P5) << 24
-    // t3 = (K) (P6 + P7) << 32
-    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
-    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
-    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
-
-    // De-interleave
-#if defined(__aarch64__)
-    uint8x16_t t0 = vreinterpretq_u8_u64(
-        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-    uint8x16_t t1 = vreinterpretq_u8_u64(
-        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-    uint8x16_t t2 = vreinterpretq_u8_u64(
-        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-    uint8x16_t t3 = vreinterpretq_u8_u64(
-        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-#else
-    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
-    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
-    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
-    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
-#endif
-    // Shift the cross products
-    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
-    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
-    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
-    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
-
-    // Accumulate the products
-    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
-    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
-    uint8x16_t mix = veorq_u8(d, cross1);
-    uint8x16_t r = veorq_u8(mix, cross2);
-    return vreinterpretq_u64_u8(r);
-}
-#endif  // ARMv7 polyfill
-
-// C equivalent:
-//   __m128i _mm_shuffle_epi32_default(__m128i a,
-//                                     __constrange(0, 255) int imm) {
-//       __m128i ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
-//       return ret;
-//   }
-#define _mm_shuffle_epi32_default(a, imm)                                   \
-    __extension__({                                                         \
-        int32x4_t ret;                                                      \
-        ret = vmovq_n_s32(                                                  \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
-            ret, 1);                                                        \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
-            ret, 2);                                                        \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
-            ret, 3);                                                        \
-        vreinterpretq_m128i_s32(ret);                                       \
-    })
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of a and places it into the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
-{
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in low end
-// of result takes the higher two 32 bit values from a and swaps them and places
-// in high end of result.
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
-}
-
-// rotates the least significant 32 bits into the most significant 32 bits, and
-// shifts the rest down
-FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
-{
-    return vreinterpretq_m128i_s32(
-        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
-}
-
-// rotates the most significant 32 bits into the least significant 32 bits, and
-// shifts the rest up
-FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
-{
-    return vreinterpretq_m128i_s32(
-        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
-}
-
-// gets the lower 64 bits of a, and places it in the upper 64 bits
-// gets the lower 64 bits of a and places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
-{
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
-// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
-// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
-// places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
-{
-    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
-{
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
-{
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
-}
-
-// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
-// int imm)
-#if defined(__aarch64__)
-#define _mm_shuffle_epi32_splat(a, imm)                          \
-    __extension__({                                              \
-        vreinterpretq_m128i_s32(                                 \
-            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
-    })
-#else
-#define _mm_shuffle_epi32_splat(a, imm)                                      \
-    __extension__({                                                          \
-        vreinterpretq_m128i_s32(                                             \
-            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
-    })
-#endif
-
-// NEON does not support a general purpose permute intrinsic.
-// Shuffle single-precision (32-bit) floating-point elements in a using the
-// control in imm8, and store the results in dst.
-//
-// C equivalent:
-//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
-//                                 __constrange(0, 255) int imm) {
-//       __m128 ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
-//       return ret;
-//   }
-//
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
-#define _mm_shuffle_ps_default(a, b, imm)                                  \
-    __extension__({                                                        \
-        float32x4_t ret;                                                   \
-        ret = vmovq_n_f32(                                                 \
-            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
-            ret, 1);                                                       \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
-            ret, 2);                                                       \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
-            ret, 3);                                                       \
-        vreinterpretq_m128_f32(ret);                                       \
-    })
-
-// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
-// Store the results in the low 64 bits of dst, with the high 64 bits being
-// copied from from a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
-#define _mm_shufflelo_epi16_function(a, imm)                                  \
-    __extension__({                                                           \
-        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
-        int16x4_t lowBits = vget_low_s16(ret);                                \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
-                             1);                                              \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
-                             2);                                              \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
-                             3);                                              \
-        vreinterpretq_m128i_s16(ret);                                         \
-    })
-
-// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
-// Store the results in the high 64 bits of dst, with the low 64 bits being
-// copied from from a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
-#define _mm_shufflehi_epi16_function(a, imm)                                   \
-    __extension__({                                                            \
-        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
-        int16x4_t highBits = vget_high_s16(ret);                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
-                             5);                                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
-                             6);                                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
-                             7);                                               \
-        vreinterpretq_m128i_s16(ret);                                          \
-    })
-
-/* MMX */
-
-//_mm_empty is a no-op on arm
-FORCE_INLINE void _mm_empty(void) {}
-
-/* SSE */
-
-// Add packed single-precision (32-bit) floating-point elements in a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
-FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Add the lower single-precision (32-bit) floating-point element in a and b,
-// store the result in the lower element of dst, and copy the upper 3 packed
-// elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
-FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
-{
-    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
-    // the upper values in the result must be the remnants of <a>.
-    return vreinterpretq_m128_f32(vaddq_f32(a, value));
-}
-
-// Compute the bitwise AND of packed single-precision (32-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
-FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
-// elements in a and then AND with b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
-FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        vbicq_s32(vreinterpretq_s32_m128(b),
-                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
-}
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
-FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u16(
-        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
-FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u8(
-        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for equality, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
-FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for equality, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
-FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
-FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for greater-than-or-equal, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
-FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
-FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for greater-than, store the result in the lower element of dst, and copy
-// the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
-FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
-FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for less-than-or-equal, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
-FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmple_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
-FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for less-than, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
-FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
-FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-equal, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
-FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
-FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-greater-than-or-equal, store the result in the lower element of
-// dst, and copy the upper 3 packed elements from a to the upper elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
-FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
-FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-greater-than, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
-FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
-FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-less-than-or-equal, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
-FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
-FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-less-than, store the result in the lower element of dst, and copy
-// the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
-FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// to see if neither is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
-//
-// See also:
-// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
-// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
-FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
-{
-    // Note: NEON does not have ordered compare builtin
-    // Need to compare a eq a and b eq b to check for NaN
-    // Do AND of results to get final
-    uint32x4_t ceqaa =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t ceqbb =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b to see if neither is NaN, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
-FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// to see if either is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
-FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
-{
-    uint32x4_t f32a =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t f32b =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b to see if either is NaN, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
-FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for equality, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
-FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_eq_b =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_eq_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for greater-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
-FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_ge_b =
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_ge_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for greater-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
-FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_gt_b =
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_gt_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for less-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
-FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_le_b =
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_le_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for less-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
-FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_lt_b =
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_lt_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for not-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
-FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
-{
-    return !_mm_comieq_ss(a, b);
-}
-
-// Convert packed signed 32-bit integers in b to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, and copy the upper 2 packed elements from a to the upper elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
-FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
-                     vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
-FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vreinterpret_m64_s32(
-        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
-#else
-    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
-        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
-#endif
-}
-
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
-FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
-{
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
-FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
-                          0);
-#else
-    float32_t data = vgetq_lane_f32(
-        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
-    return (int32_t) data;
-#endif
-}
-
-// Convert packed 16-bit integers in a to packed single-precision (32-bit)
-// floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
-FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(
-        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
-}
-
-// Convert packed 32-bit integers in b to packed single-precision (32-bit)
-// floating-point elements, store the results in the lower 2 elements of dst,
-// and copy the upper 2 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
-FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
-                     vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert packed signed 32-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, then convert the packed signed 32-bit integers in b to
-// single-precision (32-bit) floating-point element, and store the results in
-// the upper 2 elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
-FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(
-        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
-}
-
-// Convert the lower packed 8-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
-FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(
-        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 16-bit integers, and store the results in dst. Note: this intrinsic
-// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
-// 0x7FFFFFFF.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
-FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
-{
-    return vreinterpret_m64_s16(
-        vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
-#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 8-bit integers, and store the results in lower 4 elements of dst.
-// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
-// between 0x7F and 0x7FFFFFFF.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
-FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
-{
-    return vreinterpret_m64_s8(vqmovn_s16(
-        vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
-}
-
-// Convert packed unsigned 16-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
-FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(
-        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
-}
-
-// Convert the lower packed unsigned 8-bit integers in a to packed
-// single-precision (32-bit) floating-point elements, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
-FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_u32(
-        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
-}
-
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
-#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
-
-// Convert the signed 64-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
-FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
-{
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
-}
-
-// Copy the lower single-precision (32-bit) floating-point element of a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
-FORCE_INLINE float _mm_cvtss_f32(__m128 a)
-{
-    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
-#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
-FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
-#else
-    float32_t data = vgetq_lane_f32(
-        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
-    return (int64_t) data;
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
-FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
-{
-    return vreinterpret_m64_s32(
-        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
-FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
-{
-    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
-#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
-#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
-FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
-{
-    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-}
-
-// Divide packed single-precision (32-bit) floating-point elements in a by
-// packed elements in b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
-FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
-    return vreinterpretq_m128_f32(
-        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
-#if SSE2NEON_PRECISE_DIV
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
-#endif
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
-#endif
-}
-
-// Divide the lower single-precision (32-bit) floating-point element in a by the
-// lower single-precision (32-bit) floating-point element in b, store the result
-// in the lower element of dst, and copy the upper 3 packed elements from a to
-// the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
-FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-{
-    float32_t value =
-        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Extract a 16-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
-#define _mm_extract_pi16(a, imm) \
-    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
-
-// Free aligned memory that was allocated with _mm_malloc.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
-#if !defined(SSE2NEON_ALLOC_DEFINED)
-FORCE_INLINE void _mm_free(void *addr)
-{
-    free(addr);
-}
-#endif
-
-// Macro: Get the flush zero bits from the MXCSR control and status register.
-// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
-// _MM_FLUSH_ZERO_OFF
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
-FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
-}
-
-// Macro: Get the rounding mode bits from the MXCSR control and status register.
-// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
-// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
-FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    if (r.field.bit22) {
-        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
-    } else {
-        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
-    }
-}
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
-#define _mm_insert_pi16(a, b, imm)                               \
-    __extension__({                                              \
-        vreinterpret_m64_s16(                                    \
-            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
-    })
-
-// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
-FORCE_INLINE __m128 _mm_load_ps(const float *p)
-{
-    return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Load a single-precision (32-bit) floating-point element from memory into all
-// elements of dst.
-//
-//   dst[31:0] := MEM[mem_addr+31:mem_addr]
-//   dst[63:32] := MEM[mem_addr+31:mem_addr]
-//   dst[95:64] := MEM[mem_addr+31:mem_addr]
-//   dst[127:96] := MEM[mem_addr+31:mem_addr]
-//
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
-#define _mm_load_ps1 _mm_load1_ps
-
-// Load a single-precision (32-bit) floating-point element from memory into the
-// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
-// aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
-FORCE_INLINE __m128 _mm_load_ss(const float *p)
-{
-    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
-}
-
-// Load a single-precision (32-bit) floating-point element from memory into all
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
-FORCE_INLINE __m128 _mm_load1_ps(const float *p)
-{
-    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
-}
-
-// Load 2 single-precision (32-bit) floating-point elements from memory into the
-// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
-// mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
-FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
-}
-
-// Load 2 single-precision (32-bit) floating-point elements from memory into the
-// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
-// mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
-FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
-}
-
-// Load 4 single-precision (32-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
-FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
-{
-    float32x4_t v = vrev64q_f32(vld1q_f32(p));
-    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
-}
-
-// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from memory into dst. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
-FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
-{
-    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
-    // equivalent for neon
-    return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Load unaligned 16-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
-FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
-{
-    return vreinterpretq_m128i_s16(
-        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
-}
-
-// Load unaligned 64-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
-FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
-{
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
-}
-
-// Allocate size bytes of memory, aligned to the alignment specified in align,
-// and return a pointer to the allocated memory. _mm_free should be used to free
-// memory that is allocated with _mm_malloc.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
-#if !defined(SSE2NEON_ALLOC_DEFINED)
-FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
-{
-    void *ptr;
-    if (align == 1)
-        return malloc(size);
-    if (align == 2 || (sizeof(void *) == 8 && align == 4))
-        align = sizeof(void *);
-    if (!posix_memalign(&ptr, align, size))
-        return ptr;
-    return NULL;
-}
-#endif
-
-// Conditionally store 8-bit integer elements from a into memory using mask
-// (elements are not stored when the highest bit is not set in the corresponding
-// element) and a non-temporal memory hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
-FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
-{
-    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
-    __m128 b = _mm_load_ps((const float *) mem_addr);
-    int8x8_t masked =
-        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
-                vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
-    vst1_s8((int8_t *) mem_addr, masked);
-}
-
-// Conditionally store 8-bit integer elements from a into memory using mask
-// (elements are not stored when the highest bit is not set in the corresponding
-// element) and a non-temporal memory hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
-#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
-FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s16(
-        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b,
-// and store packed maximum values in dst. dst does not follow the IEEE Standard
-// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
-// signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
-FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
-{
-#if SSE2NEON_PRECISE_MINMAX
-    float32x4_t _a = vreinterpretq_f32_m128(a);
-    float32x4_t _b = vreinterpretq_f32_m128(b);
-    return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128_f32(
-        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
-FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u8(
-        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b, store the maximum value in the lower element of dst, and copy the upper 3
-// packed elements from a to the upper element of dst. dst does not follow the
-// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
-// inputs are NaN or signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
-FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
-FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s16(
-        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b,
-// and store packed minimum values in dst. dst does not follow the IEEE Standard
-// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
-// signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
-FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
-{
-#if SSE2NEON_PRECISE_MINMAX
-    float32x4_t _a = vreinterpretq_f32_m128(a);
-    float32x4_t _b = vreinterpretq_f32_m128(b);
-    return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128_f32(
-        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
-FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u8(
-        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b, store the minimum value in the lower element of dst, and copy the upper 3
-// packed elements from a to the upper element of dst. dst does not follow the
-// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
-// inputs are NaN or signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
-FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Move the lower single-precision (32-bit) floating-point element from b to the
-// lower element of dst, and copy the upper 3 packed elements from a to the
-// upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
-FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
-                       vreinterpretq_f32_m128(a), 0));
-}
-
-// Move the upper 2 single-precision (32-bit) floating-point elements from b to
-// the lower 2 elements of dst, and copy the upper 2 elements from a to the
-// upper 2 elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
-FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
-{
-#if defined(aarch64__)
-    return vreinterpretq_m128_u64(
-        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
-#else
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
-#endif
-}
-
-// Move the lower 2 single-precision (32-bit) floating-point elements from b to
-// the upper 2 elements of dst, and copy the lower 2 elements from a to the
-// lower 2 elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
-FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-// Create mask from the most significant bit of each 8-bit element in a, and
-// store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
-FORCE_INLINE int _mm_movemask_pi8(__m64 a)
-{
-    uint8x8_t input = vreinterpret_u8_m64(a);
-#if defined(__aarch64__)
-    static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-    uint8x8_t tmp = vshr_n_u8(input, 7);
-    return vaddv_u8(vshl_u8(tmp, shift));
-#else
-    // Refer the implementation of `_mm_movemask_epi8`
-    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
-    uint32x2_t paired16 =
-        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
-    uint8x8_t paired32 =
-        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
-    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
-#endif
-}
-
-// Set each bit of mask dst based on the most significant bit of the
-// corresponding packed single-precision (32-bit) floating-point element in a.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
-FORCE_INLINE int _mm_movemask_ps(__m128 a)
-{
-    uint32x4_t input = vreinterpretq_u32_m128(a);
-#if defined(__aarch64__)
-    static const int32x4_t shift = {0, 1, 2, 3};
-    uint32x4_t tmp = vshrq_n_u32(input, 31);
-    return vaddvq_u32(vshlq_u32(tmp, shift));
-#else
-    // Uses the exact same method as _mm_movemask_epi8, see that for details.
-    // Shift out everything but the sign bits with a 32-bit unsigned shift
-    // right.
-    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
-    // Merge the two pairs together with a 64-bit unsigned shift right + add.
-    uint8x16_t paired =
-        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
-    // Extract the result.
-    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
-#endif
-}
-
-// Multiply packed single-precision (32-bit) floating-point elements in a and b,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Multiply the lower single-precision (32-bit) floating-point element in a and
-// b, store the result in the lower element of dst, and copy the upper 3 packed
-// elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
-FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_mul_ps(a, b));
-}
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
-FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u16(vshrn_n_u32(
-        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
-}
-
-// Compute the bitwise OR of packed single-precision (32-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
-FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
-#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
-#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
-
-// Extract a 16-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
-#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
-#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
-#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
-#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
-#define _m_pminsw(a, b) _mm_min_pi16(a, b)
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
-#define _m_pminub(a, b) _mm_min_pu8(a, b)
-
-// Create mask from the most significant bit of each 8-bit element in a, and
-// store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
-#define _m_pmovmskb(a) _mm_movemask_pi8(a)
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
-#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
-
-// Fetch the line of data from memory that contains address p to a location in
-// the cache heirarchy specified by the locality hint i.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
-FORCE_INLINE void _mm_prefetch(char const *p, int i)
-{
-    switch (i) {
-    case _MM_HINT_NTA:
-        __builtin_prefetch(p, 0, 0);
-        break;
-    case _MM_HINT_T0:
-        __builtin_prefetch(p, 0, 3);
-        break;
-    case _MM_HINT_T1:
-        __builtin_prefetch(p, 0, 2);
-        break;
-    case _MM_HINT_T2:
-        __builtin_prefetch(p, 0, 1);
-        break;
-    }
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
-#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
-
-// Shuffle 16-bit integers in a using the control in imm8, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
-#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
-
-// Compute the approximate reciprocal of packed single-precision (32-bit)
-// floating-point elements in a, and store the results in dst. The maximum
-// relative error for this approximation is less than 1.5*2^-12.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
-FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
-{
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-#if SSE2NEON_PRECISE_DIV
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-#endif
-    return vreinterpretq_m128_f32(recip);
-}
-
-// Compute the approximate reciprocal of the lower single-precision (32-bit)
-// floating-point element in a, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst. The
-// maximum relative error for this approximation is less than 1.5*2^-12.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
-FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
-{
-    return _mm_move_ss(a, _mm_rcp_ps(a));
-}
-
-// Compute the approximate reciprocal square root of packed single-precision
-// (32-bit) floating-point elements in a, and store the results in dst. The
-// maximum relative error for this approximation is less than 1.5*2^-12.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
-FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
-{
-    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-#if SSE2NEON_PRECISE_SQRT
-    // Additional Netwon-Raphson iteration for accuracy
-    out = vmulq_f32(
-        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
-    out = vmulq_f32(
-        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
-#endif
-    return vreinterpretq_m128_f32(out);
-}
-
-// Compute the approximate reciprocal square root of the lower single-precision
-// (32-bit) floating-point element in a, store the result in the lower element
-// of dst, and copy the upper 3 packed elements from a to the upper elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
-FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
-{
-    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
-FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
-{
-    uint64x1_t t = vpaddl_u32(vpaddl_u16(
-        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
-    return vreinterpret_m64_u16(
-        vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
-}
-
-// Macro: Set the flush zero bits of the MXCSR control and status register to
-// the value in unsigned 32-bit integer a. The flush zero may contain any of the
-// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
-FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
-{
-    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
-    // regardless of the value of the FZ bit.
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
-}
-
-// Set packed single-precision (32-bit) floating-point elements in dst with the
-// supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
-FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
-{
-    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Broadcast single-precision (32-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
-FORCE_INLINE __m128 _mm_set_ps1(float _w)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Macro: Set the rounding mode bits of the MXCSR control and status register to
-// the value in unsigned 32-bit integer a. The rounding mode may contain any of
-// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
-// _MM_ROUND_TOWARD_ZERO
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
-FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    switch (rounding) {
-    case _MM_ROUND_TOWARD_ZERO:
-        r.field.bit22 = 1;
-        r.field.bit23 = 1;
-        break;
-    case _MM_ROUND_DOWN:
-        r.field.bit22 = 0;
-        r.field.bit23 = 1;
-        break;
-    case _MM_ROUND_UP:
-        r.field.bit22 = 1;
-        r.field.bit23 = 0;
-        break;
-    default:  //_MM_ROUND_NEAREST
-        r.field.bit22 = 0;
-        r.field.bit23 = 0;
-    }
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
-}
-
-// Copy single-precision (32-bit) floating-point element a to the lower element
-// of dst, and zero the upper 3 elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
-FORCE_INLINE __m128 _mm_set_ss(float a)
-{
-    return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
-}
-
-// Broadcast single-precision (32-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
-FORCE_INLINE __m128 _mm_set1_ps(float _w)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Set the MXCSR control and status register with the value in unsigned 32-bit
-// integer a.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
-// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
-FORCE_INLINE void _mm_setcsr(unsigned int a)
-{
-    _MM_SET_ROUNDING_MODE(a);
-}
-
-// Get the unsigned 32-bit value of the MXCSR control and status register.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
-// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
-FORCE_INLINE unsigned int _mm_getcsr()
-{
-    return _MM_GET_ROUNDING_MODE();
-}
-
-// Set packed single-precision (32-bit) floating-point elements in dst with the
-// supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
-FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
-{
-    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Return vector of type __m128 with all elements set to zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
-FORCE_INLINE __m128 _mm_setzero_ps(void)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(0));
-}
-
-// Shuffle 16-bit integers in a using the control in imm8, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_pi16(a, imm)                                           \
-    __extension__({                                                        \
-        vreinterpret_m64_s16(vshuffle_s16(                                 \
-            vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
-            ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)));  \
-    })
-#else
-#define _mm_shuffle_pi16(a, imm)                                               \
-    __extension__({                                                            \
-        int16x4_t ret;                                                         \
-        ret =                                                                  \
-            vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
-        ret = vset_lane_s16(                                                   \
-            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret,   \
-            1);                                                                \
-        ret = vset_lane_s16(                                                   \
-            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret,   \
-            2);                                                                \
-        ret = vset_lane_s16(                                                   \
-            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret,   \
-            3);                                                                \
-        vreinterpret_m64_s16(ret);                                             \
-    })
-#endif
-
-// Perform a serializing operation on all store-to-memory instructions that were
-// issued prior to this instruction. Guarantees that every store instruction
-// that precedes, in program order, is globally visible before any store
-// instruction which follows the fence in program order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
-FORCE_INLINE void _mm_sfence(void)
-{
-    _sse2neon_smp_mb();
-}
-
-// Perform a serializing operation on all load-from-memory and store-to-memory
-// instructions that were issued prior to this instruction. Guarantees that
-// every memory access that precedes, in program order, the memory fence
-// instruction is globally visible before any memory instruction which follows
-// the fence in program order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
-FORCE_INLINE void _mm_mfence(void)
-{
-    _sse2neon_smp_mb();
-}
-
-// Perform a serializing operation on all load-from-memory instructions that
-// were issued prior to this instruction. Guarantees that every load instruction
-// that precedes, in program order, is globally visible before any load
-// instruction which follows the fence in program order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
-FORCE_INLINE void _mm_lfence(void)
-{
-    _sse2neon_smp_mb();
-}
-
-// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
-// int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_ps(a, b, imm)                                              \
-    __extension__({                                                            \
-        float32x4_t _input1 = vreinterpretq_f32_m128(a);                       \
-        float32x4_t _input2 = vreinterpretq_f32_m128(b);                       \
-        float32x4_t _shuf =                                                    \
-            vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
-                          (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
-        vreinterpretq_m128_f32(_shuf);                                         \
-    })
-#else  // generic
-#define _mm_shuffle_ps(a, b, imm)                          \
-    __extension__({                                        \
-        __m128 ret;                                        \
-        switch (imm) {                                     \
-        case _MM_SHUFFLE(1, 0, 3, 2):                      \
-            ret = _mm_shuffle_ps_1032((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 3, 0, 1):                      \
-            ret = _mm_shuffle_ps_2301((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 3, 2, 1):                      \
-            ret = _mm_shuffle_ps_0321((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 1, 0, 3):                      \
-            ret = _mm_shuffle_ps_2103((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(1, 0, 1, 0):                      \
-            ret = _mm_movelh_ps((a), (b));                 \
-            break;                                         \
-        case _MM_SHUFFLE(1, 0, 0, 1):                      \
-            ret = _mm_shuffle_ps_1001((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 1, 0, 1):                      \
-            ret = _mm_shuffle_ps_0101((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(3, 2, 1, 0):                      \
-            ret = _mm_shuffle_ps_3210((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 0, 1, 1):                      \
-            ret = _mm_shuffle_ps_0011((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 0, 2, 2):                      \
-            ret = _mm_shuffle_ps_0022((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 2, 0, 0):                      \
-            ret = _mm_shuffle_ps_2200((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(3, 2, 0, 2):                      \
-            ret = _mm_shuffle_ps_3202((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(3, 2, 3, 2):                      \
-            ret = _mm_movehl_ps((b), (a));                 \
-            break;                                         \
-        case _MM_SHUFFLE(1, 1, 3, 3):                      \
-            ret = _mm_shuffle_ps_1133((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 0, 1, 0):                      \
-            ret = _mm_shuffle_ps_2010((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 0, 0, 1):                      \
-            ret = _mm_shuffle_ps_2001((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 0, 3, 2):                      \
-            ret = _mm_shuffle_ps_2032((a), (b));           \
-            break;                                         \
-        default:                                           \
-            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
-            break;                                         \
-        }                                                  \
-        ret;                                               \
-    })
-#endif
-
-// Compute the square root of packed single-precision (32-bit) floating-point
-// elements in a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
-FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
-{
-#if SSE2NEON_PRECISE_SQRT
-    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-
-    // Test for vrsqrteq_f32(0) -> positive infinity case.
-    // Change to zero, so that s * 1/sqrt(s) result is zero too.
-    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
-    const uint32x4_t div_by_zero =
-        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
-    recip = vreinterpretq_f32_u32(
-        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
-
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-        recip);
-    recip = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-        recip);
-
-    // sqrt(s) = s * 1/sqrt(s)
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
-#elif defined(__aarch64__)
-    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
-#else
-    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-    float32x4_t sq = vrecpeq_f32(recipsq);
-    return vreinterpretq_m128_f32(sq);
-#endif
-}
-
-// Compute the square root of the lower single-precision (32-bit) floating-point
-// element in a, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
-FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
-{
-    float32_t value =
-        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-// or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
-FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
-{
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Store the lower single-precision (32-bit) floating-point element from a into
-// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
-FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
-{
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    vst1q_f32(p, vdupq_n_f32(a0));
-}
-
-// Store the lower single-precision (32-bit) floating-point element from a into
-// memory. mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
-FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
-{
-    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
-}
-
-// Store the lower single-precision (32-bit) floating-point element from a into
-// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
-#define _mm_store1_ps _mm_store_ps1
-
-// Store the upper 2 single-precision (32-bit) floating-point elements from a
-// into memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
-FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
-{
-    *p = vreinterpret_m64_f32(vget_high_f32(a));
-}
-
-// Store the lower 2 single-precision (32-bit) floating-point elements from a
-// into memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
-FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
-{
-    *p = vreinterpret_m64_f32(vget_low_f32(a));
-}
-
-// Store 4 single-precision (32-bit) floating-point elements from a into memory
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
-FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
-{
-    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
-    float32x4_t rev = vextq_f32(tmp, tmp, 2);
-    vst1q_f32(p, rev);
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from a into memory. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
-FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
-{
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores 16-bits of integer data a at the address p.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
-FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
-{
-    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
-}
-
-// Stores 64-bits of integer data a at the address p.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
-FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
-{
-    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
-}
-
-// Store 64-bits of integer data from a into memory using a non-temporal memory
-// hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
-FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
-{
-    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
-// point elements) from a into memory using a non-temporal memory hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
-FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, (float32x4_t *) p);
-#else
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-#endif
-}
-
-// Subtract packed single-precision (32-bit) floating-point elements in b from
-// packed single-precision (32-bit) floating-point elements in a, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
-FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Subtract the lower single-precision (32-bit) floating-point element in b from
-// the lower single-precision (32-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper 3 packed elements from
-// a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
-FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_sub_ps(a, b));
-}
-
-// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
-// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
-// transposed matrix in these vectors (row0 now contains column 0, etc.).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
-#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
-    do {                                                  \
-        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
-        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
-        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
-                            vget_low_f32(ROW23.val[0]));  \
-        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
-                            vget_low_f32(ROW23.val[1]));  \
-        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
-                            vget_high_f32(ROW23.val[0])); \
-        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
-                            vget_high_f32(ROW23.val[1])); \
-    } while (0)
-
-// according to the documentation, these intrinsics behave the same as the
-// non-'u' versions.  We'll just alias them here.
-#define _mm_ucomieq_ss _mm_comieq_ss
-#define _mm_ucomige_ss _mm_comige_ss
-#define _mm_ucomigt_ss _mm_comigt_ss
-#define _mm_ucomile_ss _mm_comile_ss
-#define _mm_ucomilt_ss _mm_comilt_ss
-#define _mm_ucomineq_ss _mm_comineq_ss
-
-// Return vector of type __m128i with undefined elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
-FORCE_INLINE __m128i _mm_undefined_si128(void)
-{
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-    __m128i a;
-    return a;
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-// Return vector of type __m128 with undefined elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
-FORCE_INLINE __m128 _mm_undefined_ps(void)
-{
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-    __m128 a;
-    return a;
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-// Unpack and interleave single-precision (32-bit) floating-point elements from
-// the high half a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
-FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
-    float32x2x2_t result = vzip_f32(a1, b1);
-    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave single-precision (32-bit) floating-point elements from
-// the low half of a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
-FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
-    float32x2x2_t result = vzip_f32(a1, b1);
-    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
-FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-/* SSE2 */
-
-// Add packed 16-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
-FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Add packed 32-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
-FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Add packed 64-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
-FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s64(
-        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-}
-
-// Add packed 8-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
-FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Add packed double-precision (64-bit) floating-point elements in a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
-FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1] + db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Add the lower double-precision (64-bit) floating-point element in a and b,
-// store the result in the lower element of dst, and copy the upper element from
-// a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
-FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_add_pd(a, b));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Add 64-bit integers a and b, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
-FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s64(
-        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
-}
-
-// Add packed signed 16-bit integers in a and b using saturation, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
-FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Add packed signed 8-bit integers in a and b using saturation, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
-FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Add packed unsigned 16-bit integers in a and b using saturation, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
-FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Add packed unsigned 8-bit integers in a and b using saturation, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
-FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Compute the bitwise AND of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
-FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_s64(
-        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
-FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
-// elements in a and then AND with b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
-FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
-{
-    // *NOTE* argument swap
-    return vreinterpretq_m128d_s64(
-        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
-}
-
-// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
-// AND with b, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
-FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vbicq_s32(vreinterpretq_s32_m128i(b),
-                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
-}
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
-FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
-{
-    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
-                                 vreinterpretq_u16_m128i(b));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
-FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Shift a left by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
-#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
-
-// Shift a right by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
-#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
-
-// Cast vector of type __m128d to type __m128. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
-FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
-{
-    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
-}
-
-// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
-FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
-{
-    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
-}
-
-// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
-FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
-{
-    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
-}
-
-// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
-FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
-{
-    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
-}
-
-// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
-FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
-#else
-    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
-#endif
-}
-
-// Cast vector of type __m128i to type __m128. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
-FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
-{
-    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
-}
-
-// Invalidate and flush the cache line that contains p from all levels of the
-// cache hierarchy.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
-#if defined(__APPLE__)
-#include <libkern/OSCacheControl.h>
-#endif
-FORCE_INLINE void _mm_clflush(void const *p)
-{
-    (void) p;
-
-    /* sys_icache_invalidate is supported since macOS 10.5.
-     * However, it does not work on non-jailbroken iOS devices, although the
-     * compilation is successful.
-     */
-#if defined(__APPLE__)
-    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
-#elif defined(__GNUC__) || defined(__clang__)
-    uintptr_t ptr = (uintptr_t) p;
-    __builtin___clear_cache((char *) ptr,
-                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
-#else
-    /* FIXME: MSVC support */
-#endif
-}
-
-// Compare packed 16-bit integers in a and b for equality, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
-FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed 32-bit integers in a and b for equality, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed 8-bit integers in a and b for equality, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
-FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for equality, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
-FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for equality, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
-FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
-FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for greater-than-or-equal, store the result in the lower element of dst,
-// and copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
-FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
-#else
-    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed signed 16-bit integers in a and b for greater-than, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
-FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed signed 32-bit integers in a and b for greater-than, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
-FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b for greater-than, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
-FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
-FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for greater-than, store the result in the lower element of dst, and copy
-// the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
-FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
-#else
-    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
-FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for less-than-or-equal, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
-FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmple_pd(a, b));
-#else
-    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed signed 16-bit integers in a and b for less-than, and store the
-// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
-// order of the operands switched.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
-FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed signed 32-bit integers in a and b for less-than, and store the
-// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
-// order of the operands switched.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
-FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b for less-than, and store the
-// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
-// order of the operands switched.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
-FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
-FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for less-than, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
-FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
-FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
-#else
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-equal, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
-FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
-FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-greater-than-or-equal, store the result in the lower element of
-// dst, and copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
-FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
-FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-greater-than, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
-FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
-FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-less-than-or-equal, store the result in the lower element of dst,
-// and copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
-FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
-FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-less-than, store the result in the lower element of dst, and copy
-// the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
-FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// to see if neither is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
-FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    // Excluding NaNs, any two floating point numbers can be compared.
-    uint64x2_t not_nan_a =
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
-    uint64x2_t not_nan_b =
-        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
-    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b to see if neither is NaN, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
-FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// to see if either is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
-FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    // Two NaNs are not equal in comparison operation.
-    uint64x2_t not_nan_a =
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
-    uint64x2_t not_nan_b =
-        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
-    return vreinterpretq_m128d_s32(
-        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b to see if either is NaN, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
-FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for greater-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
-FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 >= *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for greater-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
-FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 > *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for less-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
-FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 <= *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for less-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
-FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 < *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for equality, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
-FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
-#else
-    uint32x4_t a_not_nan =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
-    uint32x4_t b_not_nan =
-        vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
-    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-    uint32x4_t a_eq_b =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
-    uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
-                                       vreinterpretq_u64_u32(a_eq_b));
-    return vgetq_lane_u64(and_results, 0) & 0x1;
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for not-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
-FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
-{
-    return !_mm_comieq_sd(a, b);
-}
-
-// Convert packed signed 32-bit integers in a to packed double-precision
-// (64-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
-FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
-#else
-    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
-    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Convert packed signed 32-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
-FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
-FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
-{
-// vrnd32xq_f64 not supported on clang
-#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
-    float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
-    int64x2_t integers = vcvtq_s64_f64(rounded);
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
-#else
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double d0 = ((double *) &rnd)[0];
-    double d1 = ((double *) &rnd)[1];
-    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
-#endif
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
-FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
-{
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double d0 = ((double *) &rnd)[0];
-    double d1 = ((double *) &rnd)[1];
-    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
-    return vreinterpret_m64_s32(vld1_s32(data));
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed single-precision (32-bit) floating-point elements, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
-FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
-{
-#if defined(__aarch64__)
-    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
-    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
-#else
-    float a0 = (float) ((double *) &a)[0];
-    float a1 = (float) ((double *) &a)[1];
-    return _mm_set_ps(0, 0, a1, a0);
-#endif
-}
-
-// Convert packed signed 32-bit integers in a to packed double-precision
-// (64-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
-FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
-#else
-    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
-    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
-// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
-// does not support! It is supported on ARMv8-A however.
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
-{
-#if defined(__ARM_FEATURE_FRINT)
-    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
-#elif defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    switch (_MM_GET_ROUNDING_MODE()) {
-    case _MM_ROUND_NEAREST:
-        return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
-    case _MM_ROUND_DOWN:
-        return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
-    case _MM_ROUND_UP:
-        return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
-    default:  // _MM_ROUND_TOWARD_ZERO
-        return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
-    }
-#else
-    float *f = (float *) &a;
-    switch (_MM_GET_ROUNDING_MODE()) {
-    case _MM_ROUND_NEAREST: {
-        uint32x4_t signmask = vdupq_n_u32(0x80000000);
-        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
-                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
-        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
-            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
-        int32x4_t r_trunc = vcvtq_s32_f32(
-            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
-        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
-            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
-        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
-                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
-        float32x4_t delta = vsubq_f32(
-            vreinterpretq_f32_m128(a),
-            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
-        uint32x4_t is_delta_half =
-            vceqq_f32(delta, half); /* delta == +/- 0.5 */
-        return vreinterpretq_m128i_s32(
-            vbslq_s32(is_delta_half, r_even, r_normal));
-    }
-    case _MM_ROUND_DOWN:
-        return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
-                             floorf(f[0]));
-    case _MM_ROUND_UP:
-        return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
-                             ceilf(f[0]));
-    default:  // _MM_ROUND_TOWARD_ZERO
-        return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
-                             (int32_t) f[0]);
-    }
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed double-precision (64-bit) floating-point elements, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
-FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
-#else
-    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Copy the lower double-precision (64-bit) floating-point element of a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
-FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
-{
-#if defined(__aarch64__)
-    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
-#else
-    return ((double *) &a)[0];
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
-FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
-{
-#if defined(__aarch64__)
-    return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
-    return (int32_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
-FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
-{
-#if defined(__aarch64__)
-    return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
-    return (int64_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
-#define _mm_cvtsd_si64x _mm_cvtsd_si64
-
-// Convert the lower double-precision (64-bit) floating-point element in b to a
-// single-precision (32-bit) floating-point element, store the result in the
-// lower element of dst, and copy the upper 3 packed elements from a to the
-// upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
-FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(vsetq_lane_f32(
-        vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
-        vreinterpretq_f32_m128(a), 0));
-#else
-    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
-                                                 vreinterpretq_f32_m128(a), 0));
-#endif
-}
-
-// Copy the lower 32-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
-FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
-{
-    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
-FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
-{
-    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
-#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
-// Convert the signed 32-bit integer b to a double-precision (64-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
-FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
-#else
-    double bf = (double) b;
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
-#endif
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
-#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
-// Copy 32-bit integer a to the lower elements of dst, and zero the upper
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
-FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
-{
-    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
-}
-
-// Convert the signed 64-bit integer b to a double-precision (64-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
-FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
-#else
-    double bf = (double) b;
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
-#endif
-}
-
-// Copy 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
-FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
-{
-    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
-}
-
-// Copy 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
-#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
-
-// Convert the signed 64-bit integer b to a double-precision (64-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
-#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
-
-// Convert the lower single-precision (32-bit) floating-point element in b to a
-// double-precision (64-bit) floating-point element, store the result in the
-// lower element of dst, and copy the upper element from a to the upper element
-// of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
-FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
-{
-    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
-#else
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
-#endif
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
-FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
-{
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
-    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
-FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
-{
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
-    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
-    return vreinterpret_m64_s32(vld1_s32(data));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
-FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
-{
-    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
-FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
-{
-    double ret = *((double *) &a);
-    return (int32_t) ret;
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
-FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
-    double ret = *((double *) &a);
-    return (int64_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
-#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
-
-// Divide packed double-precision (64-bit) floating-point elements in a by
-// packed elements in b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
-FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] / db[0];
-    c[1] = da[1] / db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Divide the lower double-precision (64-bit) floating-point element in a by the
-// lower double-precision (64-bit) floating-point element in b, store the result
-// in the lower element of dst, and copy the upper element from a to the upper
-// element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
-FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    float64x2_t tmp =
-        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
-#else
-    return _mm_move_sd(a, _mm_div_pd(a, b));
-#endif
-}
-
-// Extract a 16-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
-// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
-#define _mm_extract_epi16(a, imm) \
-    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
-// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
-//                                       __constrange(0,8) int imm)
-#define _mm_insert_epi16(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s16(                                     \
-            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
-    })
-
-// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
-FORCE_INLINE __m128d _mm_load_pd(const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_f64(p));
-#else
-    const float *fp = (const float *) p;
-    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
-    return vreinterpretq_m128d_f32(vld1q_f32(data));
-#endif
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
-#define _mm_load_pd1 _mm_load1_pd
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower of dst, and zero the upper element. mem_addr does not need to be
-// aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
-FORCE_INLINE __m128d _mm_load_sd(const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
-#else
-    const float *fp = (const float *) p;
-    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
-    return vreinterpretq_m128d_f32(vld1q_f32(data));
-#endif
-}
-
-// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
-// on a 16-byte boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
-FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
-{
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
-FORCE_INLINE __m128d _mm_load1_pd(const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
-#else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
-#endif
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// upper element of dst, and copy the lower element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
-FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
-#else
-    return vreinterpretq_m128d_f32(vcombine_f32(
-        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
-#endif
-}
-
-// Load 64-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
-FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
-{
-    /* Load the lower 64 bits of the value pointed to by p into the
-     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
-     */
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower element of dst, and copy the upper element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
-FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
-#else
-    return vreinterpretq_m128d_f32(
-        vcombine_f32(vld1_f32((const float *) p),
-                     vget_high_f32(vreinterpretq_f32_m128d(a))));
-#endif
-}
-
-// Load 2 double-precision (64-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
-FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
-{
-#if defined(__aarch64__)
-    float64x2_t v = vld1q_f64(p);
-    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
-#else
-    int64x2_t v = vld1q_s64((const int64_t *) p);
-    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
-#endif
-}
-
-// Loads two double-precision from unaligned memory, floating-point values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
-FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
-{
-    return _mm_load_pd(p);
-}
-
-// Load 128-bits of integer data from memory into dst. mem_addr does not need to
-// be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
-FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
-{
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-}
-
-// Load unaligned 32-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
-FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
-{
-    return vreinterpretq_m128i_s32(
-        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
-// 32-bit integers, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
-FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
-{
-    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-                              vget_low_s16(vreinterpretq_s16_m128i(b)));
-#if defined(__aarch64__)
-    int32x4_t high =
-        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
-
-    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
-#else
-    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
-                               vget_high_s16(vreinterpretq_s16_m128i(b)));
-
-    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
-    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
-
-    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
-#endif
-}
-
-// Conditionally store 8-bit integer elements from a into memory using mask
-// (elements are not stored when the highest bit is not set in the corresponding
-// element) and a non-temporal memory hint. mem_addr does not need to be aligned
-// on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
-FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
-{
-    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
-    __m128 b = _mm_load_ps((const float *) mem_addr);
-    int8x16_t masked =
-        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
-                 vreinterpretq_s8_m128(b));
-    vst1q_s8((int8_t *) mem_addr, masked);
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
-FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
-FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b,
-// and store packed maximum values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
-FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-#if SSE2NEON_PRECISE_MINMAX
-    float64x2_t _a = vreinterpretq_f64_m128d(a);
-    float64x2_t _b = vreinterpretq_f64_m128d(b);
-    return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128d_f64(
-        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#endif
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b, store the maximum value in the lower element of dst, and copy the upper
-// element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
-FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_max_pd(a, b));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
-    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
-#endif
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
-FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
-FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b,
-// and store packed minimum values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
-FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-#if SSE2NEON_PRECISE_MINMAX
-    float64x2_t _a = vreinterpretq_f64_m128d(a);
-    float64x2_t _b = vreinterpretq_f64_m128d(b);
-    return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128d_f64(
-        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#endif
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b, store the minimum value in the lower element of dst, and copy the upper
-// element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
-FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_min_pd(a, b));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
-    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
-#endif
-}
-
-// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
-// upper element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
-FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
-{
-    return vreinterpretq_m128i_s64(
-        vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
-}
-
-// Move the lower double-precision (64-bit) floating-point element from b to the
-// lower element of dst, and copy the upper element from a to the upper element
-// of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
-FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_f32(
-        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
-                     vget_high_f32(vreinterpretq_f32_m128d(a))));
-}
-
-// Create mask from the most significant bit of each 8-bit element in a, and
-// store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
-FORCE_INLINE int _mm_movemask_epi8(__m128i a)
-{
-    // Use increasingly wide shifts+adds to collect the sign bits
-    // together.
-    // Since the widening shifts would be rather confusing to follow in little
-    // endian, everything will be illustrated in big endian order instead. This
-    // has a different result - the bits would actually be reversed on a big
-    // endian machine.
-
-    // Starting input (only half the elements are shown):
-    // 89 ff 1d c0 00 10 99 33
-    uint8x16_t input = vreinterpretq_u8_m128i(a);
-
-    // Shift out everything but the sign bits with an unsigned shift right.
-    //
-    // Bytes of the vector::
-    // 89 ff 1d c0 00 10 99 33
-    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
-    //  |  |  |  |  |  |  |  |
-    // 01 01 00 01 00 00 01 00
-    //
-    // Bits of first important lane(s):
-    // 10001001 (89)
-    // \______
-    //        |
-    // 00000001 (01)
-    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
-
-    // Merge the even lanes together with a 16-bit unsigned shift right + add.
-    // 'xx' represents garbage data which will be ignored in the final result.
-    // In the important bytes, the add functions like a binary OR.
-    //
-    // 01 01 00 01 00 00 01 00
-    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
-    //    \|    \|    \|    \|
-    // xx 03 xx 01 xx 00 xx 02
-    //
-    // 00000001 00000001 (01 01)
-    //        \_______ |
-    //                \|
-    // xxxxxxxx xxxxxx11 (xx 03)
-    uint32x4_t paired16 =
-        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
-
-    // Repeat with a wider 32-bit shift + add.
-    // xx 03 xx 01 xx 00 xx 02
-    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
-    //     14))
-    //          \|          \|
-    // xx xx xx 0d xx xx xx 02
-    //
-    // 00000011 00000001 (03 01)
-    //        \\_____ ||
-    //         '----.\||
-    // xxxxxxxx xxxx1101 (xx 0d)
-    uint64x2_t paired32 =
-        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
-
-    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
-    // lanes. xx xx xx 0d xx xx xx 02
-    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
-    //            28))
-    //                      \|
-    // xx xx xx xx xx xx xx d2
-    //
-    // 00001101 00000010 (0d 02)
-    //     \   \___ |  |
-    //      '---.  \|  |
-    // xxxxxxxx 11010010 (xx d2)
-    uint8x16_t paired64 =
-        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
-
-    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
-    // xx xx xx xx xx xx xx d2
-    //                      ||  return paired64[0]
-    //                      d2
-    // Note: Little endian would return the correct value 4b (01001011) instead.
-    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
-}
-
-// Set each bit of mask dst based on the most significant bit of the
-// corresponding packed double-precision (64-bit) floating-point element in a.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
-FORCE_INLINE int _mm_movemask_pd(__m128d a)
-{
-    uint64x2_t input = vreinterpretq_u64_m128d(a);
-    uint64x2_t high_bits = vshrq_n_u64(input, 63);
-    return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
-FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
-{
-    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
-}
-
-// Copy the 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
-FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
-{
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
-}
-
-// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
-// a and b, and store the unsigned 64-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
-FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
-{
-    // vmull_u32 upcasts instead of masking, so we downcast.
-    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
-    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
-    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
-}
-
-// Multiply packed double-precision (64-bit) floating-point elements in a and b,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
-FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] * db[0];
-    c[1] = da[1] * db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Multiply the lower double-precision (64-bit) floating-point element in a and
-// b, store the result in the lower element of dst, and copy the upper element
-// from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
-FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_mul_pd(a, b));
-}
-
-// Multiply the low unsigned 32-bit integers from a and b, and store the
-// unsigned 64-bit result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
-FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u64(vget_low_u64(
-        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
-}
-
-// Multiply the packed signed 16-bit integers in a and b, producing intermediate
-// 32-bit integers, and store the high 16 bits of the intermediate integers in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
-FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
-{
-    /* FIXME: issue with large values because of result saturation */
-    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
-    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
-    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
-    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
-    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
-    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
-    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
-    uint16x8x2_t r =
-        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
-    return vreinterpretq_m128i_u16(r.val[1]);
-}
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
-FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
-{
-    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
-    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
-    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
-#if defined(__aarch64__)
-    uint32x4_t ab7654 =
-        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
-    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
-                              vreinterpretq_u16_u32(ab7654));
-    return vreinterpretq_m128i_u16(r);
-#else
-    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
-    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
-    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
-    uint16x8x2_t r =
-        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
-    return vreinterpretq_m128i_u16(r.val[1]);
-#endif
-}
-
-// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
-// integers, and store the low 16 bits of the intermediate integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
-FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compute the bitwise OR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
-FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_s64(
-        vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
-// and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
-FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-// using signed saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
-FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
-                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
-}
-
-// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-// using signed saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
-FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
-                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
-}
-
-// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-// using unsigned saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
-FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
-                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
-}
-
-// Pause the processor. This is typically used in spin-wait loops and depending
-// on the x86 processor typical values are in the 40-100 cycle range. The
-// 'yield' instruction isn't a good fit because it's effectively a nop on most
-// Arm cores. Experience with several databases has shown has shown an 'isb' is
-// a reasonable approximation.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
-FORCE_INLINE void _mm_pause()
-{
-    __asm__ __volatile__("isb\n");
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce two
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of 64-bit elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
-FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
-{
-    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
-    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
-}
-
-// Set packed 16-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
-FORCE_INLINE __m128i _mm_set_epi16(short i7,
-                                   short i6,
-                                   short i5,
-                                   short i4,
-                                   short i3,
-                                   short i2,
-                                   short i1,
-                                   short i0)
-{
-    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
-    return vreinterpretq_m128i_s16(vld1q_s16(data));
-}
-
-// Set packed 32-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
-FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
-{
-    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Set packed 64-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
-FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
-{
-    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
-}
-
-// Set packed 64-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
-{
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
-}
-
-// Set packed 8-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
-FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
-                                  signed char b14,
-                                  signed char b13,
-                                  signed char b12,
-                                  signed char b11,
-                                  signed char b10,
-                                  signed char b9,
-                                  signed char b8,
-                                  signed char b7,
-                                  signed char b6,
-                                  signed char b5,
-                                  signed char b4,
-                                  signed char b3,
-                                  signed char b2,
-                                  signed char b1,
-                                  signed char b0)
-{
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
-    return (__m128i) vld1q_s8(data);
-}
-
-// Set packed double-precision (64-bit) floating-point elements in dst with the
-// supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
-FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
-{
-    double ALIGN_STRUCT(16) data[2] = {e0, e1};
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
-#else
-    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
-#endif
-}
-
-// Broadcast double-precision (64-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
-#define _mm_set_pd1 _mm_set1_pd
-
-// Copy double-precision (64-bit) floating-point element a to the lower element
-// of dst, and zero the upper element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
-FORCE_INLINE __m128d _mm_set_sd(double a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
-#else
-    return _mm_set_pd(0, a);
-#endif
-}
-
-// Broadcast 16-bit integer a to all all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
-FORCE_INLINE __m128i _mm_set1_epi16(short w)
-{
-    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
-}
-
-// Broadcast 32-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
-FORCE_INLINE __m128i _mm_set1_epi32(int _i)
-{
-    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
-}
-
-// Broadcast 64-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
-FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
-{
-    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
-}
-
-// Broadcast 64-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
-FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
-{
-    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
-}
-
-// Broadcast 8-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
-FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
-{
-    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
-}
-
-// Broadcast double-precision (64-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
-FORCE_INLINE __m128d _mm_set1_pd(double d)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vdupq_n_f64(d));
-#else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
-#endif
-}
-
-// Set packed 16-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
-FORCE_INLINE __m128i _mm_setr_epi16(short w0,
-                                    short w1,
-                                    short w2,
-                                    short w3,
-                                    short w4,
-                                    short w5,
-                                    short w6,
-                                    short w7)
-{
-    int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
-    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
-}
-
-// Set packed 32-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
-FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
-{
-    int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Set packed 64-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
-FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
-{
-    return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
-}
-
-// Set packed 8-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
-FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
-                                   signed char b1,
-                                   signed char b2,
-                                   signed char b3,
-                                   signed char b4,
-                                   signed char b5,
-                                   signed char b6,
-                                   signed char b7,
-                                   signed char b8,
-                                   signed char b9,
-                                   signed char b10,
-                                   signed char b11,
-                                   signed char b12,
-                                   signed char b13,
-                                   signed char b14,
-                                   signed char b15)
-{
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
-    return (__m128i) vld1q_s8(data);
-}
-
-// Set packed double-precision (64-bit) floating-point elements in dst with the
-// supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
-FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
-{
-    return _mm_set_pd(e0, e1);
-}
-
-// Return vector of type __m128d with all elements set to zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
-FORCE_INLINE __m128d _mm_setzero_pd(void)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vdupq_n_f64(0));
-#else
-    return vreinterpretq_m128d_f32(vdupq_n_f32(0));
-#endif
-}
-
-// Return vector of type __m128i with all elements set to zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
-FORCE_INLINE __m128i _mm_setzero_si128(void)
-{
-    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
-}
-
-// Shuffle 32-bit integers in a using the control in imm8, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
-// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
-//                                        __constrange(0,255) int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_epi32(a, imm)                                            \
-    __extension__({                                                          \
-        int32x4_t _input = vreinterpretq_s32_m128i(a);                       \
-        int32x4_t _shuf =                                                    \
-            vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
-                          ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
-        vreinterpretq_m128i_s32(_shuf);                                      \
-    })
-#else  // generic
-#define _mm_shuffle_epi32(a, imm)                        \
-    __extension__({                                      \
-        __m128i ret;                                     \
-        switch (imm) {                                   \
-        case _MM_SHUFFLE(1, 0, 3, 2):                    \
-            ret = _mm_shuffle_epi_1032((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 3, 0, 1):                    \
-            ret = _mm_shuffle_epi_2301((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 3, 2, 1):                    \
-            ret = _mm_shuffle_epi_0321((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 1, 0, 3):                    \
-            ret = _mm_shuffle_epi_2103((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(1, 0, 1, 0):                    \
-            ret = _mm_shuffle_epi_1010((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(1, 0, 0, 1):                    \
-            ret = _mm_shuffle_epi_1001((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 1, 0, 1):                    \
-            ret = _mm_shuffle_epi_0101((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 2, 1, 1):                    \
-            ret = _mm_shuffle_epi_2211((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 1, 2, 2):                    \
-            ret = _mm_shuffle_epi_0122((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(3, 3, 3, 2):                    \
-            ret = _mm_shuffle_epi_3332((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 0, 0, 0):                    \
-            ret = _mm_shuffle_epi32_splat((a), 0);       \
-            break;                                       \
-        case _MM_SHUFFLE(1, 1, 1, 1):                    \
-            ret = _mm_shuffle_epi32_splat((a), 1);       \
-            break;                                       \
-        case _MM_SHUFFLE(2, 2, 2, 2):                    \
-            ret = _mm_shuffle_epi32_splat((a), 2);       \
-            break;                                       \
-        case _MM_SHUFFLE(3, 3, 3, 3):                    \
-            ret = _mm_shuffle_epi32_splat((a), 3);       \
-            break;                                       \
-        default:                                         \
-            ret = _mm_shuffle_epi32_default((a), (imm)); \
-            break;                                       \
-        }                                                \
-        ret;                                             \
-    })
-#endif
-
-// Shuffle double-precision (64-bit) floating-point elements using the control
-// in imm8, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_pd(a, b, imm8)                                            \
-    vreinterpretq_m128d_s64(                                                  \
-        vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
-                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
-#else
-#define _mm_shuffle_pd(a, b, imm8)                                     \
-    _mm_castsi128_pd(_mm_set_epi64x(                                   \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
-#endif
-
-// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
-//                                          __constrange(0,255) int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shufflehi_epi16(a, imm)                                           \
-    __extension__({                                                           \
-        int16x8_t _input = vreinterpretq_s16_m128i(a);                        \
-        int16x8_t _shuf =                                                     \
-            vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
-                          (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
-                          (((imm) >> 6) & 0x3) + 4);                          \
-        vreinterpretq_m128i_s16(_shuf);                                       \
-    })
-#else  // generic
-#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
-#endif
-
-// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
-//                                          __constrange(0,255) int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shufflelo_epi16(a, imm)                                  \
-    __extension__({                                                  \
-        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
-        int16x8_t _shuf = vshuffleq_s16(                             \
-            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
-            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
-        vreinterpretq_m128i_s16(_shuf);                              \
-    })
-#else  // generic
-#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
-#endif
-
-// Shift packed 16-bit integers in a left by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
-FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~15))
-        return _mm_setzero_si128();
-
-    int16x8_t vc = vdupq_n_s16((int16_t) c);
-    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
-}
-
-// Shift packed 32-bit integers in a left by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
-FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~31))
-        return _mm_setzero_si128();
-
-    int32x4_t vc = vdupq_n_s32((int32_t) c);
-    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
-}
-
-// Shift packed 64-bit integers in a left by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
-FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~63))
-        return _mm_setzero_si128();
-
-    int64x2_t vc = vdupq_n_s64((int64_t) c);
-    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
-}
-
-// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
-FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~15))
-        return _mm_setzero_si128();
-    return vreinterpretq_m128i_s16(
-        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
-}
-
-// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
-FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~31))
-        return _mm_setzero_si128();
-    return vreinterpretq_m128i_s32(
-        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
-}
-
-// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
-FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~63))
-        return _mm_setzero_si128();
-    return vreinterpretq_m128i_s64(
-        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
-}
-
-// Shift a left by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
-#define _mm_slli_si128(a, imm)                                         \
-    __extension__({                                                    \
-        int8x16_t ret;                                                 \
-        if (_sse2neon_unlikely(imm == 0))                              \
-            ret = vreinterpretq_s8_m128i(a);                           \
-        else if (_sse2neon_unlikely((imm) & ~15))                      \
-            ret = vdupq_n_s8(0);                                       \
-        else                                                           \
-            ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a),   \
-                           ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
-        vreinterpretq_m128i_s8(ret);                                   \
-    })
-
-// Compute the square root of packed double-precision (64-bit) floating-point
-// elements in a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
-FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
-#else
-    double a0 = sqrt(((double *) &a)[0]);
-    double a1 = sqrt(((double *) &a)[1]);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Compute the square root of the lower double-precision (64-bit) floating-point
-// element in b, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
-FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_sqrt_pd(b));
-#else
-    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
-#endif
-}
-
-// Shift packed 16-bit integers in a right by count while shifting in sign bits,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
-FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
-{
-    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-    if (_sse2neon_unlikely(c & ~15))
-        return _mm_cmplt_epi16(a, _mm_setzero_si128());
-    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
-}
-
-// Shift packed 32-bit integers in a right by count while shifting in sign bits,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
-FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
-{
-    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-    if (_sse2neon_unlikely(c & ~31))
-        return _mm_cmplt_epi32(a, _mm_setzero_si128());
-    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
-}
-
-// Shift packed 16-bit integers in a right by imm8 while shifting in sign
-// bits, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
-FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
-{
-    const int count = (imm & ~15) ? 15 : imm;
-    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
-}
-
-// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
-// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srai_epi32(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) == 0)) {                                \
-            ret = a;                                                         \
-        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) {              \
-            ret = vreinterpretq_m128i_s32(                                   \
-                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_s32(                                   \
-                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));                \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift packed 16-bit integers in a right by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
-FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~15))
-        return _mm_setzero_si128();
-
-    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
-    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
-}
-
-// Shift packed 32-bit integers in a right by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
-FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~31))
-        return _mm_setzero_si128();
-
-    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
-    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
-}
-
-// Shift packed 64-bit integers in a right by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
-FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~63))
-        return _mm_setzero_si128();
-
-    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
-    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
-}
-
-// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) & ~15)) {                               \
-            ret = _mm_setzero_si128();                                       \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_u16(                                   \
-                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
-// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_epi32(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) & ~31)) {                               \
-            ret = _mm_setzero_si128();                                       \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_u32(                                   \
-                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
-#define _mm_srli_epi64(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) & ~63)) {                               \
-            ret = _mm_setzero_si128();                                       \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_u64(                                   \
-                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift a right by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
-#define _mm_srli_si128(a, imm)                                       \
-    __extension__({                                                  \
-        int8x16_t ret;                                               \
-        if (_sse2neon_unlikely((imm) & ~15))                         \
-            ret = vdupq_n_s8(0);                                     \
-        else                                                         \
-            ret = vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), \
-                           (imm > 15 ? 0 : imm));                    \
-        vreinterpretq_m128i_s8(ret);                                 \
-    })
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-// or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
-FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
-#else
-    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
-#endif
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
-FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
-    vst1q_f64((float64_t *) mem_addr,
-              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
-#else
-    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
-    vst1q_f32((float32_t *) mem_addr,
-              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
-#endif
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// memory. mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
-FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
-#else
-    vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
-#endif
-}
-
-// Store 128-bits of integer data from a into memory. mem_addr must be aligned
-// on a 16-byte boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
-FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
-{
-    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
-#define _mm_store1_pd _mm_store_pd1
-
-// Store the upper double-precision (64-bit) floating-point element from a into
-// memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
-FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
-#else
-    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
-#endif
-}
-
-// Store 64-bit integer from the first element of a into memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
-FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
-{
-    vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
-FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
-#else
-    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
-#endif
-}
-
-// Store 2 double-precision (64-bit) floating-point elements from a into memory
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
-FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
-{
-    float32x4_t f = vreinterpretq_f32_m128d(a);
-    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
-}
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
-FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
-{
-    _mm_store_pd(mem_addr, a);
-}
-
-// Store 128-bits of integer data from a into memory. mem_addr does not need to
-// be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
-FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
-{
-    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-}
-
-// Store 32-bit integer from the first element of a into memory. mem_addr does
-// not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
-FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
-{
-    vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
-}
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory using a non-temporal memory hint. mem_addr must
-// be aligned on a 16-byte boundary or a general-protection exception may be
-// generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
-FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, (float32x4_t *) p);
-#elif defined(__aarch64__)
-    vst1q_f64(p, vreinterpretq_f64_m128d(a));
-#else
-    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
-#endif
-}
-
-// Store 128-bits of integer data from a into memory using a non-temporal memory
-// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
-// exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
-FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, p);
-#else
-    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
-#endif
-}
-
-// Store 32-bit integer a into memory using a non-temporal hint to minimize
-// cache pollution. If the cache line containing address mem_addr is already in
-// the cache, the cache will be updated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
-FORCE_INLINE void _mm_stream_si32(int *p, int a)
-{
-    vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
-}
-
-// Store 64-bit integer a into memory using a non-temporal hint to minimize
-// cache pollution. If the cache line containing address mem_addr is already in
-// the cache, the cache will be updated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
-FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
-{
-    vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
-}
-
-// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
-FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
-FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
-FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s64(
-        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-}
-
-// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
-FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Subtract packed double-precision (64-bit) floating-point elements in b from
-// packed double-precision (64-bit) floating-point elements in a, and store the
-// results in dst.
-//  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
-FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] - db[0];
-    c[1] = da[1] - db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Subtract the lower double-precision (64-bit) floating-point element in b from
-// the lower double-precision (64-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
-FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_sub_pd(a, b));
-}
-
-// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
-FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s64(
-        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
-}
-
-// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
-// using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
-FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
-// using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
-FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
-// integers in a using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
-FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
-// integers in a using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
-FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-#define _mm_ucomieq_sd _mm_comieq_sd
-#define _mm_ucomige_sd _mm_comige_sd
-#define _mm_ucomigt_sd _mm_comigt_sd
-#define _mm_ucomile_sd _mm_comile_sd
-#define _mm_ucomilt_sd _mm_comilt_sd
-#define _mm_ucomineq_sd _mm_comineq_sd
-
-// Return vector of type __m128d with undefined elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
-FORCE_INLINE __m128d _mm_undefined_pd(void)
-{
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-    __m128d a;
-    return a;
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-// Unpack and interleave 16-bit integers from the high half of a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
-FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
-    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
-    int16x4x2_t result = vzip_s16(a1, b1);
-    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 32-bit integers from the high half of a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
-FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-#else
-    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
-    int32x2x2_t result = vzip_s32(a1, b1);
-    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 64-bit integers from the high half of a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
-FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s64(
-        vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
-    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
-    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
-#endif
-}
-
-// Unpack and interleave 8-bit integers from the high half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
-FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(
-        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
-    int8x8_t a1 =
-        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
-    int8x8_t b1 =
-        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
-    int8x8x2_t result = vzip_s8(a1, b1);
-    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave double-precision (64-bit) floating-point elements from
-// the high half of a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
-FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    return vreinterpretq_m128d_s64(
-        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
-                     vget_high_s64(vreinterpretq_s64_m128d(b))));
-#endif
-}
-
-// Unpack and interleave 16-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
-FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
-    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
-    int16x4x2_t result = vzip_s16(a1, b1);
-    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 32-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
-FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-#else
-    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
-    int32x2x2_t result = vzip_s32(a1, b1);
-    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 64-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
-FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s64(
-        vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
-    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
-    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
-#endif
-}
-
-// Unpack and interleave 8-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
-FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(
-        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
-    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
-    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
-    int8x8x2_t result = vzip_s8(a1, b1);
-    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave double-precision (64-bit) floating-point elements from
-// the low half of a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
-FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    return vreinterpretq_m128d_s64(
-        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
-                     vget_low_s64(vreinterpretq_s64_m128d(b))));
-#endif
-}
-
-// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
-FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_s64(
-        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
-// and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
-FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-/* SSE3 */
-
-// Alternatively add and subtract packed double-precision (64-bit)
-// floating-point elements in a to/from packed elements in b, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
-FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
-{
-    _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
-                                             vreinterpretq_f64_m128d(b),
-                                             vreinterpretq_f64_m128d(mask)));
-#else
-    return _mm_add_pd(_mm_mul_pd(b, mask), a);
-#endif
-}
-
-// Alternatively add and subtract packed single-precision (32-bit)
-// floating-point elements in a to/from packed elements in b, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
-FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
-{
-    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
-#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
-    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
-                                            vreinterpretq_f32_m128(mask),
-                                            vreinterpretq_f32_m128(b)));
-#else
-    return _mm_add_ps(_mm_mul_ps(b, mask), a);
-#endif
-}
-
-// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
-// elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
-FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[] = {da[0] + da[1], db[0] + db[1]};
-    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
-#endif
-}
-
-// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
-// elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
-FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of double-precision (64-bit)
-// floating-point elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
-FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
-{
-#if defined(__aarch64__)
-    float64x2_t a = vreinterpretq_f64_m128d(_a);
-    float64x2_t b = vreinterpretq_f64_m128d(_b);
-    return vreinterpretq_m128d_f64(
-        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
-#else
-    double *da = (double *) &_a;
-    double *db = (double *) &_b;
-    double c[] = {da[0] - da[1], db[0] - db[1]};
-    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of single-precision (32-bit)
-// floating-point elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
-FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
-{
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
-#else
-    float32x4x2_t c = vuzpq_f32(a, b);
-    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
-#endif
-}
-
-// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
-// may perform better than _mm_loadu_si128 when the data crosses a cache line
-// boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
-#define _mm_lddqu_si128 _mm_loadu_si128
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
-#define _mm_loaddup_pd _mm_load1_pd
-
-// Duplicate the low double-precision (64-bit) floating-point element from a,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
-FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
-#else
-    return vreinterpretq_m128d_u64(
-        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
-#endif
-}
-
-// Duplicate odd-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
-FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
-#elif defined(_sse2neon_shuffle)
-    return vreinterpretq_m128_f32(vshuffleq_s32(
-        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
-#else
-    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
-    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
-    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-#endif
-}
-
-// Duplicate even-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
-FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
-#elif defined(_sse2neon_shuffle)
-    return vreinterpretq_m128_f32(vshuffleq_s32(
-        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
-#else
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
-    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-#endif
-}
-
-/* SSSE3 */
-
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
-FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
-{
-    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
-FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
-{
-    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
-FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
-{
-    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
-FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
-{
-    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
-}
-
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
-FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
-{
-    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
-}
-
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
-FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
-{
-    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
-}
-
-// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
-// the result right by imm8 bytes, and store the low 16 bytes in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
-#define _mm_alignr_epi8(a, b, imm)                                            \
-    __extension__({                                                           \
-        uint8x16_t _a = vreinterpretq_u8_m128i(a);                            \
-        uint8x16_t _b = vreinterpretq_u8_m128i(b);                            \
-        __m128i ret;                                                          \
-        if (_sse2neon_unlikely((imm) & ~31))                                  \
-            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));                      \
-        else if (imm >= 16)                                                   \
-            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0);                \
-        else                                                                  \
-            ret =                                                             \
-                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
-        ret;                                                                  \
-    })
-
-// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
-// the result right by imm8 bytes, and store the low 8 bytes in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
-#define _mm_alignr_pi8(a, b, imm)                                           \
-    __extension__({                                                         \
-        __m64 ret;                                                          \
-        if (_sse2neon_unlikely((imm) >= 16)) {                              \
-            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
-        } else {                                                            \
-            uint8x8_t tmp_low, tmp_high;                                    \
-            if ((imm) >= 8) {                                               \
-                const int idx = (imm) -8;                                   \
-                tmp_low = vreinterpret_u8_m64(a);                           \
-                tmp_high = vdup_n_u8(0);                                    \
-                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
-            } else {                                                        \
-                const int idx = (imm);                                      \
-                tmp_low = vreinterpret_u8_m64(b);                           \
-                tmp_high = vreinterpret_u8_m64(a);                          \
-                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
-            }                                                               \
-        }                                                                   \
-        ret;                                                                \
-    })
-
-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
-FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
-#else
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
-                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
-#endif
-}
-
-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
-FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
-{
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
-#else
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
-                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
-#endif
-}
-
-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
-FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s16(
-        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
-FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s32(
-        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
-}
-
-// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
-// saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
-FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-    return vreinterpretq_s64_s16(
-        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-    // Interleave using vshrn/vmovn
-    // [a0|a2|a4|a6|b0|b2|b4|b6]
-    // [a1|a3|a5|a7|b1|b3|b5|b7]
-    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
-    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
-    // Saturated add
-    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
-#endif
-}
-
-// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
-// saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
-FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
-#else
-    int16x4x2_t res = vuzp_s16(a, b);
-    return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
-// the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
-FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int16x8x2_t c = vuzpq_s16(a, b);
-    return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
-// the signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
-FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
-{
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
-#else
-    int32x4x2_t c = vuzpq_s32(a, b);
-    return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
-// the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
-FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
-#else
-    int16x4x2_t c = vuzp_s16(a, b);
-    return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
-// the signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
-FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
-{
-    int32x2_t a = vreinterpret_s32_m64(_a);
-    int32x2_t b = vreinterpret_s32_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
-#else
-    int32x2x2_t c = vuzp_s32(a, b);
-    return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
-// using saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
-FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int16x8x2_t c = vuzpq_s16(a, b);
-    return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
-// using saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
-FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
-#else
-    int16x4x2_t c = vuzp_s16(a, b);
-    return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Vertically multiply each unsigned 8-bit integer from a with the corresponding
-// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
-// and pack the saturated results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
-FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
-    uint8x16_t a = vreinterpretq_u8_m128i(_a);
-    int8x16_t b = vreinterpretq_s8_m128i(_b);
-    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
-                             vmovl_s8(vget_low_s8(b)));
-    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
-                             vmovl_s8(vget_high_s8(b)));
-    return vreinterpretq_m128i_s16(
-        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
-#else
-    // This would be much simpler if x86 would choose to zero extend OR sign
-    // extend, not both. This could probably be optimized better.
-    uint16x8_t a = vreinterpretq_u16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-
-    // Zero extend a
-    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
-    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
-
-    // Sign extend by shifting left then shifting right.
-    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
-    int16x8_t b_odd = vshrq_n_s16(b, 8);
-
-    // multiply
-    int16x8_t prod1 = vmulq_s16(a_even, b_even);
-    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
-
-    // saturated add
-    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
-#endif
-}
-
-// Vertically multiply each unsigned 8-bit integer from a with the corresponding
-// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
-// pack the saturated results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
-FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
-{
-    uint16x4_t a = vreinterpret_u16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-
-    // Zero extend a
-    int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
-    int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
-
-    // Sign extend by shifting left then shifting right.
-    int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
-    int16x4_t b_odd = vshr_n_s16(b, 8);
-
-    // multiply
-    int16x4_t prod1 = vmul_s16(a_even, b_even);
-    int16x4_t prod2 = vmul_s16(a_odd, b_odd);
-
-    // saturated add
-    return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
-// the packed 16-bit integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
-FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
-{
-    // Has issues due to saturation
-    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
-
-    // Multiply
-    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
-    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
-                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
-
-    // Rounding narrowing shift right
-    // narrow = (int16_t)((mul + 16384) >> 15);
-    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
-    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
-
-    // Join together
-    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Truncate each intermediate integer to the 18 most
-// significant bits, round by adding 1, and store bits [16:1] to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
-FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
-{
-    int32x4_t mul_extend =
-        vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
-
-    // Rounding narrowing shift right
-    return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
-}
-
-// Shuffle packed 8-bit integers in a according to shuffle control mask in the
-// corresponding 8-bit element of b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
-FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
-{
-    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
-    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
-    uint8x16_t idx_masked =
-        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
-#elif defined(__GNUC__)
-    int8x16_t ret;
-    // %e and %f represent the even and odd D registers
-    // respectively.
-    __asm__ __volatile__(
-        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
-        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
-        : [ret] "=&w"(ret)
-        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
-    return vreinterpretq_m128i_s8(ret);
-#else
-    // use this line if testing on aarch64
-    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
-    return vreinterpretq_m128i_s8(
-        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
-                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
-#endif
-}
-
-// Shuffle packed 8-bit integers in a according to shuffle control mask in the
-// corresponding 8-bit element of b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
-FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
-{
-    const int8x8_t controlMask =
-        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
-    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
-    return vreinterpret_m64_s8(res);
-}
-
-// Negate packed 16-bit integers in a when the corresponding signed
-// 16-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
-FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFF : 0
-    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
-    // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__)
-    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
-#else
-    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
-    // 'a') based on ltMask
-    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
-    // res = masked & (~zeroMask)
-    int16x8_t res = vbicq_s16(masked, zeroMask);
-    return vreinterpretq_m128i_s16(res);
-}
-
-// Negate packed 32-bit integers in a when the corresponding signed
-// 32-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
-FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
-{
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFFFFFF : 0
-    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
-
-    // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__)
-    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
-#else
-    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
-    // 'a') based on ltMask
-    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
-    // res = masked & (~zeroMask)
-    int32x4_t res = vbicq_s32(masked, zeroMask);
-    return vreinterpretq_m128i_s32(res);
-}
-
-// Negate packed 8-bit integers in a when the corresponding signed
-// 8-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
-FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
-{
-    int8x16_t a = vreinterpretq_s8_m128i(_a);
-    int8x16_t b = vreinterpretq_s8_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFF : 0
-    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
-
-    // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__)
-    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
-#else
-    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
-    // based on ltMask
-    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
-    // res = masked & (~zeroMask)
-    int8x16_t res = vbicq_s8(masked, zeroMask);
-
-    return vreinterpretq_m128i_s8(res);
-}
-
-// Negate packed 16-bit integers in a when the corresponding signed 16-bit
-// integer in b is negative, and store the results in dst. Element in dst are
-// zeroed out when the corresponding element in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
-FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFF : 0
-    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
-
-    // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__)
-    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
-#else
-    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
-    // based on ltMask
-    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
-    // res = masked & (~zeroMask)
-    int16x4_t res = vbic_s16(masked, zeroMask);
-
-    return vreinterpret_m64_s16(res);
-}
-
-// Negate packed 32-bit integers in a when the corresponding signed 32-bit
-// integer in b is negative, and store the results in dst. Element in dst are
-// zeroed out when the corresponding element in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
-FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
-{
-    int32x2_t a = vreinterpret_s32_m64(_a);
-    int32x2_t b = vreinterpret_s32_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFFFFFF : 0
-    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
-
-    // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__)
-    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
-#else
-    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
-    // based on ltMask
-    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
-    // res = masked & (~zeroMask)
-    int32x2_t res = vbic_s32(masked, zeroMask);
-
-    return vreinterpret_m64_s32(res);
-}
-
-// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
-// in b is negative, and store the results in dst. Element in dst are zeroed out
-// when the corresponding element in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
-FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
-{
-    int8x8_t a = vreinterpret_s8_m64(_a);
-    int8x8_t b = vreinterpret_s8_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFF : 0
-    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
-
-    // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__)
-    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
-#else
-    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
-    // based on ltMask
-    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
-    // res = masked & (~zeroMask)
-    int8x8_t res = vbic_s8(masked, zeroMask);
-
-    return vreinterpret_m64_s8(res);
-}
-
-/* SSE4.1 */
-
-// Blend packed 16-bit integers from a and b using control mask imm8, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
-// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
-//                                      __constrange(0,255) int imm)
-#define _mm_blend_epi16(a, b, imm)                                            \
-    __extension__({                                                           \
-        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
-        uint16x8_t _mask_vec = vld1q_u16(_mask);                              \
-        uint16x8_t _a = vreinterpretq_u16_m128i(a);                           \
-        uint16x8_t _b = vreinterpretq_u16_m128i(b);                           \
-        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));                \
-    })
-
-// Blend packed double-precision (64-bit) floating-point elements from a and b
-// using control mask imm8, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
-#define _mm_blend_pd(a, b, imm)                                \
-    __extension__({                                            \
-        const uint64_t _mask[2] = {                            \
-            ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),   \
-            ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)};  \
-        uint64x2_t _mask_vec = vld1q_u64(_mask);               \
-        uint64x2_t _a = vreinterpretq_u64_m128d(a);            \
-        uint64x2_t _b = vreinterpretq_u64_m128d(b);            \
-        vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
-    })
-
-// Blend packed single-precision (32-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
-FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
-{
-    const uint32_t ALIGN_STRUCT(16)
-        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
-    uint32x4_t mask = vld1q_u32(data);
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
-}
-
-// Blend packed 8-bit integers from a and b using mask, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
-FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
-{
-    // Use a signed shift right to create a mask with the sign bit
-    uint8x16_t mask =
-        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
-    uint8x16_t a = vreinterpretq_u8_m128i(_a);
-    uint8x16_t b = vreinterpretq_u8_m128i(_b);
-    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
-}
-
-// Blend packed double-precision (64-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
-FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
-{
-    uint64x2_t mask =
-        vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
-#if defined(__aarch64__)
-    float64x2_t a = vreinterpretq_f64_m128d(_a);
-    float64x2_t b = vreinterpretq_f64_m128d(_b);
-    return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
-#else
-    uint64x2_t a = vreinterpretq_u64_m128d(_a);
-    uint64x2_t b = vreinterpretq_u64_m128d(_b);
-    return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
-#endif
-}
-
-// Blend packed single-precision (32-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
-FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
-{
-    // Use a signed shift right to create a mask with the sign bit
-    uint32x4_t mask =
-        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
-}
-
-// Round the packed double-precision (64-bit) floating-point elements in a up
-// to an integer value, and store the results as packed double-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
-FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
-#else
-    double *f = (double *) &a;
-    return _mm_set_pd(ceil(f[1]), ceil(f[0]));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a up to
-// an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
-FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
-#else
-    float *f = (float *) &a;
-    return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
-#endif
-}
-
-// Round the lower double-precision (64-bit) floating-point element in b up to
-// an integer value, store the result as a double-precision floating-point
-// element in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
-FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_ceil_pd(b));
-}
-
-// Round the lower single-precision (32-bit) floating-point element in b up to
-// an integer value, store the result as a single-precision floating-point
-// element in the lower element of dst, and copy the upper 3 packed elements
-// from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
-FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_ceil_ps(b));
-}
-
-// Compare packed 64-bit integers in a and b for equality, and store the results
-// in dst
-FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_u64(
-        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
-#else
-    // ARMv7 lacks vceqq_u64
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
-#endif
-}
-
-// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
-FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
-{
-    return vreinterpretq_m128i_s32(
-        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
-}
-
-// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
-FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
-{
-    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
-    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
-FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
-{
-    return vreinterpretq_m128i_s64(
-        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
-}
-
-// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
-FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
-{
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
-    return vreinterpretq_m128i_s16(s16x8);
-}
-
-// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
-FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
-{
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
-    return vreinterpretq_m128i_s32(s32x4);
-}
-
-// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
-// integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
-FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
-{
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
-    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
-FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
-{
-    return vreinterpretq_m128i_u32(
-        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
-}
-
-// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
-FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
-{
-    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
-    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
-FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
-{
-    return vreinterpretq_m128i_u64(
-        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
-}
-
-// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
-FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
-{
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
-    return vreinterpretq_m128i_u16(u16x8);
-}
-
-// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
-FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
-{
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
-    return vreinterpretq_m128i_u32(u32x4);
-}
-
-// Zero extend packed unsigned 8-bit integers in the low 8 byte sof a to packed
-// 64-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
-FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
-{
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
-    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Conditionally multiply the packed double-precision (64-bit) floating-point
-// elements in a and b using the high 4 bits in imm8, sum the four products, and
-// conditionally store the sum in dst using the low 4 bits of imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
-FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
-{
-    // Generate mask value from constant immediate bit value
-    const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
-    const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
-#if !SSE2NEON_PRECISE_DP
-    const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
-    const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
-#endif
-    // Conditional multiplication
-#if !SSE2NEON_PRECISE_DP
-    __m128d mul = _mm_mul_pd(a, b);
-    const __m128d mulMask =
-        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
-    __m128d tmp = _mm_and_pd(mul, mulMask);
-#else
-#if defined(__aarch64__)
-    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
-                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
-                             : 0;
-    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
-                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
-                             : 0;
-#else
-    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
-    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
-#endif
-    __m128d tmp = _mm_set_pd(d1, d0);
-#endif
-    // Sum the products
-#if defined(__aarch64__)
-    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
-#else
-    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
-#endif
-    // Conditionally store the sum
-    const __m128d sumMask =
-        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
-    __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
-    return res;
-}
-
-// Conditionally multiply the packed single-precision (32-bit) floating-point
-// elements in a and b using the high 4 bits in imm8, sum the four products,
-// and conditionally store the sum in dst using the low 4 bits of imm.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
-FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
-{
-#if defined(__aarch64__)
-    /* shortcuts */
-    if (imm == 0xFF) {
-        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
-    }
-    if (imm == 0x7F) {
-        float32x4_t m = _mm_mul_ps(a, b);
-        m[3] = 0;
-        return _mm_set1_ps(vaddvq_f32(m));
-    }
-#endif
-
-    float s = 0, c = 0;
-    float32x4_t f32a = vreinterpretq_f32_m128(a);
-    float32x4_t f32b = vreinterpretq_f32_m128(b);
-
-    /* To improve the accuracy of floating-point summation, Kahan algorithm
-     * is used for each operation.
-     */
-    if (imm & (1 << 4))
-        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
-    if (imm & (1 << 5))
-        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
-    if (imm & (1 << 6))
-        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
-    if (imm & (1 << 7))
-        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
-    s += c;
-
-    float32x4_t res = {
-        (imm & 0x1) ? s : 0,
-        (imm & 0x2) ? s : 0,
-        (imm & 0x4) ? s : 0,
-        (imm & 0x8) ? s : 0,
-    };
-    return vreinterpretq_m128_f32(res);
-}
-
-// Extract a 32-bit integer from a, selected with imm8, and store the result in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
-// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
-#define _mm_extract_epi32(a, imm) \
-    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
-
-// Extract a 64-bit integer from a, selected with imm8, and store the result in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
-// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
-#define _mm_extract_epi64(a, imm) \
-    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
-
-// Extract an 8-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
-// __constrange(0,16) int imm)
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
-#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
-
-// Extracts the selected single-precision (32-bit) floating-point from a.
-// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
-#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
-
-// Round the packed double-precision (64-bit) floating-point elements in a down
-// to an integer value, and store the results as packed double-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
-FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
-#else
-    double *f = (double *) &a;
-    return _mm_set_pd(floor(f[1]), floor(f[0]));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a down
-// to an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
-FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
-#else
-    float *f = (float *) &a;
-    return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
-#endif
-}
-
-// Round the lower double-precision (64-bit) floating-point element in b down to
-// an integer value, store the result as a double-precision floating-point
-// element in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
-FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_floor_pd(b));
-}
-
-// Round the lower single-precision (32-bit) floating-point element in b down to
-// an integer value, store the result as a single-precision floating-point
-// element in the lower element of dst, and copy the upper 3 packed elements
-// from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
-FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_floor_ps(b));
-}
-
-// Copy a to dst, and insert the 32-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
-// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
-//                                       __constrange(0,4) int imm)
-#define _mm_insert_epi32(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s32(                                     \
-            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
-    })
-
-// Copy a to dst, and insert the 64-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
-// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
-//                                       __constrange(0,2) int imm)
-#define _mm_insert_epi64(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s64(                                     \
-            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
-    })
-
-// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
-// location specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
-// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
-//                                      __constrange(0,16) int imm)
-#define _mm_insert_epi8(a, b, imm)                                 \
-    __extension__({                                                \
-        vreinterpretq_m128i_s8(                                    \
-            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
-    })
-
-// Copy a to tmp, then insert a single-precision (32-bit) floating-point
-// element from b into tmp using the control in imm8. Store tmp to dst using
-// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
-#define _mm_insert_ps(a, b, imm8)                                              \
-    __extension__({                                                            \
-        float32x4_t tmp1 =                                                     \
-            vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3),               \
-                           vreinterpretq_f32_m128(a), 0);                      \
-        float32x4_t tmp2 =                                                     \
-            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
-                           ((imm8 >> 4) & 0x3));                               \
-        const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,        \
-                                  ((imm8) & (1 << 1)) ? UINT32_MAX : 0,        \
-                                  ((imm8) & (1 << 2)) ? UINT32_MAX : 0,        \
-                                  ((imm8) & (1 << 3)) ? UINT32_MAX : 0};       \
-        uint32x4_t mask = vld1q_u32(data);                                     \
-        float32x4_t all_zeros = vdupq_n_f32(0);                                \
-                                                                               \
-        vreinterpretq_m128_f32(                                                \
-            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2)));         \
-    })
-
-// Compare packed signed 32-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
-FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
-FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
-FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
-FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
-}
-
-// Compare packed signed 32-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
-FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
-FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
-FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
-FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
-}
-
-// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
-// in a, store the minimum and index in dst, and zero the remaining bits in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
-FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
-{
-    __m128i dst;
-    uint16_t min, idx = 0;
-#if defined(__aarch64__)
-    // Find the minimum value
-    min = vminvq_u16(vreinterpretq_u16_m128i(a));
-
-    // Get the index of the minimum value
-    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
-    uint16x8_t minv = vdupq_n_u16(min);
-    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
-    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
-#else
-    // Find the minimum value
-    __m64 tmp;
-    tmp = vreinterpret_m64_u16(
-        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
-                 vget_high_u16(vreinterpretq_u16_m128i(a))));
-    tmp = vreinterpret_m64_u16(
-        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
-    tmp = vreinterpret_m64_u16(
-        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
-    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
-    // Get the index of the minimum value
-    int i;
-    for (i = 0; i < 8; i++) {
-        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
-            idx = (uint16_t) i;
-            break;
-        }
-        a = _mm_srli_si128(a, 2);
-    }
-#endif
-    // Generate result
-    dst = _mm_setzero_si128();
-    dst = vreinterpretq_m128i_u16(
-        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
-    dst = vreinterpretq_m128i_u16(
-        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
-    return dst;
-}
-
-// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
-// 8-bit integers in a compared to those in b, and store the 16-bit results in
-// dst. Eight SADs are performed using one quadruplet from b and eight
-// quadruplets from a. One quadruplet is selected from b starting at on the
-// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
-// integers selected from a starting at the offset specified in imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
-FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
-{
-    uint8x16_t _a, _b;
-
-    switch (imm & 0x4) {
-    case 0:
-        // do nothing
-        _a = vreinterpretq_u8_m128i(a);
-        break;
-    case 4:
-        _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
-                                            vreinterpretq_u32_m128i(a), 1));
-        break;
-    default:
-#if defined(__GNUC__) || defined(__clang__)
-        __builtin_unreachable();
-#endif
-        break;
-    }
-
-    switch (imm & 0x3) {
-    case 0:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
-        break;
-    case 1:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
-        break;
-    case 2:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
-        break;
-    case 3:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
-        break;
-    default:
-#if defined(__GNUC__) || defined(__clang__)
-        __builtin_unreachable();
-#endif
-        break;
-    }
-
-    int16x8_t c04, c15, c26, c37;
-    uint8x8_t low_b = vget_low_u8(_b);
-    c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
-    uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
-    c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
-    uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
-    c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
-    uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
-    c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
-#if defined(__aarch64__)
-    // |0|4|2|6|
-    c04 = vpaddq_s16(c04, c26);
-    // |1|5|3|7|
-    c15 = vpaddq_s16(c15, c37);
-
-    int32x4_t trn1_c =
-        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
-    int32x4_t trn2_c =
-        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
-    return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
-                                              vreinterpretq_s16_s32(trn2_c)));
-#else
-    int16x4_t c01, c23, c45, c67;
-    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
-    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
-    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
-    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
-
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
-#endif
-}
-
-// Multiply the low signed 32-bit integers from each packed 64-bit element in
-// a and b, and store the signed 64-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
-FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
-{
-    // vmull_s32 upcasts instead of masking, so we downcast.
-    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
-    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
-}
-
-// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
-// integers, and store the low 32 bits of the intermediate integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
-FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-// using unsigned saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
-FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
-                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
-}
-
-// Round the packed double-precision (64-bit) floating-point elements in a using
-// the rounding parameter, and store the results as packed double-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
-FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
-{
-#if defined(__aarch64__)
-    switch (rounding) {
-    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
-    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
-        return _mm_floor_pd(a);
-    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
-        return _mm_ceil_pd(a);
-    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
-    default:  //_MM_FROUND_CUR_DIRECTION
-        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
-    }
-#else
-    double *v_double = (double *) &a;
-
-    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
-        (rounding == _MM_FROUND_CUR_DIRECTION &&
-         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
-        double res[2], tmp;
-        for (int i = 0; i < 2; i++) {
-            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
-            double roundDown = floor(tmp);  // Round down value
-            double roundUp = ceil(tmp);     // Round up value
-            double diffDown = tmp - roundDown;
-            double diffUp = roundUp - tmp;
-            if (diffDown < diffUp) {
-                /* If it's closer to the round down value, then use it */
-                res[i] = roundDown;
-            } else if (diffDown > diffUp) {
-                /* If it's closer to the round up value, then use it */
-                res[i] = roundUp;
-            } else {
-                /* If it's equidistant between round up and round down value,
-                 * pick the one which is an even number */
-                double half = roundDown / 2;
-                if (half != floor(half)) {
-                    /* If the round down value is odd, return the round up value
-                     */
-                    res[i] = roundUp;
-                } else {
-                    /* If the round up value is odd, return the round down value
-                     */
-                    res[i] = roundDown;
-                }
-            }
-            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
-        }
-        return _mm_set_pd(res[1], res[0]);
-    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
-        return _mm_floor_pd(a);
-    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
-        return _mm_ceil_pd(a);
-    }
-    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
-                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a using
-// the rounding parameter, and store the results as packed single-precision
-// floating-point elements in dst.
-// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
-FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    switch (rounding) {
-    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
-    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
-        return _mm_floor_ps(a);
-    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
-        return _mm_ceil_ps(a);
-    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
-    default:  //_MM_FROUND_CUR_DIRECTION
-        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
-    }
-#else
-    float *v_float = (float *) &a;
-
-    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
-        (rounding == _MM_FROUND_CUR_DIRECTION &&
-         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
-        uint32x4_t signmask = vdupq_n_u32(0x80000000);
-        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
-                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
-        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
-            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
-        int32x4_t r_trunc = vcvtq_s32_f32(
-            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
-        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
-            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
-        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
-                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
-        float32x4_t delta = vsubq_f32(
-            vreinterpretq_f32_m128(a),
-            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
-        uint32x4_t is_delta_half =
-            vceqq_f32(delta, half); /* delta == +/- 0.5 */
-        return vreinterpretq_m128_f32(
-            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
-    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
-        return _mm_floor_ps(a);
-    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
-        return _mm_ceil_ps(a);
-    }
-    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
-                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
-                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
-                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
-#endif
-}
-
-// Round the lower double-precision (64-bit) floating-point element in b using
-// the rounding parameter, store the result as a double-precision floating-point
-// element in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
-FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
-{
-    return _mm_move_sd(a, _mm_round_pd(b, rounding));
-}
-
-// Round the lower single-precision (32-bit) floating-point element in b using
-// the rounding parameter, store the result as a single-precision floating-point
-// element in the lower element of dst, and copy the upper 3 packed elements
-// from a to the upper elements of dst. Rounding is done according to the
-// rounding[3:0] parameter, which can be one of:
-//     (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
-//     suppress exceptions
-//     (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and
-//     suppress exceptions
-//     (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress
-//     exceptions
-//     (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress
-//     exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
-//     _MM_SET_ROUNDING_MODE
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
-FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
-{
-    return _mm_move_ss(a, _mm_round_ps(b, rounding));
-}
-
-// Load 128-bits of integer data from memory into dst using a non-temporal
-// memory hint. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
-FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    return __builtin_nontemporal_load(p);
-#else
-    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
-#endif
-}
-
-// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
-// all 1's, and return 1 if the result is zero, otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
-FORCE_INLINE int _mm_test_all_ones(__m128i a)
-{
-    return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
-           ~(uint64_t) 0;
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and
-// mask, and return 1 if the result is zero, otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
-FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
-{
-    int64x2_t a_and_mask =
-        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
-    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and
-// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
-// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
-// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
-// otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
-FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
-{
-    uint64x2_t zf =
-        vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
-    uint64x2_t cf =
-        vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
-    uint64x2_t result = vandq_u64(zf, cf);
-    return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return the CF value.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
-FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
-{
-    int64x2_t s64 =
-        vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
-// otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
-#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return the ZF value.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
-FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
-{
-    int64x2_t s64 =
-        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
-}
-
-/* SSE4.2 */
-
-const static uint16_t _sse2neon_cmpestr_mask16b[8] ALIGN_STRUCT(16) = {
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-};
-const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = {
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-};
-
-/* specify the source data format */
-#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
-#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
-#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
-#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
-
-/* specify the comparison operation */
-#define _SIDD_CMP_EQUAL_ANY 0x00     /* compare equal any: strchr */
-#define _SIDD_CMP_RANGES 0x04        /* compare ranges */
-#define _SIDD_CMP_EQUAL_EACH 0x08    /* compare equal each: strcmp */
-#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
-
-/* specify the polarity */
-#define _SIDD_POSITIVE_POLARITY 0x00
-#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
-#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
-#define _SIDD_MASKED_NEGATIVE_POLARITY \
-    0x30 /* negate results only before end of string */
-
-/* specify the output selection in _mm_cmpXstri */
-#define _SIDD_LEAST_SIGNIFICANT 0x00
-#define _SIDD_MOST_SIGNIFICANT 0x40
-
-/* specify the output selection in _mm_cmpXstrm */
-#define _SIDD_BIT_MASK 0x00
-#define _SIDD_UNIT_MASK 0x40
-
-/* Pattern Matching for C macros.
- * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
- */
-
-/* catenate */
-#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
-#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
-
-#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
-/* run the 2nd parameter */
-#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
-/* run the 1st parameter */
-#define SSE2NEON_IIF_1(t, ...) t
-
-#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
-#define SSE2NEON_COMPL_0 1
-#define SSE2NEON_COMPL_1 0
-
-#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
-#define SSE2NEON_DEC_1 0
-#define SSE2NEON_DEC_2 1
-#define SSE2NEON_DEC_3 2
-#define SSE2NEON_DEC_4 3
-#define SSE2NEON_DEC_5 4
-#define SSE2NEON_DEC_6 5
-#define SSE2NEON_DEC_7 6
-#define SSE2NEON_DEC_8 7
-#define SSE2NEON_DEC_9 8
-#define SSE2NEON_DEC_10 9
-#define SSE2NEON_DEC_11 10
-#define SSE2NEON_DEC_12 11
-#define SSE2NEON_DEC_13 12
-#define SSE2NEON_DEC_14 13
-#define SSE2NEON_DEC_15 14
-#define SSE2NEON_DEC_16 15
-
-/* detection */
-#define SSE2NEON_CHECK_N(x, n, ...) n
-#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
-#define SSE2NEON_PROBE(x) x, 1,
-
-#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
-#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
-
-#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
-#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
-
-#define SSE2NEON_EAT(...)
-#define SSE2NEON_EXPAND(...) __VA_ARGS__
-#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
-
-/* recursion */
-/* deferred expression */
-#define SSE2NEON_EMPTY()
-#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
-#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
-#define SSE2NEON_EXPAND(...) __VA_ARGS__
-
-#define SSE2NEON_EVAL(...) \
-    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
-#define SSE2NEON_EVAL1(...) \
-    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
-#define SSE2NEON_EVAL2(...) \
-    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
-#define SSE2NEON_EVAL3(...) __VA_ARGS__
-
-#define SSE2NEON_REPEAT(count, macro, ...)                         \
-    SSE2NEON_WHEN(count)                                           \
-    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()(                \
-        SSE2NEON_DEC(count), macro,                                \
-        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
-                                              __VA_ARGS__))
-#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
-
-#define SSE2NEON_SIZE_OF_byte 8
-#define SSE2NEON_NUMBER_OF_LANES_byte 16
-#define SSE2NEON_SIZE_OF_word 16
-#define SSE2NEON_NUMBER_OF_LANES_word 8
-
-#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type)                         \
-    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type(                          \
-        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
-        vreinterpretq_##type##_m128i(a)));
-
-#define SSE2NEON_FILL_LANE(i, type) \
-    vec_b[i] =                      \
-        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
-
-#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size,        \
-                       number_of_lanes, byte_or_word)                         \
-    do {                                                                      \
-        SSE2NEON_CAT(                                                         \
-            data_type_prefix,                                                 \
-            SSE2NEON_CAT(size,                                                \
-                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
-        vec_b[number_of_lanes];                                               \
-        __m128i mask = SSE2NEON_IIF(byte_or_word)(                            \
-            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)),                       \
-            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff)));                    \
-        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE,    \
-                                      SSE2NEON_CAT(type_prefix, size)))       \
-        for (int i = 0; i < number_of_lanes; i++) {                           \
-            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u,                      \
-                                  size)(SSE2NEON_CAT(vbslq_u, size)(          \
-                SSE2NEON_CAT(vreinterpretq_u,                                 \
-                             SSE2NEON_CAT(size, _m128i))(mask),               \
-                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))(        \
-                    vec_b[i],                                                 \
-                    SSE2NEON_CAT(                                             \
-                        vreinterpretq_,                                       \
-                        SSE2NEON_CAT(type_prefix,                             \
-                                     SSE2NEON_CAT(size, _m128i(a))))),        \
-                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))(        \
-                    vec_b[i],                                                 \
-                    SSE2NEON_CAT(                                             \
-                        vreinterpretq_,                                       \
-                        SSE2NEON_CAT(type_prefix,                             \
-                                     SSE2NEON_CAT(size, _m128i(a)))))));      \
-        }                                                                     \
-    } while (0)
-
-#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes)                         \
-    do {                                                                     \
-        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes,                       \
-                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
-                                      SSE2NEON_CAT(u, size)))                \
-    } while (0)
-
-#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
-    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
-                                                int lb)                       \
-    {                                                                         \
-        __m128i mtx[16];                                                      \
-        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
-                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
-        return SSE2NEON_CAT(                                                  \
-            _sse2neon_aggregate_equal_any_,                                   \
-            SSE2NEON_CAT(                                                     \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
-                                             type))))(la, lb, mtx);           \
-    }
-
-#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
-    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
-                                                 int lb)                       \
-    {                                                                          \
-        __m128i mtx[16];                                                       \
-        PCMPSTR_RANGES(                                                        \
-            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
-            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
-        return SSE2NEON_CAT(                                                   \
-            _sse2neon_aggregate_ranges_,                                       \
-            SSE2NEON_CAT(                                                      \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
-                                             type))))(la, lb, mtx);            \
-    }
-
-#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                  \
-    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,         \
-                                                    __m128i b, int lb)         \
-    {                                                                          \
-        __m128i mtx[16];                                                       \
-        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),           \
-                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));             \
-        return SSE2NEON_CAT(                                                   \
-            _sse2neon_aggregate_equal_ordered_,                                \
-            SSE2NEON_CAT(                                                      \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
-                SSE2NEON_CAT(x,                                                \
-                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
-            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);       \
-    }
-
-static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
-    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u8(
-            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u8(
-            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
-        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
-        res |= (tmp << j);
-    }
-    return res;
-}
-
-static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint16x8_t vec =
-        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u16(
-            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u16(
-            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
-        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
-        res |= (tmp << j);
-    }
-    return res;
-}
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
-    prefix##IMPL(byte) \
-    prefix##IMPL(word)
-/* clang-format on */
-
-SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
-
-static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint16x8_t vec =
-        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u16(
-            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u16(
-            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
-        __m128i tmp = vreinterpretq_m128i_u32(
-            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
-        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
-                                       vreinterpretq_u32_m128i(tmp));
-#if defined(__aarch64__)
-        int t = vaddvq_u32(vec_res) ? 1 : 0;
-#else
-        uint64x2_t sumh = vpaddlq_u32(vec_res);
-        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
-#endif
-        res |= (t << j);
-    }
-    return res;
-}
-
-static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
-    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u8(
-            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u8(
-            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
-        __m128i tmp = vreinterpretq_m128i_u16(
-            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
-        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
-                                       vreinterpretq_u16_m128i(tmp));
-        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
-        res |= (t << j);
-    }
-    return res;
-}
-
-#define SSE2NEON_CMP_RANGES_IS_BYTE 1
-#define SSE2NEON_CMP_RANGES_IS_WORD 0
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_CMP_RANGES(prefix)             \
-    prefix##IMPL(byte, uint, u, prefix##IS_BYTE)         \
-    prefix##IMPL(byte, int, s, prefix##IS_BYTE)          \
-    prefix##IMPL(word, uint, u, prefix##IS_WORD)         \
-    prefix##IMPL(word, int, s, prefix##IS_WORD)
-/* clang-format on */
-
-SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
-
-#undef SSE2NEON_CMP_RANGES_IS_BYTE
-#undef SSE2NEON_CMP_RANGES_IS_WORD
-
-static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
-{
-    uint8x16_t mtx =
-        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x10000 - (1 << la);
-    int tb = 0x10000 - (1 << lb);
-    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
-    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
-    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
-    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
-    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
-    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
-    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
-    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
-
-    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
-    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
-    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
-    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
-    res_lo = vand_u8(res_lo, vec_mask);
-    res_hi = vand_u8(res_hi, vec_mask);
-
-    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
-    return res;
-}
-
-static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
-{
-    uint16x8_t mtx =
-        vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x100 - (1 << la);
-    int tb = 0x100 - (1 << lb);
-    uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
-    uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
-    uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
-    uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
-    mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
-    mtx = vbslq_u16(vec1, tmp, mtx);
-    mtx = vandq_u16(mtx, vec_mask);
-    return _sse2neon_vaddvq_u16(mtx);
-}
-
-#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
-#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
-
-#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
-    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
-        int bound, int la, int lb, __m128i mtx[16])                            \
-    {                                                                          \
-        int res = 0;                                                           \
-        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
-        uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
-            vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
-            vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
-        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
-            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
-                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
-            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
-        uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
-        uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
-        for (int j = 0; j < lb; j++) {                                         \
-            mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size(                \
-                vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j])));   \
-        }                                                                      \
-        for (int j = lb; j < bound; j++) {                                     \
-            mtx[j] = vreinterpretq_m128i_u##size(                              \
-                vbslq_u##size(vec1, vec_minusone, vec_zero));                  \
-        }                                                                      \
-        unsigned SSE2NEON_IIF(data_type)(char, short) *ptr =                   \
-            (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx;             \
-        for (int i = 0; i < bound; i++) {                                      \
-            int val = 1;                                                       \
-            for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
-                val &= ptr[k * bound + j];                                     \
-            res += val << i;                                                   \
-        }                                                                      \
-        return res;                                                            \
-    }
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
-    prefix##IMPL(8, 16, prefix##IS_UBYTE)               \
-    prefix##IMPL(16, 8, prefix##IS_UWORD)
-/* clang-format on */
-
-SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
-
-#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
-#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
-    prefix##IMPL(byte)                              \
-    prefix##IMPL(word)
-/* clang-format on */
-
-SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
-
-#define SSE2NEON_CMPESTR_LIST                          \
-    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
-    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any)         \
-    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
-    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any)         \
-    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges)              \
-    _(CMP_UWORD_RANGES, cmp_uword_ranges)              \
-    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges)              \
-    _(CMP_SWORD_RANGES, cmp_sword_ranges)              \
-    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
-    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each)       \
-    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
-    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each)       \
-    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
-    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
-    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
-    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
-
-enum {
-#define _(name, func_suffix) name,
-    SSE2NEON_CMPESTR_LIST
-#undef _
-};
-typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
-static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
-#define _(name, func_suffix) _sse2neon_##func_suffix,
-    SSE2NEON_CMPESTR_LIST
-#undef _
-};
-
-FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
-{
-    switch (imm8 & 0x30) {
-    case _SIDD_NEGATIVE_POLARITY:
-        res ^= 0xffffffff;
-        break;
-    case _SIDD_MASKED_NEGATIVE_POLARITY:
-        res ^= (1 << lb) - 1;
-        break;
-    default:
-        break;
-    }
-
-    return res & ((bound == 8) ? 0xFF : 0xFFFF);
-}
-
-FORCE_INLINE int _sse2neon_clz(unsigned int x)
-{
-#if _MSC_VER
-    DWORD cnt = 0;
-    if (_BitScanForward(&cnt, x))
-        return cnt;
-    return 32;
-#else
-    return x != 0 ? __builtin_clz(x) : 32;
-#endif
-}
-
-FORCE_INLINE int _sse2neon_ctz(unsigned int x)
-{
-#if _MSC_VER
-    DWORD cnt = 0;
-    if (_BitScanReverse(&cnt, x))
-        return 31 - cnt;
-    return 32;
-#else
-    return x != 0 ? __builtin_ctz(x) : 32;
-#endif
-}
-
-FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
-{
-#if _MSC_VER
-    unsigned long cnt;
-#ifdef defined(SSE2NEON_HAS_BITSCAN64)
-    (defined(_M_AMD64) || defined(__x86_64__))
-        if((_BitScanForward64(&cnt, x))
-            return (int)(cnt);
-#else
-    if (_BitScanForward(&cnt, (unsigned long) (x)))
-        return (int) cnt;
-    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
-        return (int) (cnt + 32);
-#endif
-    return 64;
-#else
-    return x != 0 ? __builtin_ctzll(x) : 64;
-#endif
-}
-
-#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
-
-#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
-    const int var = (imm & 0x01) ? 8 : 16
-
-#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
-    int tmp1 = la ^ (la >> 31);                  \
-    la = tmp1 - (la >> 31);                      \
-    int tmp2 = lb ^ (lb >> 31);                  \
-    lb = tmp2 - (lb >> 31);                      \
-    la = SSE2NEON_MIN(la, bound);                \
-    lb = SSE2NEON_MIN(lb, bound)
-
-// Compare all pairs of character in string a and b,
-// then aggregate the result.
-// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
-// length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
-// string a and b.
-#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
-    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
-    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
-    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
-
-#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
-    return (r2 == 0) ? bound                                     \
-                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
-                                      : _sse2neon_ctz(r2))
-
-#define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
-    __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
-    if (imm8 & 0x40) {                                                         \
-        if (bound == 8) {                                                      \
-            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
-                                       vld1q_u16(_sse2neon_cmpestr_mask16b));  \
-            dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
-                tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
-        } else {                                                               \
-            uint8x16_t vec_r2 =                                                \
-                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
-            uint8x16_t tmp =                                                   \
-                vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
-            dst = vreinterpretq_m128i_u8(                                      \
-                vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst)));   \
-        }                                                                      \
-    } else {                                                                   \
-        if (bound == 16) {                                                     \
-            dst = vreinterpretq_m128i_u16(                                     \
-                vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
-        } else {                                                               \
-            dst = vreinterpretq_m128i_u8(                                      \
-                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
-        }                                                                      \
-    }                                                                          \
-    return dst
-
-// Compare packed strings in a and b with lengths la and lb using the control
-// in imm8, and returns 1 if b did not contain a null character and the
-// resulting mask was zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
-FORCE_INLINE int _mm_cmpestra(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    int lb_cpy = lb;
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    return !r2 & (lb_cpy > bound);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
-FORCE_INLINE int _mm_cmpestrc(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    return r2 != 0;
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control
-// in imm8, and store the generated index in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
-FORCE_INLINE int _mm_cmpestri(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control
-// in imm8, and store the generated mask in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
-FORCE_INLINE __m128i
-_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns bit 0 of the resulting bit mask.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
-FORCE_INLINE int _mm_cmpestro(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    return r2 & 1;
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns 1 if any character in a was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
-FORCE_INLINE int _mm_cmpestrs(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    return la <= (bound - 1);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns 1 if any character in b was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
-FORCE_INLINE int _mm_cmpestrz(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    return lb <= (bound - 1);
-}
-
-#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
-    do {                                                                 \
-        if (imm8 & 0x01) {                                               \
-            uint16x8_t equal_mask_##str =                                \
-                vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
-            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
-            uint64_t matches_##str =                                     \
-                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
-            len = _sse2neon_ctzll(matches_##str) >> 3;                   \
-        } else {                                                         \
-            uint16x8_t equal_mask_##str = vreinterpretq_u16_u8(          \
-                vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0)));   \
-            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
-            uint64_t matches_##str =                                     \
-                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
-            len = _sse2neon_ctzll(matches_##str) >> 2;                   \
-        }                                                                \
-    } while (0)
-
-#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
-    int la, lb;                                  \
-    do {                                         \
-        SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);   \
-        SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);   \
-    } while (0)
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if b did not contain a null character and the resulting
-// mask was zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
-FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    return !r2 & (lb >= bound);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
-FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    return r2 != 0;
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and store the generated index in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
-FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and store the generated mask in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
-FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns bit 0 of the resulting bit mask.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
-FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    return r2 & 1;
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if any character in a was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
-FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    int la;
-    SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
-    return la <= (bound - 1);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if any character in b was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
-FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    int lb;
-    SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
-    return lb <= (bound - 1);
-}
-
-// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
-// in b for greater than.
-FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_u64(
-        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
-    return vreinterpretq_m128i_s64(vshrq_n_s64(
-        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
-        63));
-#endif
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 16-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
-FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
-    crc = __crc32ch(crc, v);
-#else
-    crc = _mm_crc32_u8(crc, v & 0xff);
-    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
-#endif
-    return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 32-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
-FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
-    crc = __crc32cw(crc, v);
-#else
-    crc = _mm_crc32_u16(crc, v & 0xffff);
-    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
-#endif
-    return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 64-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
-FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#else
-    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
-    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
-#endif
-    return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 8-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
-FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
-    crc = __crc32cb(crc, v);
-#else
-    crc ^= v;
-    for (int bit = 0; bit < 8; bit++) {
-        if (crc & 1)
-            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
-        else
-            crc = (crc >> 1);
-    }
-#endif
-    return crc;
-}
-
-/* AES */
-
-#if !defined(__ARM_FEATURE_CRYPTO)
-/* clang-format off */
-#define SSE2NEON_AES_SBOX(w)                                           \
-    {                                                                  \
-        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
-        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
-        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
-        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
-        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
-        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
-        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
-        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
-        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
-        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
-        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
-        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
-        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
-        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
-        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
-        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
-        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
-        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
-        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
-        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
-        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
-        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
-        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
-        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
-        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
-        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
-        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
-        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
-        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
-        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
-        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
-        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
-        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
-        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
-        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
-        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
-        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
-    }
-#define SSE2NEON_AES_RSBOX(w)                                          \
-    {                                                                  \
-        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
-        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
-        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
-        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
-        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
-        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
-        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
-        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
-        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
-        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
-        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
-        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
-        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
-        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
-        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
-        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
-        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
-        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
-        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
-        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
-        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
-        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
-        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
-        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
-        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
-        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
-        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
-        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
-        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
-        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
-        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
-        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
-        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
-        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
-        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
-        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
-        w(0x55), w(0x21), w(0x0c), w(0x7d)                             \
-    }
-/* clang-format on */
-
-/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
-#define SSE2NEON_AES_H0(x) (x)
-static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
-static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
-#undef SSE2NEON_AES_H0
-
-/* x_time function and matrix multiply function */
-#if !defined(__aarch64__)
-#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
-#define SSE2NEON_MULTIPLY(x, y)                                  \
-    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
-     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
-     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
-     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
-#endif
-
-// In the absence of crypto extensions, implement aesenc using regular NEON
-// intrinsics instead. See:
-// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
-// for more information.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t shift_rows[] = {
-        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
-        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
-    };
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    /* shift rows */
-    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
-    /* sub bytes */
-    // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
-    // look up each of the table. After each lookup, we load the next table
-    // which locates at the next 64-bytes. In the meantime, the index in the
-    // table would be smaller than it was, so the index parameters of
-    // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-    // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
-
-    /* mix columns */
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-
-    /* add round key */
-    return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
-#else /* ARMv7-A implementation for a table-based AES */
-#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
-    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
-     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
-// muliplying 'x' by 2 in GF(2^8)
-#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
-// muliplying 'x' by 3 in GF(2^8)
-#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
-#define SSE2NEON_AES_U0(p) \
-    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
-#define SSE2NEON_AES_U1(p) \
-    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
-#define SSE2NEON_AES_U2(p) \
-    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
-#define SSE2NEON_AES_U3(p) \
-    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
-
-    // this generates a table containing every possible permutation of
-    // shift_rows() and sub_bytes() with mix_columns().
-    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
-    };
-#undef SSE2NEON_AES_B2W
-#undef SSE2NEON_AES_F2
-#undef SSE2NEON_AES_F3
-#undef SSE2NEON_AES_U0
-#undef SSE2NEON_AES_U1
-#undef SSE2NEON_AES_U2
-#undef SSE2NEON_AES_U3
-
-    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
-    uint32_t x1 =
-        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
-    uint32_t x2 =
-        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
-    uint32_t x3 =
-        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]
-
-    // finish the modulo addition step in mix_columns()
-    __m128i out = _mm_set_epi32(
-        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
-         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
-        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
-         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
-        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
-         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
-        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
-         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
-
-    return _mm_xor_si128(out, RoundKey);
-#endif
-}
-
-// Perform one round of an AES decryption flow on data (state) in a using the
-// round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
-FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t inv_shift_rows[] = {
-        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-    };
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    // inverse shift rows
-    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
-
-    // inverse sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-
-    // inverse mix columns
-    // muliplying 'v' by 4 in GF(2^8)
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
-    v ^= w;
-    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
-                                 0x1b);  // muliplying 'v' by 2 in GF(2^8)
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-
-    // add round key
-    return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
-#else /* ARMv7-A NEON implementation */
-    /* FIXME: optimized for NEON */
-    uint8_t i, e, f, g, h, v[4][4];
-    uint8_t *_a = (uint8_t *) &a;
-    for (i = 0; i < 16; ++i) {
-        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
-    }
-
-    // inverse mix columns
-    for (i = 0; i < 4; ++i) {
-        e = v[i][0];
-        f = v[i][1];
-        g = v[i][2];
-        h = v[i][3];
-
-        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
-    }
-
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
-#endif
-}
-
-// Perform the last round of an AES encryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
-FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t shift_rows[] = {
-        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
-        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    // shift rows
-    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
-    // sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
-
-    // add round key
-    return vreinterpretq_m128i_u8(v) ^ RoundKey;
-
-#else /* ARMv7-A implementation */
-    uint8_t v[16] = {
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
-    };
-
-    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
-#endif
-}
-
-// Perform the last round of an AES decryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t inv_shift_rows[] = {
-        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    // inverse shift rows
-    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
-
-    // inverse sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-
-    // add round key
-    return vreinterpretq_m128i_u8(v) ^ RoundKey;
-
-#else /* ARMv7-A NEON implementation */
-    /* FIXME: optimized for NEON */
-    uint8_t v[4][4];
-    uint8_t *_a = (uint8_t *) &a;
-    for (int i = 0; i < 16; ++i) {
-        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
-    }
-
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
-#endif
-}
-
-// Perform the InvMixColumns transformation on a and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-{
-#if defined(__aarch64__)
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-    uint8x16_t v = vreinterpretq_u8_m128i(a);
-    uint8x16_t w;
-
-    // multiplying 'v' by 4 in GF(2^8)
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
-    v ^= w;
-    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-
-    // multiplying 'v' by 2 in GF(2^8)
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-    return vreinterpretq_m128i_u8(w);
-
-#else /* ARMv7-A NEON implementation */
-    uint8_t i, e, f, g, h, v[4][4];
-    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
-    for (i = 0; i < 4; ++i) {
-        e = v[i][0];
-        f = v[i][1];
-        g = v[i][2];
-        h = v[i][3];
-
-        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
-    }
-
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
-#endif
-}
-
-// Assist in expanding the AES cipher key by computing steps towards generating
-// a round key for encryption cipher using data from a and an 8-bit round
-// constant specified in imm8, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
-//
-// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
-// This instruction generates a round key for AES encryption. See
-// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
-// for details.
-FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
-{
-#if defined(__aarch64__)
-    uint8x16_t _a = vreinterpretq_u8_m128i(a);
-    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
-
-    uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
-    uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
-    uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
-
-    return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
-
-#else /* ARMv7-A NEON implementation */
-    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
-    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
-    for (int i = 0; i < 4; ++i) {
-        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
-        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
-    }
-    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
-                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
-#endif
-}
-#undef SSE2NEON_AES_SBOX
-#undef SSE2NEON_AES_RSBOX
-
-#if defined(__aarch64__)
-#undef SSE2NEON_XT
-#undef SSE2NEON_MULTIPLY
-#endif
-
-#else /* __ARM_FEATURE_CRYPTO */
-// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
-// AESMC and then manually applying the real key as an xor operation. This
-// unfortunately means an additional xor op; the compiler should be able to
-// optimize this away for repeated calls however. See
-// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
-// for more details.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
-        vreinterpretq_u8_m128i(b));
-}
-
-// Perform one round of an AES decryption flow on data (state) in a using the
-// round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
-FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
-{
-    return vreinterpretq_m128i_u8(veorq_u8(
-        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
-        vreinterpretq_u8_m128i(RoundKey)));
-}
-
-// Perform the last round of an AES encryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
-FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-{
-    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
-                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
-                         RoundKey);
-}
-
-// Perform the last round of an AES decryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-{
-    return vreinterpretq_m128i_u8(
-        vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^
-        vreinterpretq_u8_m128i(RoundKey));
-}
-
-// Perform the InvMixColumns transformation on a and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-{
-    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
-}
-
-// Assist in expanding the AES cipher key by computing steps towards generating
-// a round key for encryption cipher using data from a and an 8-bit round
-// constant specified in imm8, and store the result in dst."
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
-FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
-{
-    // AESE does ShiftRows and SubBytes on A
-    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
-
-    uint8x16_t dest = {
-        // Undo ShiftRows step from AESE and extract X1 and X3
-        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
-        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
-        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
-        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
-    };
-    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
-    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
-}
-#endif
-
-/* Others */
-
-// Perform a carry-less multiplication of two 64-bit integers, selected from a
-// and b according to imm8, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
-FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
-{
-    uint64x2_t a = vreinterpretq_u64_m128i(_a);
-    uint64x2_t b = vreinterpretq_u64_m128i(_b);
-    switch (imm & 0x11) {
-    case 0x00:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
-    case 0x01:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
-    case 0x10:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
-    case 0x11:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
-    default:
-        abort();
-    }
-}
-
-FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
-}
-
-// Count the number of bits set to 1 in unsigned 32-bit integer a, and
-// return that count in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
-FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcount)
-    return __builtin_popcount(a);
-#else
-    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
-#endif
-#else
-    uint32_t count = 0;
-    uint8x8_t input_val, count8x8_val;
-    uint16x4_t count16x4_val;
-    uint32x2_t count32x2_val;
-
-    input_val = vld1_u8((uint8_t *) &a);
-    count8x8_val = vcnt_u8(input_val);
-    count16x4_val = vpaddl_u8(count8x8_val);
-    count32x2_val = vpaddl_u16(count16x4_val);
-
-    vst1_u32(&count, count32x2_val);
-    return count;
-#endif
-}
-
-// Count the number of bits set to 1 in unsigned 64-bit integer a, and
-// return that count in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
-FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcountll)
-    return __builtin_popcountll(a);
-#else
-    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
-#endif
-#else
-    uint64_t count = 0;
-    uint8x8_t input_val, count8x8_val;
-    uint16x4_t count16x4_val;
-    uint32x2_t count32x2_val;
-    uint64x1_t count64x1_val;
-
-    input_val = vld1_u8((uint8_t *) &a);
-    count8x8_val = vcnt_u8(input_val);
-    count16x4_val = vpaddl_u8(count8x8_val);
-    count32x2_val = vpaddl_u16(count16x4_val);
-    count64x1_val = vpaddl_u32(count32x2_val);
-    vst1_u64(&count, count64x1_val);
-    return count;
-#endif
-}
-
-FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
-{
-    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
-    // regardless of the value of the FZ bit.
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
-}
-
-// Return the current 64-bit value of the processor's time-stamp counter.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
-FORCE_INLINE uint64_t _rdtsc(void)
-{
-#if defined(__aarch64__)
-    uint64_t val;
-
-    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
-     * system counter is at least 56 bits wide; from Armv8.6, the counter
-     * must be 64 bits wide.  So the system counter could be less than 64
-     * bits wide and it is attributed with the flag 'cap_user_time_short'
-     * is true.
-     */
-    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
-
-    return val;
-#else
-    uint32_t pmccntr, pmuseren, pmcntenset;
-    // Read the user mode Performance Monitoring Unit (PMU)
-    // User Enable Register (PMUSERENR) access permissions.
-    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
-    if (pmuseren & 1) {  // Allows reading PMUSERENR for user mode code.
-        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
-        if (pmcntenset & 0x80000000UL) {  // Is it counting?
-            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
-            // The counter is set up to count every 64th cycle
-            return (uint64_t) (pmccntr) << 6;
-        }
-    }
-
-    // Fallback to syscall as we can't enable PMUSERENR in user mode.
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
-    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
-#endif
-}
-
-#if defined(__GNUC__) || defined(__clang__)
-#pragma pop_macro("ALIGN_STRUCT")
-#pragma pop_macro("FORCE_INLINE")
-#endif
-
-#if defined(__GNUC__) && !defined(__clang__)
-#pragma GCC pop_options
-#endif
-
-#endif
\ No newline at end of file
diff --git a/share/cmake/modules/Findsse2neon.cmake b/share/cmake/modules/Findsse2neon.cmake
new file mode 100644
index 0000000000..7dea6740cb
--- /dev/null
+++ b/share/cmake/modules/Findsse2neon.cmake
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Locate sse2neon (header-only version)
+#
+# Variables defined by this module:
+#   sse2neon_FOUND          - Indicate whether the library was found or not
+#   sse2neon_INCLUDE_DIR    - Location of the header files
+#
+# Global targets defined by this module:
+#   sse2neon
+###############################################################################
+### Try to find package ###
+
+if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
+    # Find include directory
+    find_path(sse2neon_INCLUDE_DIR
+        NAMES
+            sse2neon.h
+        HINTS
+            ${sse2neon_ROOT}
+        PATH_SUFFIXES
+            include
+            sse2neon/include
+    )
+
+    # Override REQUIRED if package can be installed
+    if(OCIO_INSTALL_EXT_PACKAGES STREQUAL MISSING)
+        set(sse2neon_FIND_REQUIRED FALSE)
+    endif()
+
+    include(FindPackageHandleStandardArgs)
+    find_package_handle_standard_args(sse2neon
+        REQUIRED_VARS 
+            sse2neon_INCLUDE_DIR 
+    )
+    set(sse2neon_FOUND ${sse2neon_FOUND})
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(sse2neon_FOUND AND NOT TARGET sse2neon)
+    # INTERFACE type since we know that this is a header-only library.
+    add_library(sse2neon INTERFACE IMPORTED GLOBAL)
+    set(_sse2neon_TARGET_CREATE TRUE)
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(_sse2neon_TARGET_CREATE)
+    set_target_properties(sse2neon PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES ${sse2neon_INCLUDE_DIR}
+    )
+
+    mark_as_advanced(sse2neon_INCLUDE_DIR)
+endif()
\ No newline at end of file
diff --git a/share/cmake/modules/install/Installsse2neon.cmake b/share/cmake/modules/install/Installsse2neon.cmake
new file mode 100644
index 0000000000..16b47798cd
--- /dev/null
+++ b/share/cmake/modules/install/Installsse2neon.cmake
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Install sse2neon (header-only version)
+# https://github.com/DLTcollab/sse2neon
+#
+#
+# Global targets defined by this module:
+#   sse2neon
+###############################################################################
+
+# Download sse2neon using FetchContent and make it available at configure time.
+
+include(FetchContent)
+
+set(FETCHCONTENT_BASE_DIR "${CMAKE_BINARY_DIR}/ext/build/sse2neon")
+FetchContent_Declare(sse2neon
+  GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git
+  GIT_TAG        v1.6.0
+)
+
+# FetchContent_MakeAvailable is not available until CMake 3.14+.
+# Using FetchContent_GetProperties and FetchContent_Populate instead.
+FetchContent_GetProperties(sse2neon)
+
+if(NOT sse2neon_POPULATED)
+  FetchContent_Populate(sse2neon)
+
+  set(_EXT_DIST_INCLUDE "${CMAKE_BINARY_DIR}/ext/dist/${CMAKE_INSTALL_INCLUDEDIR}")
+  file(COPY "${sse2neon_SOURCE_DIR}/sse2neon.h" DESTINATION "${_EXT_DIST_INCLUDE}/sse2neon")
+
+  add_library(sse2neon INTERFACE IMPORTED GLOBAL)
+  set_target_properties(sse2neon PROPERTIES
+      INTERFACE_INCLUDE_DIRECTORIES "${_EXT_DIST_INCLUDE}/sse2neon"
+  )
+endif()
\ No newline at end of file
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 43a2ca75fb..1d2fc9bfd9 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -295,7 +295,7 @@ target_link_libraries(OpenColorIO
 if(OCIO_USE_SSE AND HAVE_NEON)
     target_link_libraries(OpenColorIO
         PRIVATE
-            "$<BUILD_INTERFACE:sse2neon>"
+            sse2neon
     )
 endif()
 

From f9b5db50b6392bf0d0b67019c1a6a2c963779207 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 10 Feb 2023 14:45:24 -0500
Subject: [PATCH 04/81] Fix comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a2c1304a1..36631e653b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -166,7 +166,7 @@ include(CheckSupportARMNeon)
 
 
 ###############################################################################
-# Add sse2neon to the build since CompilerFlags needs to know if SSE2 is supported.
+# Add sse2neon to the build if ARM NEON intrinsics are supported.
 
 if(HAVE_NEON)
     # Install sse2nenon. Please note that sse2neon is downloaded during the configure step as it is

From 44916db94312a00b653c30bca495e09e9c691cc6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Tue, 14 Feb 2023 10:47:42 -0500
Subject: [PATCH 05/81] Re-wording comments Now check for OCIO_USE_SSE and
 HAVE_NEON before integrating sse2neon to the build.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt        |  2 +-
 src/OpenColorIO/SSE.h | 25 +++++++++++--------------
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 36631e653b..bc8124718a 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -168,7 +168,7 @@ include(CheckSupportARMNeon)
 ###############################################################################
 # Add sse2neon to the build if ARM NEON intrinsics are supported.
 
-if(HAVE_NEON)
+if(HAVE_NEON AND OCIO_USE_SSE)
     # Install sse2nenon. Please note that sse2neon is downloaded during the configure step as it is
     # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
 
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 567c5a1c64..f63e08cdef 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -24,26 +24,23 @@ namespace OCIO_NAMESPACE
 
 // Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since
 // it is redefining two of the functions from sse2neon.
+
 #ifdef USE_SSE2NEON
-    // Overwrite the translation of _mm_max_ps and _mm_min_ps.
-    // Using vmaxnmq_f32 and vminnmq_f32 instead.
-
-    // Compare packed single-precision (32-bit) floating-point elements in a and b,
-    // and store packed maximum values in dst. dst does not follow the IEEE Standard
-    // for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
-    // signed-zero values.
-    // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
+    // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to 
+    // NaN handling.
+
+    // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were 
+    // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the 
+    // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the 
+    // filtering behavior. The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as 
+    // the Intel _mm_max_ps / _mm_min_ps since they always returns the non-NaN argument 
+    // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in 
+    // the first argument continues to be filtered out.
     static inline __m128 _mm_max_ps(__m128 a, __m128 b)
     {
         return vreinterpretq_m128_f32(
             vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
     }
-
-    // Compare packed single-precision (32-bit) floating-point elements in a and b,
-    // and store packed minimum values in dst. dst does not follow the IEEE Standard
-    // for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
-    // signed-zero values.
-    // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
     static inline __m128 _mm_min_ps(__m128 a, __m128 b)
     {
         return vreinterpretq_m128_f32(

From 0abeb06dfa4564df6870855fad17a6e02d16aba1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Wed, 15 Feb 2023 13:44:25 -0500
Subject: [PATCH 06/81] Removing specifics compiler options for NEON since they
 are not needed on Apple and might interfere with optimization if not careful.
 Updated CheckSupportSSE2.cmake and adding some checks in SSE.h to support
 universal build. Added comments and new define (USING_INTEL_SSE,
 USING_ARM_NEON and USING_CPP) in LogOpCPU_tests which should help with
 understand what is going on. Checking for SSE2 and ARM Neon only when
 OCIO_USE_SSE is ON.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                              |   6 +-
 share/cmake/utils/CheckSupportARMNeon.cmake |  30 ++--
 share/cmake/utils/CheckSupportSSE2.cmake    |  73 +++++++---
 share/cmake/utils/CompilerFlags.cmake       |  13 +-
 src/OpenColorIO/CMakeLists.txt              |  16 ++-
 src/OpenColorIO/SSE.h                       |  61 ++++----
 tests/cpu/CMakeLists.txt                    |  16 ++-
 tests/cpu/ops/log/LogOpCPU_tests.cpp        | 151 +++++++++++++-------
 8 files changed, 238 insertions(+), 128 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bc8124718a..c14bd0abe6 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -160,9 +160,11 @@ include(CheckSupportGL)
 
 
 ###############################################################################
-# Check for ARM neon intrinsics (armv8)
+# Check for ARM neon (if the user asked for it)
 
-include(CheckSupportARMNeon)
+if(OCIO_USE_SSE)
+    include(CheckSupportARMNeon)
+endif()
 
 
 ###############################################################################
diff --git a/share/cmake/utils/CheckSupportARMNeon.cmake b/share/cmake/utils/CheckSupportARMNeon.cmake
index 1123e75f52..acab5e5946 100644
--- a/share/cmake/utils/CheckSupportARMNeon.cmake
+++ b/share/cmake/utils/CheckSupportARMNeon.cmake
@@ -1,21 +1,27 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 
+# Checks for ARM NEON availability
+
 include(CheckCXXSourceCompiles)
 
-set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
-set(CMAKE_REQUIRED_FLAGS "-march=armv8-a+fp+simd+crypto+crc")
+set(_cmake_osx_architectures_old "${CMAKE_OSX_ARCHITECTURES}")
+
+if(APPLE)
+    set(CMAKE_OSX_ARCHITECTURES "arm64")
+endif()
+
+set(source_code "
+#include <arm_neon.h>
+int main()
+{
+    float32x4_t v = vdupq_n_f32(0);
+    return 0;
+}")
 
-check_cxx_source_compiles ("
-    #include <arm_neon.h>
-    int main()
-    {
-        float32x4_t v = vdupq_n_f32(0);
-        return 0;
-    }"
-    HAVE_NEON)
+check_cxx_source_compiles ("${source_code}" HAVE_NEON)
 
-set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
-unset(_cmake_required_flags_old)
+set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_old}")
 
+unset(_cmake_osx_architectures_old)
 mark_as_advanced(HAVE_NEON)
diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index 5abd707cdb..311dbdbc87 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -6,6 +6,8 @@ include(CheckCXXSourceCompiles)
 set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
 set(_cmake_required_libraries_old "${CMAKE_REQUIRED_LIBRARIES}")
 
+set(_cmake_osx_architectures_old "${CMAKE_OSX_ARCHITECTURES}")
+
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger
     # unrelated warnings causing a detection failure. So, the code disables all warnings to focus
@@ -17,30 +19,63 @@ if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     endif()
 endif()
 
-set(_SSE2_HEADER "#include <emmintrin.h>")
-if (HAVE_NEON)
-    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -march=armv8-a+fp+simd+crypto+crc")
-    set(CMAKE_REQUIRED_LIBRARIES sse2neon)
-    set(_SSE2_HEADER "#include <sse2neon.h>")
-endif()
 
-set(_SSE2_TEST_SOURCE_CODE "
-${_SSE2_HEADER}
-int main ()
-{
-    __m128d a, b;
-    double vals[2] = {0};
-    a = _mm_loadu_pd (vals);
-    b = _mm_add_pd (a,a);
-    _mm_storeu_pd (vals,b);
-    return (0);
-}")
+macro(check_arm_neon_availability _check_sse2_header_ _check_output_var_name_)
+    set(_SSE2_TEST_SOURCE_CODE "
+    ${_check_sse2_header_}
+    int main ()
+    {
+        __m128d a, b;
+        double vals[2] = {0};
+        a = _mm_loadu_pd (vals);
+        b = _mm_add_pd (a,a);
+        _mm_storeu_pd (vals,b);
+        return (0);
+    }")
+
+    check_cxx_source_compiles ("${_SSE2_TEST_SOURCE_CODE}" ${_check_output_var_name_})
+    mark_as_advanced(${_check_output_var_name_})
+endmacro()
+
+if(NOT HAVE_NEON)
+    check_arm_neon_availability("#include <emmintrin.h>" HAVE_SSE2)
+elseif(APPLE)
+    # Test for both supported architectures
+    # x86_64 and arm64
+    set(ARCHITECTURES_LIST "arm64;x86_64")
+    
+    message(STATUS "Checking SSE2 support using SSE2NEON library for arm64 and x86_64 architectures")
+    foreach (current_arch IN LISTS ARCHITECTURES_LIST)
+
+        set (CMAKE_OSX_ARCHITECTURES "${current_arch}")
+        
+        if(current_arch STREQUAL arm64)
+            if (HAVE_NEON)
+                set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+                set(_sse2_header_ "#include <sse2neon.h>")
 
-check_cxx_source_compiles ("${_SSE2_TEST_SOURCE_CODE}" HAVE_SSE2)
+                set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON")
+            endif()
+        elseif(current_arch STREQUAL x86_64)
+            set(_sse2_header_ "#include <emmintrin.h>")
+            set(_output_var_name_ "HAVE_SSE2")
+        endif()
+
+        check_arm_neon_availability("${_sse2_header_}" ${_output_var_name_})
+
+    endforeach()
+elseif(NOT APPLE)
+    # Other arm platform
+    set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+    check_arm_neon_availability("#include <sse2neon.h>" HAVE_SSE2)
+endif()
 
 set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
 set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_old}")
+set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_old}")
+
 unset(_cmake_required_flags_old)
 unset(_cmake_required_libraries_old)
+unset(_cmake_osx_architectures_old)
+
 
-mark_as_advanced(HAVE_SSE2)
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index 1ee16596bd..42ad9a16a1 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -9,14 +9,16 @@ set(PLATFORM_COMPILE_FLAGS "")
 
 ###############################################################################
 # Define if SSE2 can be used.
-# Check for SSE2 first since some compile flags need to be set on Apple ARM.
 
-include(CheckSupportSSE2)
+# Check for SSE2 only if the user asked for it.
+if(OCIO_USE_SSE)
+    include(CheckSupportSSE2)
+endif()
 
-if(NOT HAVE_SSE2)
+if(NOT HAVE_SSE2 AND NOT HAVE_SSE2_WITH_SSE2NEON)
     message(STATUS "Disabling SSE optimizations, as the target doesn't support them")
     set(OCIO_USE_SSE OFF)
-endif(NOT HAVE_SSE2)
+endif()
 
 ###############################################################################
 # Compile flags
@@ -53,9 +55,6 @@ elseif(USE_CLANG)
 
     # Use of 'register' specifier must be removed for C++17 support.
     set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -Wno-deprecated-register")
-    if (HAVE_SSE2 AND HAVE_NEON)
-        set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -march=armv8-a+fp+simd+crypto+crc")
-    endif()
 elseif(USE_GCC)
 
     set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -DUSE_GCC")
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 1d2fc9bfd9..85ef53bb2a 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -292,7 +292,7 @@ target_link_libraries(OpenColorIO
         MINIZIP::minizip-ng
 )
 
-if(OCIO_USE_SSE AND HAVE_NEON)
+if(OCIO_USE_SSE AND HAVE_SSE2_WITH_SSE2NEON)
     target_link_libraries(OpenColorIO
         PRIVATE
             sse2neon
@@ -330,15 +330,17 @@ if(BUILD_SHARED_LIBS OR (OCIO_BUILD_PYTHON AND UNIX))
 endif()
 
 if(OCIO_USE_SSE)
-    target_compile_definitions(OpenColorIO
-        PRIVATE
-            USE_SSE
-    )
+    if(HAVE_SSE2)
+        target_compile_definitions(OpenColorIO
+            PRIVATE
+                USE_SSE
+        )
+    endif()
 
-    if(HAVE_NEON)
+    if(HAVE_SSE2_WITH_SSE2NEON)
         target_compile_definitions(OpenColorIO
             PRIVATE
-                USE_SSE2NEON
+                USE_SSE2_WITH_SSE2NEON
         )
     endif()
 endif()
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index f63e08cdef..3c40a858e1 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -6,14 +6,23 @@
 #define INCLUDED_OCIO_SSE_H
 
 
+#if !defined(USE_SSE)
+    #define USING_CPP 1
+#endif
+
 #ifdef USE_SSE
 
-#ifndef USE_SSE2NEON
+// If it is not arm64, same behavior as before.
+#if !defined(__aarch64__)
     #include <emmintrin.h>
-#else
-    #include <sse2neon.h>
+    #define USING_INTEL_SSE2 1
+#elif defined(__aarch64__)
+    // ARM architecture A64 (ARM64)
+    #if defined(USE_SSE2_WITH_SSE2NEON)
+        #include <sse2neon.h>
+        #define USING_INTEL_SSE2_WITH_SSE2NEON 1
+    #endif
 #endif
-#include <stdio.h>
 
 #include <OpenColorIO/OpenColorIO.h>
 
@@ -25,27 +34,29 @@ namespace OCIO_NAMESPACE
 // Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since
 // it is redefining two of the functions from sse2neon.
 
-#ifdef USE_SSE2NEON
-    // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to 
-    // NaN handling.
-
-    // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were 
-    // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the 
-    // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the 
-    // filtering behavior. The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as 
-    // the Intel _mm_max_ps / _mm_min_ps since they always returns the non-NaN argument 
-    // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in 
-    // the first argument continues to be filtered out.
-    static inline __m128 _mm_max_ps(__m128 a, __m128 b)
-    {
-        return vreinterpretq_m128_f32(
-            vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-    }
-    static inline __m128 _mm_min_ps(__m128 a, __m128 b)
-    {
-        return vreinterpretq_m128_f32(
-            vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-    }
+#if defined(__aarch64__)
+    #if defined(USE_SSE2_WITH_SSE2NEON)
+        // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to 
+        // NaN handling.
+
+        // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were 
+        // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the 
+        // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the 
+        // filtering behavior. The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as 
+        // the Intel _mm_max_ps / _mm_min_ps since they always returns the non-NaN argument 
+        // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in 
+        // the first argument continues to be filtered out.
+        static inline __m128 _mm_max_ps(__m128 a, __m128 b)
+        {
+            return vreinterpretq_m128_f32(
+                vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+        }
+        static inline __m128 _mm_min_ps(__m128 a, __m128 b)
+        {
+            return vreinterpretq_m128_f32(
+                vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+        }
+    #endif
 #endif
 
 // Macros for alignment declarations
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index 102bed48c6..30716af51f 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -27,7 +27,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
             xxHash
     )
 
-    if(OCIO_USE_SSE AND HAVE_NEON)
+    if(OCIO_USE_SSE AND HAVE_SSE2_WITH_SSE2NEON)
         target_link_libraries(${TEST_BINARY}
             PRIVATE
                 sse2neon
@@ -52,15 +52,17 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
     endif(PRIVATE_INCLUDES)
 
     if(OCIO_USE_SSE)
-        target_compile_definitions(${TEST_BINARY}
-            PRIVATE
-                USE_SSE
-        )
+        if (HAVE_SSE2)
+            target_compile_definitions(${TEST_BINARY}
+                PRIVATE
+                    USE_SSE
+            )
+        endif()
 
-        if(HAVE_NEON)
+        if(HAVE_SSE2_WITH_SSE2NEON)
             target_compile_definitions(${TEST_BINARY}
                 PRIVATE
-                    USE_SSE2NEON
+                    USE_SSE2_WITH_SSE2NEON
             )
         endif()
     endif(OCIO_USE_SSE)
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 7ac48ed1f6..c73541afb6 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -9,7 +9,6 @@
 
 namespace OCIO = OCIO_NAMESPACE;
 
-
 constexpr float qnan = std::numeric_limits<float>::quiet_NaN();
 constexpr float inf = std::numeric_limits<float>::infinity();
 
@@ -23,6 +22,7 @@ void TestLog(float logBase)
                                   0.f,        0.f,  0.f,     inf,
                                  -inf,       -inf, -inf,     0.f,
                                   0.f,        0.f,  0.f,    -inf };
+                                  
     float rgba[32] = {};
 
     OCIO::ConstLogOpDataRcPtr logOp = std::make_shared<OCIO::LogOpData>(
@@ -35,7 +35,7 @@ void TestLog(float logBase)
 
     // LogOpCPU implementation uses optimized logarithm approximation
     // cannot use strict comparison.
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error = 5e-5f;
 #else
     const float error = 1e-5f;
@@ -52,17 +52,26 @@ void TestLog(float logBase)
             expected = logf(std::max(minValue, (float)expected)) / logf(logBase);
         }
 
+        // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f,     0.f,
+        //                                                0.2f,       0.f,   .99f, 128.f,
+        //                                                ... }
         OCIO_CHECK_CLOSE(result, expected, error);
     }
 
     const float resMin = logf(minValue) / logf(logBase);
+
+    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
     OCIO_CHECK_CLOSE(rgba[8], resMin, error);
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
+
+    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
     OCIO_CHECK_CLOSE(rgba[12], resMin, error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
+
     // SSE implementation of sseLog2 & sseExp2 do not behave like CPU.
     // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below.
-#ifdef USE_SSE
+    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     if (logBase == 10.0f)
     {
         OCIO_CHECK_CLOSE(rgba[16], 38.53184509f, error);
@@ -75,10 +84,16 @@ void TestLog(float logBase)
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
+
+    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
     OCIO_CHECK_CLOSE(rgba[20], resMin, error);
     OCIO_CHECK_EQUAL(rgba[23], inf);
+
+    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
     OCIO_CHECK_CLOSE(rgba[24], resMin, error);
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
+
+    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
     OCIO_CHECK_CLOSE(rgba[28], resMin, error);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -127,34 +142,45 @@ void TestAntiLog(float logBase)
 
         // LogOpCPU implementation uses optimized logarithm approximation
         // cannot use strict comparison.
+        // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f,     0.f,
+        //                                                0.2f,       0.f,   .99f, 128.f,
+        //                                                ... }
         OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f));
     }
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_EQUAL(rgba[8], inf);
-    #endif
-#else
+
+    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_EQUAL(rgba[8], inf);
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
 #endif
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
+
+    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
     OCIO_CHECK_CLOSE(rgba[12], 1.0f, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
-    #endif
-#else
+
+    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
+
+    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
     OCIO_CHECK_CLOSE(rgba[20], 1.0f, rtol);
     OCIO_CHECK_EQUAL(rgba[23], inf);
-#ifdef USE_SSE
+
+    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[24], inf);
 #else
     OCIO_CHECK_EQUAL(rgba[24], 0.0f);
 #endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
+
+    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
     OCIO_CHECK_CLOSE(rgba[28], 1.0f, rtol);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -267,43 +293,48 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 
         // LogOpCPU implementation uses optimized logarithm approximation
         // cannot use strict comparison.
+        // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f,     0.f,
+        //                                                0.2f,       0.f,   .99f, 128.f,
+        //                                                ... }
         OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f));
     }
 
     const float res0 = ComputeLog2LinEval(0.0f, redP);
 
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_EQUAL(rgba[8], inf);
-    #endif
-#else
+    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_EQUAL(rgba[8], inf);
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
 #endif
 
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
+    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
     OCIO_CHECK_CLOSE(rgba[12], res0, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
-    #endif
-#else
+    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
+    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
     OCIO_CHECK_CLOSE(rgba[20], res0, rtol);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-#ifdef USE_SSE
+    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[24], inf);
 #else
     OCIO_CHECK_CLOSE(rgba[24], ComputeLog2LinEval(-inf, redP), rtol);
 #endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
+    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
     OCIO_CHECK_CLOSE(rgba[28], res0, rtol);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -407,31 +438,40 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
 
         // LogOpCPU implementation uses optimized logarithm approximation
         // cannot use strict comparison
+        // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f,     0.f,
+        //                                                0.2f,       0.f,   .99f, 128.f,
+        //                                                ... }
         OCIO_CHECK_CLOSE(result, expected, error);
     }
 
     const float res0 = ComputeLin2LogEval(0.0f, redP);
     const float resMin = ComputeLin2LogEval(-100.0f, redP);
 
+    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
     OCIO_CHECK_CLOSE(rgba[8], resMin, error);
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
+    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
     OCIO_CHECK_CLOSE(rgba[12], res0, error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-#ifdef USE_SSE
+    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error);
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
+    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
     OCIO_CHECK_CLOSE(rgba[20], res0, error);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
+    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
     OCIO_CHECK_CLOSE(rgba[24], resMin, error);
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
+    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
     OCIO_CHECK_CLOSE(rgba[28], res0, error);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -462,25 +502,28 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, numPixels);
 
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
 #endif // USE_SSE
 
+    // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }.
     OCIO_CHECK_CLOSE(rgba[0], -0.168771237955f, error);
     OCIO_CHECK_CLOSE(rgba[1], -0.048771237955f, error);
     OCIO_CHECK_CLOSE(rgba[2], -0.036771237955f, error);
+
+    // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error);
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error);
 #endif // USE_SSE
-
     OCIO_CHECK_CLOSE(rgba[6], 0.68141615509f, error);
 
-#ifdef USE_SSE
+   // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[8], -inf);
     OCIO_CHECK_CLOSE(rgba[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));
@@ -500,19 +543,23 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRendererNoLS = OCIO::GetLogRenderer(lognols, true);
     pRendererNoLS->apply(rgbaImage, rgba_nols, numPixels);
 
+    // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[0], -0.325512374199f, error);
     OCIO_CHECK_CLOSE(rgba_nols[1], -0.127141806077f, error);
     OCIO_CHECK_CLOSE(rgba_nols[2], -0.107304749265f, error);
+    
+    // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error);
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error);
 #endif // USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[6], 0.68141615509f, error);
-    OCIO_CHECK_EQUAL(rgba_nols[8], -inf);
 
-#ifdef USE_SSE
+    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
+    OCIO_CHECK_EQUAL(rgba_nols[8], -inf);
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba_nols[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba_nols[10]));
 #else
@@ -530,19 +577,25 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRendererNoBreak = OCIO::GetLogRenderer(lognobreak, true);
     pRendererNoBreak->apply(rgbaImage, rgba_nobreak, numPixels);
 
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error2 = 1e-5f;
 #else
     const float error2 = 1e-7f;
 #endif // USE_SSE
+
+    // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[0], -24.6f, error2);
     OCIO_CHECK_CLOSE(rgba_nobreak[1], -0.264385618977f, error2);
     OCIO_CHECK_CLOSE(rgba_nobreak[2], -0.20700938942f, error2);
+
+    // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[4], 0.028548034423f, error2);
     OCIO_CHECK_CLOSE(rgba_nobreak[5], 0.170878935551f, error2);
     OCIO_CHECK_CLOSE(rgba_nobreak[6], 0.68141615509, error2);
+
+    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2);
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2);
 #else
     OCIO_CHECK_EQUAL(rgba_nobreak[9], inf);
@@ -553,14 +606,8 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
 OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 {
     // Inverse of previous test.
-    const float rgbaImage[12] = { -0.168771237955f,
-                                  -0.048771237955f,
-                                  -0.036771237955f,
-                                   0.f,
-                                   0.047228762045f,
-                                   0.170878935551f,
-                                   0.68141615509f,
-                                   0.f,
+    const float rgbaImage[12] = { -0.168771237955f, -0.048771237955f, -0.036771237955f, 0.f,
+                                   0.047228762045f, 0.170878935551f, 0.68141615509f, 0.f,
                                   -inf,   inf,   qnan,  0.0f };
 
     float rgba[12] = {};
@@ -574,23 +621,29 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, 3);
 
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
 #endif // USE_SSE
+
+    // Evaluating output for input rgbaImage[0-2] = 
+    // { -0.168771237955f, -0.048771237955f, -0.036771237955f, ... }.
     OCIO_CHECK_CLOSE(rgba[0], -0.1f, error);
     OCIO_CHECK_CLOSE(rgba[1],  0.0f, error);
     OCIO_CHECK_CLOSE(rgba[2],  0.01f, error);
+
+    // Evaluating output for input rgbaImage[4-6] = 
+    // { 0.047228762045f, 0.170878935551f, 0.68141615509f, ... }.
     OCIO_CHECK_CLOSE(rgba[4],  0.08f, error);
     OCIO_CHECK_CLOSE(rgba[5],  0.16f, error);
     OCIO_CHECK_CLOSE(rgba[6],  1.16f, 10.0f * error);
+
+    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_EQUAL(rgba[8], -inf);
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
-    #endif
-#else
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[9], inf);
 #endif
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));

From 13541cc1274449baa19d4ca53b60561ca7b155d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 16 Feb 2023 09:18:49 -0500
Subject: [PATCH 07/81] Tweaking comments, some cmake variables naming and
 other minors changes.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                              |  4 +-
 share/cmake/utils/CheckSupportARMNeon.cmake |  6 +--
 share/cmake/utils/CheckSupportSSE2.cmake    | 40 +++++++---------
 share/cmake/utils/CompilerFlags.cmake       |  1 -
 src/OpenColorIO/SSE.h                       |  6 +--
 tests/cpu/ops/log/LogOpCPU_tests.cpp        | 52 ++++++++++-----------
 6 files changed, 49 insertions(+), 60 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c14bd0abe6..9f911ca508 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -160,7 +160,7 @@ include(CheckSupportGL)
 
 
 ###############################################################################
-# Check for ARM neon (if the user asked for it)
+# Check for ARM neon
 
 if(OCIO_USE_SSE)
     include(CheckSupportARMNeon)
@@ -171,7 +171,7 @@ endif()
 # Add sse2neon to the build if ARM NEON intrinsics are supported.
 
 if(HAVE_NEON AND OCIO_USE_SSE)
-    # Install sse2nenon. Please note that sse2neon is downloaded during the configure step as it is
+    # Install sse2neon. Please note that sse2neon is downloaded during the configure step as it is
     # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
 
     # Could use ocio_handle_dependency once this is rebased from main (once the CMake min and 
diff --git a/share/cmake/utils/CheckSupportARMNeon.cmake b/share/cmake/utils/CheckSupportARMNeon.cmake
index acab5e5946..5d17854757 100644
--- a/share/cmake/utils/CheckSupportARMNeon.cmake
+++ b/share/cmake/utils/CheckSupportARMNeon.cmake
@@ -5,7 +5,7 @@
 
 include(CheckCXXSourceCompiles)
 
-set(_cmake_osx_architectures_old "${CMAKE_OSX_ARCHITECTURES}")
+set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
 
 if(APPLE)
     set(CMAKE_OSX_ARCHITECTURES "arm64")
@@ -21,7 +21,7 @@ int main()
 
 check_cxx_source_compiles ("${source_code}" HAVE_NEON)
 
-set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_old}")
+set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}")
 
-unset(_cmake_osx_architectures_old)
+unset(_cmake_osx_architectures_orig)
 mark_as_advanced(HAVE_NEON)
diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index 311dbdbc87..26bc0b396d 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -3,10 +3,9 @@
 
 include(CheckCXXSourceCompiles)
 
-set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
-set(_cmake_required_libraries_old "${CMAKE_REQUIRED_LIBRARIES}")
-
-set(_cmake_osx_architectures_old "${CMAKE_OSX_ARCHITECTURES}")
+set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}")
+set(_cmake_required_libraries_orig "${CMAKE_REQUIRED_LIBRARIES}")
+set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
 
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger
@@ -20,7 +19,7 @@ if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
 endif()
 
 
-macro(check_arm_neon_availability _check_sse2_header_ _check_output_var_name_)
+macro(check_sse2_availability _check_sse2_header_ _check_output_var_name_)
     set(_SSE2_TEST_SOURCE_CODE "
     ${_check_sse2_header_}
     int main ()
@@ -38,8 +37,8 @@ macro(check_arm_neon_availability _check_sse2_header_ _check_output_var_name_)
 endmacro()
 
 if(NOT HAVE_NEON)
-    check_arm_neon_availability("#include <emmintrin.h>" HAVE_SSE2)
-elseif(APPLE)
+    check_sse2_availability("#include <emmintrin.h>" HAVE_SSE2)
+elseif(APPLE AND HAVE_NEON)
     # Test for both supported architectures
     # x86_64 and arm64
     set(ARCHITECTURES_LIST "arm64;x86_64")
@@ -50,32 +49,25 @@ elseif(APPLE)
         set (CMAKE_OSX_ARCHITECTURES "${current_arch}")
         
         if(current_arch STREQUAL arm64)
-            if (HAVE_NEON)
-                set(CMAKE_REQUIRED_LIBRARIES sse2neon)
-                set(_sse2_header_ "#include <sse2neon.h>")
-
-                set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON")
-            endif()
+            set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+            set(_sse2_header_ "#include <sse2neon.h>")
+            set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON")
         elseif(current_arch STREQUAL x86_64)
             set(_sse2_header_ "#include <emmintrin.h>")
             set(_output_var_name_ "HAVE_SSE2")
         endif()
 
-        check_arm_neon_availability("${_sse2_header_}" ${_output_var_name_})
+        check_sse2_availability("${_sse2_header_}" ${_output_var_name_})
 
     endforeach()
-elseif(NOT APPLE)
-    # Other arm platform
-    set(CMAKE_REQUIRED_LIBRARIES sse2neon)
-    check_arm_neon_availability("#include <sse2neon.h>" HAVE_SSE2)
 endif()
 
-set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
-set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_old}")
-set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_old}")
+set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}")
+set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_orig}")
+set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}")
 
-unset(_cmake_required_flags_old)
-unset(_cmake_required_libraries_old)
-unset(_cmake_osx_architectures_old)
+unset(_cmake_required_flags_orig)
+unset(_cmake_required_libraries_orig)
+unset(_cmake_osx_architectures_orig)
 
 
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index 42ad9a16a1..19f093f79b 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -10,7 +10,6 @@ set(PLATFORM_COMPILE_FLAGS "")
 ###############################################################################
 # Define if SSE2 can be used.
 
-# Check for SSE2 only if the user asked for it.
 if(OCIO_USE_SSE)
     include(CheckSupportSSE2)
 endif()
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 3c40a858e1..12b43e8972 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -6,11 +6,9 @@
 #define INCLUDED_OCIO_SSE_H
 
 
-#if !defined(USE_SSE)
+#ifndef USE_SSE
     #define USING_CPP 1
-#endif
-
-#ifdef USE_SSE
+#else
 
 // If it is not arm64, same behavior as before.
 #if !defined(__aarch64__)
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index c73541afb6..083cd6a2b9 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -60,17 +60,17 @@ void TestLog(float logBase)
 
     const float resMin = logf(minValue) / logf(logBase);
 
-    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+    // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
     OCIO_CHECK_CLOSE(rgba[8], resMin, error);
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
-    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
+    // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
     OCIO_CHECK_CLOSE(rgba[12], resMin, error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // SSE implementation of sseLog2 & sseExp2 do not behave like CPU.
     // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below.
-    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+    // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
 #if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     if (logBase == 10.0f)
     {
@@ -85,15 +85,15 @@ void TestLog(float logBase)
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
-    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
+    // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
     OCIO_CHECK_CLOSE(rgba[20], resMin, error);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+    // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
     OCIO_CHECK_CLOSE(rgba[24], resMin, error);
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
-    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
+    // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
     OCIO_CHECK_CLOSE(rgba[28], resMin, error);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -148,7 +148,7 @@ void TestAntiLog(float logBase)
         OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f));
     }
 
-    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+    // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
 #ifdef USING_INTEL_SSE2
     OCIO_CHECK_EQUAL(rgba[8], inf);
 #elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
@@ -156,11 +156,11 @@ void TestAntiLog(float logBase)
 #endif
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
-    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
+    // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
     OCIO_CHECK_CLOSE(rgba[12], 1.0f, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+    // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
 #ifdef USING_INTEL_SSE2
     OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
 #elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
@@ -168,11 +168,11 @@ void TestAntiLog(float logBase)
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
-    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
+    // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
     OCIO_CHECK_CLOSE(rgba[20], 1.0f, rtol);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+    // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
 #if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[24], inf);
 #else
@@ -180,7 +180,7 @@ void TestAntiLog(float logBase)
 #endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
-    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
+    // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
     OCIO_CHECK_CLOSE(rgba[28], 1.0f, rtol);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -301,7 +301,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 
     const float res0 = ComputeLog2LinEval(0.0f, redP);
 
-    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+    // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
 #ifdef USING_INTEL_SSE2
     OCIO_CHECK_EQUAL(rgba[8], inf);
 #elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
@@ -310,11 +310,11 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
-    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
+    // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
     OCIO_CHECK_CLOSE(rgba[12], res0, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+    // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
 #ifdef USING_INTEL_SSE2
     OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
 #elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
@@ -322,11 +322,11 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
-    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
+    // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
     OCIO_CHECK_CLOSE(rgba[20], res0, rtol);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+    // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
 #if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[24], inf);
 #else
@@ -334,7 +334,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 #endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
-    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
+    // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
     OCIO_CHECK_CLOSE(rgba[28], res0, rtol);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -447,15 +447,15 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
     const float res0 = ComputeLin2LogEval(0.0f, redP);
     const float resMin = ComputeLin2LogEval(-100.0f, redP);
 
-    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+    // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
     OCIO_CHECK_CLOSE(rgba[8], resMin, error);
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
-    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
+    // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
     OCIO_CHECK_CLOSE(rgba[12], res0, error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+    // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
 #if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error);
 #else
@@ -463,20 +463,20 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
-    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
+    // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
     OCIO_CHECK_CLOSE(rgba[20], res0, error);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+    // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
     OCIO_CHECK_CLOSE(rgba[24], resMin, error);
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
-    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
+    // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
     OCIO_CHECK_CLOSE(rgba[28], res0, error);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
 
-OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
+OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 {
     constexpr int numPixels = 3;
     constexpr int numValues = 4 * numPixels;
@@ -603,7 +603,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO_CHECK_CLOSE(rgba_nobreak[10], -24.6f, error2);
 }
 
-OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
+OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
 {
     // Inverse of previous test.
     const float rgbaImage[12] = { -0.168771237955f, -0.048771237955f, -0.036771237955f, 0.f,

From e58cf29de2fb31d6db275f4e90fdd7cc83beacd5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 16 Feb 2023 11:34:52 -0500
Subject: [PATCH 08/81] Fixing issue with anti-log (SSE implementation)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 src/OpenColorIO/SSE.h                | 80 ++++++++++++++++++----------
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 29 ----------
 2 files changed, 52 insertions(+), 57 deletions(-)

diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 12b43e8972..57ea87a08f 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -109,10 +109,10 @@ inline __m128 isNegativeSpecial(const __m128 x)
   return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(x), SIGN_SHIFT));
 }
 
-// Select function in SSE version 2
+// Bit-wise select function in SSE version 2
 //
-// Return the parameter arg_false when the parameter mask is 0x0,
-// or the parameter arg_true when the mask is 0xffffffff.
+// Return the parameter arg_false bit where the parameter mask is 0x0,
+// return the parameter arg_true bit where the mask is 0x1.
 //
 // Algorithm Explanation:
 //
@@ -142,7 +142,11 @@ inline __m128 isNegativeSpecial(const __m128 x)
 //
 inline __m128 sseSelect(const __m128& mask, const __m128& arg_true, const __m128& arg_false)
 {
-    return _mm_xor_ps( arg_false, _mm_and_ps( mask, _mm_xor_ps( arg_true, arg_false ) ) );
+    return _mm_xor_ps(                                  // bit-wise XOR of arg_false, (...)
+        arg_false, 
+        _mm_and_ps(                                     // bit-wise AND of mask, (...)
+            mask, 
+            _mm_xor_ps( arg_true, arg_false ) ) );      // bit-wise XOR of arg_true, arg_false
 }
 
 // Coefficients of Chebyshev (minimax) degree 5 polynomial
@@ -162,6 +166,10 @@ static const __m128 PNEXP2 = _mm_set1_ps((float)2.414427569091865207710e-1);
 static const __m128 PNEXP1 = _mm_set1_ps((float)6.930038344665415134202e-1);
 static const __m128 PNEXP0 = _mm_set1_ps((float)1.000002593370603213644);
 
+// Note: The above polynomials have been chosen to acheive a precision of
+// approximately 15 bits of mantissa.
+
+
 // log2 function in SSE version 2
 //
 // The function log2() is evaluated by performing argument
@@ -169,12 +177,14 @@ static const __m128 PNEXP0 = _mm_set1_ps((float)1.000002593370603213644);
 // over a restricted range.
 inline __m128 sseLog2(__m128 x)
 {
-    // y = log2( x ) = log2( 2^exposant * mantissa ) 
-    //               = exposant + log2( mantissa )
+    // y = log2( x ) = log2( 2^exponant * mantissa ) 
+    //               = exponant + log2( mantissa )
 
     __m128 mantissa
-        = _mm_or_ps(
-            _mm_andnot_ps(_mm_castsi128_ps(EMASK), x), EONE);
+        = _mm_or_ps(                                    // OR with EONE
+            _mm_andnot_ps(                              // NOT(EMASK) AND x
+                _mm_castsi128_ps(EMASK), x),            // reinterpret cast int to float
+            EONE);
 
     __m128 log2
         = _mm_add_ps(
@@ -198,14 +208,15 @@ inline __m128 sseLog2(__m128 x)
             PNLOG0);
 
     __m128i exponent
-        = _mm_sub_epi32(
-            _mm_srli_epi32(
-                _mm_and_si128(_mm_castps_si128(x),
+        = _mm_sub_epi32(                                // subtract EBIAS
+            _mm_srli_epi32(                             // right-shift by EXP_SHIFT
+                _mm_and_si128(_mm_castps_si128(x),      // bit-wise AND with EMASK
                     EMASK),
                 EXP_SHIFT),
             EBIAS);
 
-    log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(exponent));
+    log2 = _mm_add_ps(log2, 
+                      _mm_cvtepi32_ps(exponent));       // convert exponent to float
 
     return log2;
 }
@@ -224,24 +235,30 @@ inline __m128 sseExp2(__m128 x)
     // Compute the largest integer not greater than x, i.e., floor(x)
     // Note: cvttps_epi32 simply cast the float value to int. That means cvttps_epi32(-2.7) = -2
     // rather than -3, hence for negative numbers we need to add -1. This ensures that "fraction"
-    // is always in the range [0, 1).
+    // is always in the range [0, 1).  Note that _mm_castps_si128(0xFFFFFFFF) is -1.
+    // If x is outside the INT_MIN to INT_MAX range, _mm_cvttps_epi32 will return 0x80000000
+    // (i.e. INT_MIN, just the sign bit set), which Intel calls the "integer indefinite" value. 
+    // When 1 is subtracted from INT_MIN, it gives INT_MAX.  So floor_x is wrong for values
+    // outside [INT_MIN, INT_MAX] but it's ignored thanks to the checks at the bottom.
+    // It's also wrong for x=NaN, but again it's ok since the polynomial returns NaN and
+    // hence the output is NaN, regardless of floor_x.
     __m128i floor_x
-        = _mm_add_epi32(
-            _mm_cvttps_epi32(x),
-            _mm_castps_si128(
-                _mm_cmpnle_ps(EZERO, x)));
+        = _mm_add_epi32(                                // add a pair of integer arguments
+            _mm_cvttps_epi32(x),                        // convert float to int via truncation
+            _mm_castps_si128(                           // reinterpret cast float to int
+                _mm_cmpnle_ps(EZERO, x)));              // NOT( EZERO <= x ) ? 0xFFFFFFFF : 0
 
     // Compute exp2(floor_x) by moving floor_x to the exponent bits of the floating-point number.
     __m128 zf
-        = _mm_castsi128_ps(
-            _mm_slli_epi32(
-                _mm_add_epi32(floor_x, EBIAS),
+        = _mm_castsi128_ps(                             // reinterpret cast int to float
+            _mm_slli_epi32(                             // left shift by EXP_SHIFT
+                _mm_add_epi32(floor_x, EBIAS),          // add a pair of integer arguments
                 EXP_SHIFT));
 
-    __m128 iexp = _mm_cvtepi32_ps(floor_x);
-    __m128 fraction = _mm_sub_ps(x, iexp);
+    __m128 iexp = _mm_cvtepi32_ps(floor_x);             // convert floor_x to float
+    __m128 fraction = _mm_sub_ps(x, iexp);              // x - iexp
 
-    // Compute exp2(fraction) using a polynomial approximation
+    // Compute exp2(fraction) using a polynomial approximation.
     __m128 mexp
         = _mm_add_ps(
             _mm_mul_ps(
@@ -259,19 +276,26 @@ inline __m128 sseExp2(__m128 x)
                 fraction),
             PNEXP0);
 
-    __m128 exp2 = _mm_mul_ps(zf, mexp);
+    __m128 exp2 = _mm_mul_ps(zf, mexp);                 // zf * mexp
 
     // Handle underflow:
     // If the (unbiased) exponent of zf is less than -126, the result is smaller than
     // the smallest representable floating-point number and an underflow computation is
     // potentially happening. When this happens, force the result to zero.
-    exp2 = _mm_andnot_ps(_mm_cmplt_ps(iexp, ENEG126), exp2);
+    // Note that as described above, floor_x is inaccurate, so the test here uses x.
+    exp2 = _mm_andnot_ps(                               // NOT(...) AND exp2
+        _mm_cmplt_ps(x, ENEG126),                       // iexp < ENEG126 ? 0xFFFFFFFF : 0
+        exp2);
 
     // Handle overflow:
     // If the (unbiased) exponent of zf is greater than 127, the result is larger than
     // the largest representable floating-point number and an overflow computation is
     // potentially happening. When this happens, force the result to positive infinity.
-    exp2 = sseSelect(_mm_cmpgt_ps(iexp, EPOS127), EPOSINF, exp2);
+    // Note that as described above, floor_x is inaccurate, so the test here uses x.
+    exp2 = sseSelect(                                   // (...) is a mask to select EPOSINF, exp2
+        _mm_cmpgt_ps(x, EPOS127),                       // iexp > EPOS127 ? 0xFFFFFFFF : 0
+        EPOSINF, 
+        exp2);
 
     return exp2;
 }
@@ -630,7 +654,7 @@ inline void sseSinCos(const float x, float& sin_x, float& cos_x)
 } // namespace OCIO_NAMESPACE
 
 
-#endif
+#endif  // USE_SSE
 
 
-#endif
+#endif  // INCLUDED_OCIO_SSE_H
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 083cd6a2b9..35ec055856 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -149,11 +149,7 @@ void TestAntiLog(float logBase)
     }
 
     // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_EQUAL(rgba[8], inf);
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
-#endif
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
     // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
@@ -161,11 +157,7 @@ void TestAntiLog(float logBase)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[16], inf);
-#endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
     // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
@@ -173,11 +165,7 @@ void TestAntiLog(float logBase)
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
     // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
-    OCIO_CHECK_EQUAL(rgba[24], inf);
-#else
     OCIO_CHECK_EQUAL(rgba[24], 0.0f);
-#endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
     // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
@@ -302,12 +290,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     const float res0 = ComputeLog2LinEval(0.0f, redP);
 
     // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_EQUAL(rgba[8], inf);
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
-#endif
-
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
     // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
@@ -315,11 +298,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[16], inf);
-#endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
     // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
@@ -327,11 +306,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
     // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
-    OCIO_CHECK_EQUAL(rgba[24], inf);
-#else
     OCIO_CHECK_CLOSE(rgba[24], ComputeLog2LinEval(-inf, redP), rtol);
-#endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
     // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
@@ -641,11 +616,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_EQUAL(rgba[8], -inf);
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[9], inf);
-#endif
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));
 }
 

From 50ae9dd241009c9801682b9bcdbf7d61692dc443 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 16 Feb 2023 11:57:32 -0500
Subject: [PATCH 09/81] Adding a new option OCIO_USE_SIMD which does the same
 thing as OCIO_USE_SSE (they are sync up). Will replace OCIO_USE_SSE
 eventually.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 .github/workflows/ci_workflow.yml     | 6 +++---
 CMakeLists.txt                        | 9 +++++----
 docs/quick_start/installation.rst     | 2 ++
 share/cmake/modules/FindOpenEXR.cmake | 2 ++
 share/cmake/utils/CompilerFlags.cmake | 4 ++--
 src/OpenColorIO/CMakeLists.txt        | 4 ++--
 tests/cpu/CMakeLists.txt              | 6 +++---
 tests/gpu/CMakeLists.txt              | 4 ++--
 tests/osl/CMakeLists.txt              | 4 ++--
 9 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml
index 77527cd723..102d83914d 100644
--- a/.github/workflows/ci_workflow.yml
+++ b/.github/workflows/ci_workflow.yml
@@ -275,7 +275,7 @@ jobs:
                 -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \
                 -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
                 -DOCIO_BUILD_GPU_TESTS=OFF \
-                -DOCIO_USE_SSE=${{ matrix.use-sse }} \
+                -DOCIO_USE_SIMD=${{ matrix.use-sse }} \
                 -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \
                 -DOCIO_INSTALL_EXT_PACKAGES=ALL \
                 -DOCIO_WARNING_AS_ERROR=ON \
@@ -441,7 +441,7 @@ jobs:
                 -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \
                 -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
                 -DOCIO_BUILD_GPU_TESTS=OFF \
-                -DOCIO_USE_SSE=${{ matrix.use-sse }} \
+                -DOCIO_USE_SIMD=${{ matrix.use-sse }} \
                 -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \
                 -DOCIO_INSTALL_EXT_PACKAGES=ALL \
                 -DOCIO_WARNING_AS_ERROR=ON \
@@ -613,7 +613,7 @@ jobs:
                 -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \
                 -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
                 -DOCIO_BUILD_GPU_TESTS=OFF \
-                -DOCIO_USE_SSE=${{ matrix.use-sse }} \
+                -DOCIO_USE_SIMD=${{ matrix.use-sse }} \
                 -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \
                 -DOCIO_INSTALL_EXT_PACKAGES=ALL \
                 -DOCIO_WARNING_AS_ERROR=ON \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9f911ca508..853a445afb 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -147,12 +147,13 @@ endif()
 
 ###############################################################################
 # Optimization / internal linking preferences
-
+# SIMD refers to either SIMD or Advanced SIMD terminology.
 option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON)
+mark_as_advanced(OCIO_USE_SSE)
+option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations (Will replace OCIO_USE_SSE)" ${OCIO_USE_SSE})
 option(OCIO_USE_OIIO_CMAKE_CONFIG "Specify whether to look for OIIO using the generated CMake Config script instead of the custom FindOpenImageIO.cmake script" OFF)
 option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." OFF)
 
-
 ###############################################################################
 # GPU configuration
 
@@ -162,7 +163,7 @@ include(CheckSupportGL)
 ###############################################################################
 # Check for ARM neon
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
     include(CheckSupportARMNeon)
 endif()
 
@@ -170,7 +171,7 @@ endif()
 ###############################################################################
 # Add sse2neon to the build if ARM NEON intrinsics are supported.
 
-if(HAVE_NEON AND OCIO_USE_SSE)
+if(HAVE_NEON AND OCIO_USE_SIMD)
     # Install sse2neon. Please note that sse2neon is downloaded during the configure step as it is
     # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
 
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index 600fa89e12..b31513c586 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -277,6 +277,8 @@ Here are the most common OCIO-specific CMake options (the default values are sho
 - ``-DOCIO_BUILD_PYTHON=ON`` (Set to OFF to not build the Python binding)
 - ``-DOCIO_BUILD_OPENFX=OFF`` (Set to ON to build the OpenFX plug-ins)
 - ``-DOCIO_USE_SSE=ON`` (Set to OFF to turn off SSE CPU performance optimizations)
+  - ``OCIO_USE_SSE`` will be deprecated in favor of ``OCIO_USE_SIMD`` at some point in the future.
+- ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations)
 - ``-DOCIO_BUILD_TESTS=ON`` (Set to OFF to not build the unit tests)
 - ``-DOCIO_BUILD_GPU_TESTS=ON`` (Set to OFF to not build the GPU unit tests)
 - ``-DOCIO_USE_HEADLESS=OFF`` (Set to ON to do headless GPU reendering)
diff --git a/share/cmake/modules/FindOpenEXR.cmake b/share/cmake/modules/FindOpenEXR.cmake
index a8b8448fd6..b49dc59a74 100644
--- a/share/cmake/modules/FindOpenEXR.cmake
+++ b/share/cmake/modules/FindOpenEXR.cmake
@@ -241,6 +241,7 @@ if(_OpenEXR_TARGET_CREATE)
         IMPORTED_LOCATION ${IlmThread_LIBRARY}
         INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR}"
         INTERFACE_LINK_LIBRARIES "OpenEXR::IlmThreadConfig;OpenEXR::IlmThreadConfig;OpenEXR::Iex;Threads::Threads"
+        STATIC_LIBRARY_OPTIONS "-no_warning_for_no_symbols"
     )
     set_target_properties(OpenEXR::IlmThreadConfig PROPERTIES
         INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR};${OpenEXR_INCLUDE_DIR}/OpenEXR"
@@ -257,6 +258,7 @@ if(_OpenEXR_TARGET_CREATE)
         IMPORTED_LOCATION ${OpenEXRCore_LIBRARY}
         INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR}"
         INTERFACE_LINK_LIBRARIES "OpenEXR::IlmThreadConfig;ZLIB::ZLIB;\$<LINK_ONLY:Imath::ImathConfig>"
+        STATIC_LIBRARY_OPTIONS "-no_warning_for_no_symbols"
     )
     set_target_properties(OpenEXR::OpenEXRUtil PROPERTIES
         IMPORTED_LOCATION ${OpenEXRUtil_LIBRARY}
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index 19f093f79b..af8839391b 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -10,13 +10,13 @@ set(PLATFORM_COMPILE_FLAGS "")
 ###############################################################################
 # Define if SSE2 can be used.
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
     include(CheckSupportSSE2)
 endif()
 
 if(NOT HAVE_SSE2 AND NOT HAVE_SSE2_WITH_SSE2NEON)
     message(STATUS "Disabling SSE optimizations, as the target doesn't support them")
-    set(OCIO_USE_SSE OFF)
+    set(OCIO_USE_SIMD OFF)
 endif()
 
 ###############################################################################
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 85ef53bb2a..f75acce6a0 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -292,7 +292,7 @@ target_link_libraries(OpenColorIO
         MINIZIP::minizip-ng
 )
 
-if(OCIO_USE_SSE AND HAVE_SSE2_WITH_SSE2NEON)
+if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON)
     target_link_libraries(OpenColorIO
         PRIVATE
             sse2neon
@@ -329,7 +329,7 @@ if(BUILD_SHARED_LIBS OR (OCIO_BUILD_PYTHON AND UNIX))
     set_property(TARGET OpenColorIO PROPERTY POSITION_INDEPENDENT_CODE ON)
 endif()
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
     if(HAVE_SSE2)
         target_compile_definitions(OpenColorIO
             PRIVATE
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index 30716af51f..6f7c2affc4 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -27,7 +27,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
             xxHash
     )
 
-    if(OCIO_USE_SSE AND HAVE_SSE2_WITH_SSE2NEON)
+    if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON)
         target_link_libraries(${TEST_BINARY}
             PRIVATE
                 sse2neon
@@ -51,7 +51,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
         )
     endif(PRIVATE_INCLUDES)
 
-    if(OCIO_USE_SSE)
+    if(OCIO_USE_SIMD)
         if (HAVE_SSE2)
             target_compile_definitions(${TEST_BINARY}
                 PRIVATE
@@ -65,7 +65,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
                     USE_SSE2_WITH_SSE2NEON
             )
         endif()
-    endif(OCIO_USE_SSE)
+    endif(OCIO_USE_SIMD)
 
     if(WIN32)
         # A windows application linking to eXpat static libraries must
diff --git a/tests/gpu/CMakeLists.txt b/tests/gpu/CMakeLists.txt
index 051abe72eb..38761173fe 100644
--- a/tests/gpu/CMakeLists.txt
+++ b/tests/gpu/CMakeLists.txt
@@ -26,12 +26,12 @@ set(SOURCES
 
 add_executable(test_gpu_exec ${SOURCES})
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
 	target_compile_definitions(test_gpu_exec
 		PRIVATE
 			USE_SSE
 	)
-endif(OCIO_USE_SSE)
+endif(OCIO_USE_SIMD)
 
 set_target_properties(test_gpu_exec PROPERTIES 
 	COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
diff --git a/tests/osl/CMakeLists.txt b/tests/osl/CMakeLists.txt
index 6e2c107a89..efe90f21cf 100644
--- a/tests/osl/CMakeLists.txt
+++ b/tests/osl/CMakeLists.txt
@@ -18,12 +18,12 @@ set(SOURCES
 
 add_executable(test_osl_exec ${SOURCES})
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
     target_compile_definitions(test_osl_exec
         PRIVATE
             USE_SSE
     )
-endif(OCIO_USE_SSE)
+endif(OCIO_USE_SIMD)
 
 set_target_properties(test_osl_exec PROPERTIES 
     COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")

From 0bc82082e8f1ca0dc30e300f713e035644c79587 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 17 Feb 2023 08:57:13 -0500
Subject: [PATCH 10/81] Fix last issue discovered by unit test in sseExp2
 Reverted defined changes in LogOpCPU_tests Updated ocio.bat with
 OCIO_USE_SIMD Minors comments changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                       |  1 -
 docs/quick_start/installation.rst    |  5 ++---
 share/dev/windows/ocio.bat           |  2 +-
 src/OpenColorIO/SSE.h                | 14 ++++++--------
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 22 +++++++++++-----------
 5 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 853a445afb..365726bfc9 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -147,7 +147,6 @@ endif()
 
 ###############################################################################
 # Optimization / internal linking preferences
-# SIMD refers to either SIMD or Advanced SIMD terminology.
 option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON)
 mark_as_advanced(OCIO_USE_SSE)
 option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations (Will replace OCIO_USE_SSE)" ${OCIO_USE_SSE})
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index b31513c586..c792a06edf 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -276,9 +276,8 @@ Here are the most common OCIO-specific CMake options (the default values are sho
 - ``-DOCIO_USE_OIIO_FOR_APPS=OFF`` (Set ON to build tools with OpenImageIO rather than OpenEXR)
 - ``-DOCIO_BUILD_PYTHON=ON`` (Set to OFF to not build the Python binding)
 - ``-DOCIO_BUILD_OPENFX=OFF`` (Set to ON to build the OpenFX plug-ins)
-- ``-DOCIO_USE_SSE=ON`` (Set to OFF to turn off SSE CPU performance optimizations)
-  - ``OCIO_USE_SSE`` will be deprecated in favor of ``OCIO_USE_SIMD`` at some point in the future.
-- ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations)
+- ``-DOCIO_USE_SSE=ON`` (Deprecated -- please use OCIO_USE_SIMD)
+- ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations, such as SSE and NEON)
 - ``-DOCIO_BUILD_TESTS=ON`` (Set to OFF to not build the unit tests)
 - ``-DOCIO_BUILD_GPU_TESTS=ON`` (Set to OFF to not build the GPU unit tests)
 - ``-DOCIO_USE_HEADLESS=OFF`` (Set to ON to do headless GPU reendering)
diff --git a/share/dev/windows/ocio.bat b/share/dev/windows/ocio.bat
index 7f24bc279b..a4762a97d9 100644
--- a/share/dev/windows/ocio.bat
+++ b/share/dev/windows/ocio.bat
@@ -206,7 +206,7 @@ if !DO_CONFIGURE!==1 (
         -DOCIO_BUILD_TESTS=ON^
         -DOCIO_BUILD_GPU_TESTS=ON^
         -DOCIO_BUILD_DOCS=OFF^
-        -DOCIO_USE_SSE=ON^
+        -DOCIO_USE_SIMD=ON^
         -DOCIO_WARNING_AS_ERROR=ON^
         -DOCIO_BUILD_JAVA=OFF^
         "!OCIO_PATH!"
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 57ea87a08f..95d7b88d49 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -6,19 +6,17 @@
 #define INCLUDED_OCIO_SSE_H
 
 
-#ifndef USE_SSE
-    #define USING_CPP 1
-#else
+#if defined(USE_SSE) || defined(USE_SSE2_WITH_SSE2NEON)
 
 // If it is not arm64, same behavior as before.
 #if !defined(__aarch64__)
-    #include <emmintrin.h>
-    #define USING_INTEL_SSE2 1
+    #if defined(USE_SSE)
+        #include <emmintrin.h>
+    #endif
 #elif defined(__aarch64__)
     // ARM architecture A64 (ARM64)
     #if defined(USE_SSE2_WITH_SSE2NEON)
         #include <sse2neon.h>
-        #define USING_INTEL_SSE2_WITH_SSE2NEON 1
     #endif
 #endif
 
@@ -82,7 +80,7 @@ static const __m128i EBIAS = _mm_set1_epi32(EXP_BIAS);
 static const __m128 EONE    = _mm_set1_ps(1.0f);
 static const __m128 EZERO   = _mm_set1_ps(0.0f);
 static const __m128 ENEG126 = _mm_set1_ps(-126.0f);
-static const __m128 EPOS127 = _mm_set1_ps(127.0f);
+static const __m128 EPOS128 = _mm_set1_ps(128.0f);
 
 static const __m128 EPOSINF = _mm_set1_ps(std::numeric_limits<float>::infinity());
 
@@ -293,7 +291,7 @@ inline __m128 sseExp2(__m128 x)
     // potentially happening. When this happens, force the result to positive infinity.
     // Note that as described above, floor_x is inaccurate, so the test here uses x.
     exp2 = sseSelect(                                   // (...) is a mask to select EPOSINF, exp2
-        _mm_cmpgt_ps(x, EPOS127),                       // iexp > EPOS127 ? 0xFFFFFFFF : 0
+        _mm_cmpge_ps(x, EPOS128),                       // iexp > EPOS128 ? 0xFFFFFFFF : 0
         EPOSINF, 
         exp2);
 
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 35ec055856..04430b75f2 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -35,7 +35,7 @@ void TestLog(float logBase)
 
     // LogOpCPU implementation uses optimized logarithm approximation
     // cannot use strict comparison.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error = 5e-5f;
 #else
     const float error = 1e-5f;
@@ -71,7 +71,7 @@ void TestLog(float logBase)
     // SSE implementation of sseLog2 & sseExp2 do not behave like CPU.
     // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below.
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     if (logBase == 10.0f)
     {
         OCIO_CHECK_CLOSE(rgba[16], 38.53184509f, error);
@@ -431,7 +431,7 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error);
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
@@ -477,7 +477,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, numPixels);
 
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
@@ -490,7 +490,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error);
@@ -498,7 +498,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO_CHECK_CLOSE(rgba[6], 0.68141615509f, error);
 
    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_EQUAL(rgba[8], -inf);
     OCIO_CHECK_CLOSE(rgba[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));
@@ -525,7 +525,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error);
@@ -534,7 +534,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_EQUAL(rgba_nols[8], -inf);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba_nols[10]));
 #else
@@ -552,7 +552,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRendererNoBreak = OCIO::GetLogRenderer(lognobreak, true);
     pRendererNoBreak->apply(rgbaImage, rgba_nobreak, numPixels);
 
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error2 = 1e-5f;
 #else
     const float error2 = 1e-7f;
@@ -570,7 +570,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2);
 #else
     OCIO_CHECK_EQUAL(rgba_nobreak[9], inf);
@@ -596,7 +596,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, 3);
 
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;

From e96972908f70235982a07b8ed830a82ecb1310b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 17 Feb 2023 09:24:39 -0500
Subject: [PATCH 11/81] Fixing typo, using #ifdef since this is what we were
 using before.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 04430b75f2..06e98fe0da 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -35,7 +35,7 @@ void TestLog(float logBase)
 
     // LogOpCPU implementation uses optimized logarithm approximation
     // cannot use strict comparison.
-#if USE_SSE
+#ifdef USE_SSE
     const float error = 5e-5f;
 #else
     const float error = 1e-5f;
@@ -71,7 +71,7 @@ void TestLog(float logBase)
     // SSE implementation of sseLog2 & sseExp2 do not behave like CPU.
     // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below.
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#if USE_SSE
+#ifdef USE_SSE
     if (logBase == 10.0f)
     {
         OCIO_CHECK_CLOSE(rgba[16], 38.53184509f, error);
@@ -431,7 +431,7 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error);
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
@@ -477,7 +477,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, numPixels);
 
-#if USE_SSE
+#ifdef USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
@@ -490,7 +490,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error);
@@ -498,7 +498,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO_CHECK_CLOSE(rgba[6], 0.68141615509f, error);
 
    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_EQUAL(rgba[8], -inf);
     OCIO_CHECK_CLOSE(rgba[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));
@@ -525,7 +525,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error);
@@ -534,7 +534,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_EQUAL(rgba_nols[8], -inf);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba_nols[10]));
 #else
@@ -552,7 +552,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRendererNoBreak = OCIO::GetLogRenderer(lognobreak, true);
     pRendererNoBreak->apply(rgbaImage, rgba_nobreak, numPixels);
 
-#if USE_SSE
+#ifdef USE_SSE
     const float error2 = 1e-5f;
 #else
     const float error2 = 1e-7f;
@@ -570,7 +570,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2);
 #else
     OCIO_CHECK_EQUAL(rgba_nobreak[9], inf);
@@ -596,7 +596,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, 3);
 
-#if USE_SSE
+#ifdef USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;

From afd1effe2759db42a0546cc59d1a63db14220bbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 23 Feb 2023 10:40:07 -0500
Subject: [PATCH 12/81] Adding a line for universal build option for Cmake
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 docs/quick_start/installation.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index c792a06edf..d17fb618a9 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -285,6 +285,10 @@ Here are the most common OCIO-specific CMake options (the default values are sho
 - ``-DOCIO_BUILD_DOCS=OFF`` (Set to ON to build the documentation)
 - ``-DOCIO_BUILD_FROZEN_DOCS=OFF`` (Set to ON to update the Python documentation)
 
+On the newest Apple chipset (M1+), a universal build can be done with the following option:
+
+- ``-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"``
+
 Several command-line tools (such as ``ocioconvert``) require reading or writing image files.
 If ``OCIO_USE_OIIO_FOR_APPS=OFF``, these will be built using OpenEXR rather than OpenImageIO
 and therefore you will be limited to using OpenEXR files with these tools rather than the

From 70fc696669117e6c05017479237281f7a7a6cbe9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 3 Mar 2023 11:41:37 -0500
Subject: [PATCH 13/81] Documentations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                    | 4 +++-
 docs/quick_start/installation.rst | 3 +++
 src/OpenColorIO/SSE.h             | 4 ++--
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 365726bfc9..18e637c465 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -147,9 +147,11 @@ endif()
 
 ###############################################################################
 # Optimization / internal linking preferences
+# TODO Remove OCIO_USE_SSE once it is fully deprecated.
 option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON)
+# TODO Remove mark_as_advanced once OCIO_USE_SSE is fully deprecated.
 mark_as_advanced(OCIO_USE_SSE)
-option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations (Will replace OCIO_USE_SSE)" ${OCIO_USE_SSE})
+option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ${OCIO_USE_SSE})
 option(OCIO_USE_OIIO_CMAKE_CONFIG "Specify whether to look for OIIO using the generated CMake Config script instead of the custom FindOpenImageIO.cmake script" OFF)
 option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." OFF)
 
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index d17fb618a9..710a625682 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -289,6 +289,9 @@ On the newest Apple chipset (M1+), a universal build can be done with the follow
 
 - ``-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"``
 
+Please note that OCIO dependencies must be built as universal libraries as well. If you can't build all the 
+dependencies as universal libraries, you can set ``OCIO_INSTAL_EXT_PACKAGES=ALL`` and OCIO will handle it.
+
 Several command-line tools (such as ``ocioconvert``) require reading or writing image files.
 If ``OCIO_USE_OIIO_FOR_APPS=OFF``, these will be built using OpenEXR rather than OpenImageIO
 and therefore you will be limited to using OpenEXR files with these tools rather than the
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 95d7b88d49..81bd15c0dc 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -8,7 +8,7 @@
 
 #if defined(USE_SSE) || defined(USE_SSE2_WITH_SSE2NEON)
 
-// If it is not arm64, same behavior as before.
+// Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM).
 #if !defined(__aarch64__)
     #if defined(USE_SSE)
         #include <emmintrin.h>
@@ -110,7 +110,7 @@ inline __m128 isNegativeSpecial(const __m128 x)
 // Bit-wise select function in SSE version 2
 //
 // Return the parameter arg_false bit where the parameter mask is 0x0,
-// return the parameter arg_true bit where the mask is 0x1.
+// return the parameter arg_true bit where the mask is 1.
 //
 // Algorithm Explanation:
 //

From 9358c1de87a594fbb3614e36441bb608e0c7a6d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Mon, 6 Mar 2023 15:16:29 -0500
Subject: [PATCH 14/81] Universal build is the default for APPLE documentation
 tweak
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                    | 10 +++++++++-
 docs/quick_start/installation.rst | 15 +++++++++------
 src/OpenColorIO/SSE.h             |  2 +-
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 18e637c465..23377ac87f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,6 +25,14 @@ if(APPLE AND NOT DEFINED CMAKE_OSX_DEPLOYMENT_TARGET)
 endif()
 
 
+###############################################################################
+# Universal library is the default for ARM architecture under MacOS.
+
+if(APPLE AND (NOT DEFINED CMAKE_OSX_ARCHITECTURES OR CMAKE_OSX_ARCHITECTURES STREQUAL ""))
+    set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "Default OS X architectures" FORCE)
+endif()
+
+
 ###############################################################################
 # Project definition.
 
@@ -148,7 +156,7 @@ endif()
 ###############################################################################
 # Optimization / internal linking preferences
 # TODO Remove OCIO_USE_SSE once it is fully deprecated.
-option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON)
+option(OCIO_USE_SSE "Specify whether to enable SSE (supplanted by OCIO_USE_SIMD)" ON)
 # TODO Remove mark_as_advanced once OCIO_USE_SSE is fully deprecated.
 mark_as_advanced(OCIO_USE_SSE)
 option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ${OCIO_USE_SSE})
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index 710a625682..91265329d9 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -285,12 +285,15 @@ Here are the most common OCIO-specific CMake options (the default values are sho
 - ``-DOCIO_BUILD_DOCS=OFF`` (Set to ON to build the documentation)
 - ``-DOCIO_BUILD_FROZEN_DOCS=OFF`` (Set to ON to update the Python documentation)
 
-On the newest Apple chipset (M1+), a universal build can be done with the following option:
-
-- ``-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"``
-
-Please note that OCIO dependencies must be built as universal libraries as well. If you can't build all the 
-dependencies as universal libraries, you can set ``OCIO_INSTAL_EXT_PACKAGES=ALL`` and OCIO will handle it.
+On the MacOS under the ARM architecture, the default is to make a universal build 
+(natively supporting both the Intel and ARM processors). The ``-DCMAKE_OSX_ARCHITECTURES`` option 
+may be set to just arm64 or x86_64 to override the default value, which is ``arm64;x86_64``.
+
+When doing a universal build, note that the OCIO dependencies must be built as universal libraries 
+too. If you are running in OCIO_INSTALL_EXT_PACKAGES=MISSING or NONE mode, your build will fail if 
+any of your installed libraries are not universal. The easiest way to address this is to set 
+OCIO_INSTALL_EXT_PACKAGES=ALL in order to let OCIO build everything. Alternatively, you may set 
+CMAKE_OSX_ARCHITECTURES to just the platform you are targeting.
 
 Several command-line tools (such as ``ocioconvert``) require reading or writing image files.
 If ``OCIO_USE_OIIO_FOR_APPS=OFF``, these will be built using OpenEXR rather than OpenImageIO
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 81bd15c0dc..1aa853997e 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -33,7 +33,7 @@ namespace OCIO_NAMESPACE
 #if defined(__aarch64__)
     #if defined(USE_SSE2_WITH_SSE2NEON)
         // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to 
-        // NaN handling.
+        // NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior.
 
         // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were 
         // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the 

From c5ca7df69b855d198ba8dfd568b607f0a7773d35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Tue, 7 Mar 2023 09:59:05 -0500
Subject: [PATCH 15/81] documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 23377ac87f..10c734cebf 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,7 +26,7 @@ endif()
 
 
 ###############################################################################
-# Universal library is the default for ARM architecture under MacOS.
+# By default, build the library, tests, tools, and Python binding as universal binaries for macOS.
 
 if(APPLE AND (NOT DEFINED CMAKE_OSX_ARCHITECTURES OR CMAKE_OSX_ARCHITECTURES STREQUAL ""))
     set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "Default OS X architectures" FORCE)

From 5585483c9f407d03923cd44f3d802a6111ec62fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Mon, 13 Mar 2023 10:49:01 -0400
Subject: [PATCH 16/81] Fixing issue for the static build test in CI workflow
 Removing Findsse2neon as it is not needed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                                | 26 +++++++--
 share/cmake/modules/Findsse2neon.cmake        | 58 -------------------
 .../modules/install/Installsse2neon.cmake     | 15 +++--
 share/cmake/utils/CheckSupportSSE2.cmake      |  8 +--
 src/OpenColorIO/CMakeLists.txt                |  5 +-
 tests/cpu/CMakeLists.txt                      |  5 +-
 6 files changed, 39 insertions(+), 78 deletions(-)
 delete mode 100644 share/cmake/modules/Findsse2neon.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ccb643f837..fccafd2ae6 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -207,11 +207,29 @@ if(HAVE_NEON AND OCIO_USE_SIMD)
     # Install sse2neon. Please note that sse2neon is downloaded during the configure step as it is
     # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
 
-    # Could use ocio_handle_dependency once this is rebased from main (once the CMake min and 
-    # recommended version branch is merged).
-    find_package(sse2neon QUIET)
-    if (NOT sse2neon_FOUND)
+    # Sse2neon is not treated like an imported target. The logic to find sse2neon is here because 
+    # a find module is not suitable for sse2neon's use case.
+    find_path(sse2neon_INCLUDE_DIR
+        NAMES
+            sse2neon.h
+        HINTS
+            ${sse2neon_ROOT}
+        PATH_SUFFIXES
+            sse2neon
+            include
+            sse2neon/include
+    )
+
+    if (NOT sse2neon_INCLUDE_DIR)
         include(Installsse2neon)
+    else()
+        # Any changes to the following lines must be replicated in Installsse2neon.cmake as well.
+        # Create a target for sse2neon (non-imported)
+        add_library(sse2neon INTERFACE)
+        # Add the include directories to the target.
+        target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}")
+        # Ignore the warnings coming from sse2neon.h as they are false positives.
+        target_compile_options(sse2neon INTERFACE -Wno-unused-parameter)
     endif()
 endif()
 
diff --git a/share/cmake/modules/Findsse2neon.cmake b/share/cmake/modules/Findsse2neon.cmake
deleted file mode 100644
index 7dea6740cb..0000000000
--- a/share/cmake/modules/Findsse2neon.cmake
+++ /dev/null
@@ -1,58 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright Contributors to the OpenColorIO Project.
-#
-# Locate sse2neon (header-only version)
-#
-# Variables defined by this module:
-#   sse2neon_FOUND          - Indicate whether the library was found or not
-#   sse2neon_INCLUDE_DIR    - Location of the header files
-#
-# Global targets defined by this module:
-#   sse2neon
-###############################################################################
-### Try to find package ###
-
-if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
-    # Find include directory
-    find_path(sse2neon_INCLUDE_DIR
-        NAMES
-            sse2neon.h
-        HINTS
-            ${sse2neon_ROOT}
-        PATH_SUFFIXES
-            include
-            sse2neon/include
-    )
-
-    # Override REQUIRED if package can be installed
-    if(OCIO_INSTALL_EXT_PACKAGES STREQUAL MISSING)
-        set(sse2neon_FIND_REQUIRED FALSE)
-    endif()
-
-    include(FindPackageHandleStandardArgs)
-    find_package_handle_standard_args(sse2neon
-        REQUIRED_VARS 
-            sse2neon_INCLUDE_DIR 
-    )
-    set(sse2neon_FOUND ${sse2neon_FOUND})
-endif()
-
-###############################################################################
-### Configure target ###
-
-if(sse2neon_FOUND AND NOT TARGET sse2neon)
-    # INTERFACE type since we know that this is a header-only library.
-    add_library(sse2neon INTERFACE IMPORTED GLOBAL)
-    set(_sse2neon_TARGET_CREATE TRUE)
-endif()
-
-###############################################################################
-### Configure target ###
-
-if(_sse2neon_TARGET_CREATE)
-    set_target_properties(sse2neon PROPERTIES
-        INTERFACE_INCLUDE_DIRECTORIES ${sse2neon_INCLUDE_DIR}
-    )
-
-    mark_as_advanced(sse2neon_INCLUDE_DIR)
-endif()
\ No newline at end of file
diff --git a/share/cmake/modules/install/Installsse2neon.cmake b/share/cmake/modules/install/Installsse2neon.cmake
index 16b47798cd..5f0f810ca1 100644
--- a/share/cmake/modules/install/Installsse2neon.cmake
+++ b/share/cmake/modules/install/Installsse2neon.cmake
@@ -29,8 +29,15 @@ if(NOT sse2neon_POPULATED)
   set(_EXT_DIST_INCLUDE "${CMAKE_BINARY_DIR}/ext/dist/${CMAKE_INSTALL_INCLUDEDIR}")
   file(COPY "${sse2neon_SOURCE_DIR}/sse2neon.h" DESTINATION "${_EXT_DIST_INCLUDE}/sse2neon")
 
-  add_library(sse2neon INTERFACE IMPORTED GLOBAL)
-  set_target_properties(sse2neon PROPERTIES
-      INTERFACE_INCLUDE_DIRECTORIES "${_EXT_DIST_INCLUDE}/sse2neon"
-  )
+  # sse2neon_INCLUDE_DIR is used internally for CheckSupportSSE2.cmake and to create sse2neon
+  # target for OCIO.
+  set(sse2neon_INCLUDE_DIR "${sse2neon_SOURCE_DIR}")
+
+  # Any changes to the following lines must be replicated in ./CMakeLists.txt as well.
+  # Create a target for sse2neon (non-imported)
+  add_library(sse2neon INTERFACE)
+  # Add the include directories to the target.
+  target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}")
+  # Ignore the warnings coming from sse2neon.h as they are false positives.
+  target_compile_options(sse2neon INTERFACE -Wno-unused-parameter)
 endif()
\ No newline at end of file
diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index 26bc0b396d..07fecbd7a5 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -4,7 +4,7 @@
 include(CheckCXXSourceCompiles)
 
 set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}")
-set(_cmake_required_libraries_orig "${CMAKE_REQUIRED_LIBRARIES}")
+set(_cmake_required_includes_orig "${CMAKE_REQUIRED_INCLUDES}")
 set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
 
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
@@ -49,7 +49,7 @@ elseif(APPLE AND HAVE_NEON)
         set (CMAKE_OSX_ARCHITECTURES "${current_arch}")
         
         if(current_arch STREQUAL arm64)
-            set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+            set(CMAKE_REQUIRED_INCLUDES ${sse2neon_INCLUDE_DIR})
             set(_sse2_header_ "#include <sse2neon.h>")
             set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON")
         elseif(current_arch STREQUAL x86_64)
@@ -63,11 +63,11 @@ elseif(APPLE AND HAVE_NEON)
 endif()
 
 set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}")
-set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_orig}")
+set(CMAKE_REQUIRED_INCLUDES "${_cmake_required_includes_orig}")
 set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}")
 
 unset(_cmake_required_flags_orig)
-unset(_cmake_required_libraries_orig)
+unset(_cmake_required_includes_orig)
 unset(_cmake_osx_architectures_orig)
 
 
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 791ade3f91..608cbac63a 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -293,10 +293,7 @@ target_link_libraries(OpenColorIO
 )
 
 if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON)
-    target_link_libraries(OpenColorIO
-        PRIVATE
-            sse2neon
-    )
+    target_link_libraries(OpenColorIO PRIVATE $<BUILD_INTERFACE:sse2neon>)
 endif()
 
 if(APPLE)
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index 245caeb9cb..edbb21ad2c 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -27,10 +27,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
     )
 
     if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON)
-        target_link_libraries(${TEST_BINARY}
-            PRIVATE
-                sse2neon
-        )
+        target_link_libraries(${TEST_BINARY} PRIVATE sse2neon)
     endif()
 
     if(APPLE)

From 914867bee2910bd0316c013333047d58928f03a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Wed, 1 Feb 2023 09:34:47 -0500
Subject: [PATCH 17/81] Adding support for sse2neon
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                              |   14 +
 ext/sse2neon/CMakeLists.txt                 |    9 +
 ext/sse2neon/src/include/sse2neon.h         | 9079 +++++++++++++++++++
 share/cmake/utils/CheckSupportARMNeon.cmake |   21 +
 share/cmake/utils/CheckSupportSSE2.cmake    |   35 +-
 share/cmake/utils/CompilerFlags.cmake       |   18 +-
 src/OpenColorIO/CMakeLists.txt              |   14 +
 src/OpenColorIO/SSE.h                       |    8 +-
 tests/cpu/CMakeLists.txt                    |   17 +
 9 files changed, 9199 insertions(+), 16 deletions(-)
 create mode 100644 ext/sse2neon/CMakeLists.txt
 create mode 100644 ext/sse2neon/src/include/sse2neon.h
 create mode 100644 share/cmake/utils/CheckSupportARMNeon.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 932874f81c..a05df702b8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,6 +185,20 @@ message(STATUS "Checking for GPU configuration...")
 include(CheckSupportGL)
 
 
+###############################################################################
+# Check for ARM neon intrinsics (armv8)
+
+include(CheckSupportARMNeon)
+
+
+###############################################################################
+# Add sse2neon to the build since CompilerFlags needs to know if SSE2 is supported.
+
+if(HAVE_NEON)
+    add_subdirectory(ext/sse2neon)
+endif()
+
+
 ###############################################################################
 # Define compilation and link flags
 
diff --git a/ext/sse2neon/CMakeLists.txt b/ext/sse2neon/CMakeLists.txt
new file mode 100644
index 0000000000..7fd5fcb302
--- /dev/null
+++ b/ext/sse2neon/CMakeLists.txt
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+# sse2neon (modified)
+# https://github.com/DLTcollab/sse2neon
+add_library(sse2neon INTERFACE IMPORTED GLOBAL)
+set_target_properties(sse2neon PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/src/include"
+)
diff --git a/ext/sse2neon/src/include/sse2neon.h b/ext/sse2neon/src/include/sse2neon.h
new file mode 100644
index 0000000000..164c6c3387
--- /dev/null
+++ b/ext/sse2neon/src/include/sse2neon.h
@@ -0,0 +1,9079 @@
+#ifndef SSE2NEON_H
+#define SSE2NEON_H
+
+// This header file provides a simple API translation layer
+// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
+//
+// Contributors to this work are:
+//   John W. Ratcliff <jratcliffscarab@gmail.com>
+//   Brandon Rowlett <browlett@nvidia.com>
+//   Ken Fast <kfast@gdeb.com>
+//   Eric van Beurden <evanbeurden@nvidia.com>
+//   Alexander Potylitsin <apotylitsin@nvidia.com>
+//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
+//   Jim Huang <jserv@ccns.ncku.edu.tw>
+//   Mark Cheng <marktwtn@gmail.com>
+//   Malcolm James MacLeod <malcolm@gulden.com>
+//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
+//   Sebastian Pop <spop@amazon.com>
+//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
+//   Danila Kutenin <danilak@google.com>
+//   François Turban (JishinMaster) <francois.turban@gmail.com>
+//   Pei-Hsuan Hung <afcidk@gmail.com>
+//   Yang-Hao Yuan <yuanyanghau@gmail.com>
+//   Syoyo Fujita <syoyo@lighttransport.com>
+//   Brecht Van Lommel <brecht@blender.org>
+//   Jonathan Hue <jhue@adobe.com>
+//   Cuda Chen <clh960524@gmail.com>
+//   Aymen Qader <aymen.qader@arm.com>
+
+/*
+ * sse2neon is freely redistributable under the MIT License.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Tunable configurations */
+
+/* Enable precise implementation of math operations
+ * This would slow down the computation a bit, but gives consistent result with
+ * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
+ */
+/* _mm_min|max_ps|ss|pd|sd */
+#ifndef SSE2NEON_PRECISE_MINMAX
+#define SSE2NEON_PRECISE_MINMAX (0)
+#endif
+/* _mm_rcp_ps and _mm_div_ps */
+#ifndef SSE2NEON_PRECISE_DIV
+#define SSE2NEON_PRECISE_DIV (0)
+#endif
+/* _mm_sqrt_ps and _mm_rsqrt_ps */
+#ifndef SSE2NEON_PRECISE_SQRT
+#define SSE2NEON_PRECISE_SQRT (0)
+#endif
+/* _mm_dp_pd */
+#ifndef SSE2NEON_PRECISE_DP
+#define SSE2NEON_PRECISE_DP (0)
+#endif
+
+/* compiler specific definitions */
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
+#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
+#else /* non-GNU / non-clang compilers */
+#warning "Macro name collisions may happen with unsupported compiler."
+#ifndef FORCE_INLINE
+#define FORCE_INLINE static inline
+#endif
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#define _sse2neon_likely(x) (x)
+#define _sse2neon_unlikely(x) (x)
+#endif
+
+/* C language does not allow initializing a variable with a function call. */
+#ifdef __cplusplus
+#define _sse2neon_const static const
+#else
+#define _sse2neon_const const
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#if defined(_WIN32)
+/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
+ * from both MinGW-w64 and MSVC.
+ */
+#define SSE2NEON_ALLOC_DEFINED
+#endif
+
+/* If using MSVC */
+#ifdef _MSC_VER
+#include <intrin.h>
+#if (defined(_M_AMD64) || defined(__x86_64__)) || \
+    (defined(_M_ARM) || defined(__arm__))
+#define SSE2NEON_HAS_BITSCAN64
+#endif
+#endif
+
+/* Compiler barrier */
+#define SSE2NEON_BARRIER()                     \
+    do {                                       \
+        __asm__ __volatile__("" ::: "memory"); \
+        (void) 0;                              \
+    } while (0)
+
+/* Memory barriers
+ * __atomic_thread_fence does not include a compiler barrier; instead,
+ * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
+ * semantics.
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+#include <stdatomic.h>
+#endif
+
+FORCE_INLINE void _sse2neon_smp_mb(void)
+{
+    SSE2NEON_BARRIER();
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
+    !defined(__STDC_NO_ATOMICS__)
+    atomic_thread_fence(memory_order_seq_cst);
+#elif defined(__GNUC__) || defined(__clang__)
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+#else
+    /* FIXME: MSVC support */
+#endif
+}
+
+/* Architecture-specific build options */
+/* FIXME: #pragma GCC push_options is only available on GCC */
+#if defined(__GNUC__)
+#if defined(__arm__) && __ARM_ARCH == 7
+/* According to ARM C Language Extensions Architecture specification,
+ * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
+ * architecture supported.
+ */
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("fpu=neon")
+#endif
+#elif defined(__aarch64__)
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("+simd")
+#endif
+#elif __ARM_ARCH == 8
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error \
+    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#endif
+#else
+#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#endif
+#endif
+
+#include <arm_neon.h>
+#if !defined(__aarch64__) && (__ARM_ARCH == 8)
+#if defined __has_include && __has_include(<arm_acle.h>)
+#include <arm_acle.h>
+#endif
+#endif
+
+/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD
+ * and other Arm microarchtectures use.
+ * From sysctl -a on Apple M1:
+ * hw.cachelinesize: 128
+ */
+#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
+#define SSE2NEON_CACHELINE_SIZE 128
+#else
+#define SSE2NEON_CACHELINE_SIZE 64
+#endif
+
+/* Rounding functions require either Aarch64 instructions or libm failback */
+#if !defined(__aarch64__)
+#include <math.h>
+#endif
+
+/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
+ * or even not accessible in user mode.
+ * To write or access to these registers in user mode,
+ * we have to perform syscall instead.
+ */
+#if !defined(__aarch64__)
+#include <sys/time.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if defined(__GNUC__) && (__GNUC__ <= 9)
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
+
+// __builtin_shuffle introduced in GCC 4.7.0
+#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
+#define HAS__builtin_shuffle 1
+#else
+#define HAS__builtin_shuffle 0
+#endif
+
+#define HAS__builtin_shufflevector 0
+#define HAS__builtin_nontemporal_store 0
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+#if __has_builtin(__builtin_shufflevector)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __builtin_shufflevector(a, b, __VA_ARGS__)
+#elif __has_builtin(__builtin_shuffle)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __extension__({                        \
+        type tmp = {__VA_ARGS__};          \
+        __builtin_shuffle(a, b, tmp);      \
+    })
+#endif
+
+#ifdef _sse2neon_shuffle
+#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
+#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
+#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
+#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
+#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
+#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
+#endif
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+#define _MM_FROUND_NO_EXC 0x08
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+#define _MM_ROUND_NEAREST 0x0000
+#define _MM_ROUND_DOWN 0x2000
+#define _MM_ROUND_UP 0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+/* Flush zero mode macros. */
+#define _MM_FLUSH_ZERO_MASK 0x8000
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_FLUSH_ZERO_OFF 0x0000
+/* Denormals are zeros mode macros. */
+#define _MM_DENORMALS_ZERO_MASK 0x0040
+#define _MM_DENORMALS_ZERO_ON 0x0040
+#define _MM_DENORMALS_ZERO_OFF 0x0000
+
+/* indicate immediate constant argument in a given range */
+#define __constrange(a, b) const
+
+/* A few intrinsics accept traditional data types like ints or floats, but
+ * most operate on data types that are specific to SSE.
+ * If a vector type ends in d, it contains doubles, and if it does not have
+ * a suffix, it contains floats. An integer vector type can contain any type
+ * of integer, from chars to shorts to unsigned long longs.
+ */
+typedef int64x1_t __m64;
+typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
+// On ARM 32-bit architecture, the float64x2_t is not supported.
+// The data type __m128d should be represented in a different way for related
+// intrinsic conversion.
+#if defined(__aarch64__)
+typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
+#else
+typedef float32x4_t __m128d;
+#endif
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
+
+// __int64 is defined in the Intrinsics Guide which maps to different datatype
+// in different data model
+#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
+#if (defined(__x86_64__) || defined(__i386__))
+#define __int64 long long
+#else
+#define __int64 int64_t
+#endif
+#endif
+
+/* type-safe casting between types */
+
+#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
+#define vreinterpretq_m128_f32(x) (x)
+#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
+
+#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
+#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
+#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
+#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
+#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
+#define vreinterpretq_f32_m128(x) (x)
+#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
+
+#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
+#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
+#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
+#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
+#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
+#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_m128i_s64(x) (x)
+
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
+
+#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
+#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
+#define vreinterpretq_s64_m128i(x) (x)
+
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
+
+#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
+#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
+#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
+#define vreinterpret_m64_s64(x) (x)
+
+#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
+#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
+#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
+#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
+
+#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
+#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
+#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
+
+#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
+#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
+#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
+#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
+
+#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
+#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
+#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
+#define vreinterpret_s64_m64(x) (x)
+
+#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
+
+#if defined(__aarch64__)
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
+
+#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
+#define vreinterpretq_m128d_f64(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
+
+#define vreinterpretq_f64_m128d(x) (x)
+#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
+#else
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128d_f32(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_f32_m128d(x) (x)
+#endif
+
+// A struct is defined in this header file called 'SIMDVec' which can be used
+// by applications which attempt to access the contents of an __m128 struct
+// directly.  It is important to note that accessing the __m128 struct directly
+// is bad coding practice by Microsoft: @see:
+// https://learn.microsoft.com/en-us/cpp/cpp/m128
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly so the developer can use the SIMDVec as an alias for it.  Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base NEON data type for intrinsic operations.
+//
+// union intended to allow direct access to an __m128 variable using the names
+// that the MSVC compiler provides.  This union should really only be used when
+// trying to access the members of the vector as integer values.  GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause
+// a performance hit.  If it really is needed however, the original __m128
+// variable can be aliased with a pointer to this union and used to access
+// individual components.  The use of this union should be hidden behind a macro
+// that is used throughout the codebase to access the members instead of always
+// declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
+    int8_t m128_i8[16];    // as signed 8-bit integers.
+    int16_t m128_i16[8];   // as signed 16-bit integers.
+    int32_t m128_i32[4];   // as signed 32-bit integers.
+    int64_t m128_i64[2];   // as signed 64-bit integers.
+    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
+    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
+    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
+    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
+
+/* SSE macros */
+#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
+#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
+#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
+#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
+
+// Function declaration
+// SSE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
+FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
+FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
+FORCE_INLINE __m128 _mm_set_ps1(float);
+FORCE_INLINE __m128 _mm_setzero_ps(void);
+// SSE2
+FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_castps_si128(__m128);
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
+FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
+FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
+FORCE_INLINE __m128d _mm_set_pd(double, double);
+FORCE_INLINE __m128i _mm_set1_epi32(int);
+FORCE_INLINE __m128i _mm_setzero_si128();
+// SSE4.1
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
+FORCE_INLINE __m128 _mm_ceil_ps(__m128);
+FORCE_INLINE __m128d _mm_floor_pd(__m128d);
+FORCE_INLINE __m128 _mm_floor_ps(__m128);
+FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
+FORCE_INLINE __m128 _mm_round_ps(__m128, int);
+// SSE4.2
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
+
+/* Backwards compatibility for compilers with lack of specific type support */
+
+// Older gcc does not define vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__) &&                        \
+    ((__GNUC__ <= 12 && defined(__arm__)) ||                           \
+     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
+     (__GNUC__ <= 9 && defined(__aarch64__)))
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    uint8x16x4_t ret;
+    ret.val[0] = vld1q_u8(p + 0);
+    ret.val[1] = vld1q_u8(p + 16);
+    ret.val[2] = vld1q_u8(p + 32);
+    ret.val[3] = vld1q_u8(p + 48);
+    return ret;
+}
+#else
+// Wraps vld1q_u8_x4
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    return vld1q_u8_x4(p);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddv u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
+    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
+}
+#else
+// Wraps vaddv_u8
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    return vaddv_u8(v8);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddvq u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+    uint8_t res = 0;
+    for (int i = 0; i < 8; ++i)
+        res += tmp[i];
+    return res;
+}
+#else
+// Wraps vaddvq_u8
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    return vaddvq_u8(a);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddvq u16 variant */
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    uint32x4_t m = vpaddlq_u16(a);
+    uint64x2_t n = vpaddlq_u32(m);
+    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
+
+    return vget_lane_u32((uint32x2_t) o, 0);
+}
+#else
+// Wraps vaddvq_u16
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    return vaddvq_u16(a);
+}
+#endif
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<data_type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <data_type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <data_type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors cantain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *                            signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *                            unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *                      than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ *   // Set packed 8-bit integers
+ *   // 128 bits, 16 chars, per 8 bits
+ *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
+ *   // Shuffle packed 8-bit integers
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ */
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint {
+    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
+    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
+    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
+    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
+};
+
+// The bit field mapping to the FPCR(floating-point control register)
+typedef struct {
+    uint16_t res0;
+    uint8_t res1 : 6;
+    uint8_t bit22 : 1;
+    uint8_t bit23 : 1;
+    uint8_t bit24 : 1;
+    uint8_t res2 : 7;
+#if defined(__aarch64__)
+    uint32_t res3;
+#endif
+} fpcr_bitfield;
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of b and places it into the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in high
+// end of result takes the higher two 32 bit values from b and swaps them and
+// places in low end of result.
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
+{
+    float32x2_t a21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+{
+    float32x2_t a03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
+}
+
+// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
+// high
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
+{
+    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
+{
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
+{
+    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
+{
+    float32x2_t a33 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
+}
+
+// Kahan summation for accurate summation of floating-point numbers.
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
+FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
+{
+    y -= *c;
+    float t = *sum + y;
+    *c = (t - *sum) - y;
+    *sum = t;
+}
+
+#if defined(__ARM_FEATURE_CRYPTO) && \
+    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
+// Wraps vmull_p64
+FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
+    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
+    return vreinterpretq_u64_p128(vmull_p64(a, b));
+}
+#else  // ARMv7 polyfill
+// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
+//
+// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
+// 64-bit->128-bit polynomial multiply.
+//
+// It needs some work and is somewhat slow, but it is still faster than all
+// known scalar methods.
+//
+// Algorithm adapted to C from
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
+// from "Fast Software Polynomial Multiplication on ARM Processors Using the
+// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+// (https://hal.inria.fr/hal-01506572)
+static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly8x8_t a = vreinterpret_p8_u64(_a);
+    poly8x8_t b = vreinterpret_p8_u64(_b);
+
+    // Masks
+    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
+                                    vcreate_u8(0x00000000ffffffff));
+    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
+                                    vcreate_u8(0x0000000000000000));
+
+    // Do the multiplies, rotating with vext to get all combinations
+    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
+    uint8x16_t e =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
+    uint8x16_t f =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
+    uint8x16_t g =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
+    uint8x16_t h =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
+    uint8x16_t i =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
+    uint8x16_t j =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
+    uint8x16_t k =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
+
+    // Add cross products
+    uint8x16_t l = veorq_u8(e, f);  // L = E + F
+    uint8x16_t m = veorq_u8(g, h);  // M = G + H
+    uint8x16_t n = veorq_u8(i, j);  // N = I + J
+
+    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
+    // instructions.
+#if defined(__aarch64__)
+    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+#else
+    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
+    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
+    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
+    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
+#endif
+    // t0 = (L) (P0 + P1) << 8
+    // t1 = (M) (P2 + P3) << 16
+    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
+    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
+    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
+
+    // t2 = (N) (P4 + P5) << 24
+    // t3 = (K) (P6 + P7) << 32
+    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
+    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
+    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
+
+    // De-interleave
+#if defined(__aarch64__)
+    uint8x16_t t0 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t1 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t2 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+    uint8x16_t t3 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+#else
+    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
+    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
+    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
+    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
+#endif
+    // Shift the cross products
+    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
+    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
+    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
+    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
+
+    // Accumulate the products
+    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
+    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
+    uint8x16_t mix = veorq_u8(d, cross1);
+    uint8x16_t r = veorq_u8(mix, cross2);
+    return vreinterpretq_u64_u8(r);
+}
+#endif  // ARMv7 polyfill
+
+// C equivalent:
+//   __m128i _mm_shuffle_epi32_default(__m128i a,
+//                                     __constrange(0, 255) int imm) {
+//       __m128i ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+#define _mm_shuffle_epi32_default(a, imm)                                   \
+    __extension__({                                                         \
+        int32x4_t ret;                                                      \
+        ret = vmovq_n_s32(                                                  \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                        \
+        vreinterpretq_m128i_s32(ret);                                       \
+    })
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of a and places it into the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in low end
+// of result takes the higher two 32 bit values from a and swaps them and places
+// in high end of result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+}
+
+// rotates the least significant 32 bits into the most significant 32 bits, and
+// shifts the rest down
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
+
+// rotates the most significant 32 bits into the least significant 32 bits, and
+// shifts the rest up
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
+
+// gets the lower 64 bits of a, and places it in the upper 64 bits
+// gets the lower 64 bits of a and places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+{
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
+// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
+// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
+// places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+{
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
+{
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+}
+
+// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
+// int imm)
+#if defined(__aarch64__)
+#define _mm_shuffle_epi32_splat(a, imm)                          \
+    __extension__({                                              \
+        vreinterpretq_m128i_s32(                                 \
+            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
+    })
+#else
+#define _mm_shuffle_epi32_splat(a, imm)                                      \
+    __extension__({                                                          \
+        vreinterpretq_m128i_s32(                                             \
+            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
+    })
+#endif
+
+// NEON does not support a general purpose permute intrinsic.
+// Shuffle single-precision (32-bit) floating-point elements in a using the
+// control in imm8, and store the results in dst.
+//
+// C equivalent:
+//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+//                                 __constrange(0, 255) int imm) {
+//       __m128 ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+//
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
+#define _mm_shuffle_ps_default(a, b, imm)                                  \
+    __extension__({                                                        \
+        float32x4_t ret;                                                   \
+        ret = vmovq_n_f32(                                                 \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                       \
+        vreinterpretq_m128_f32(ret);                                       \
+    })
+
+// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
+// Store the results in the low 64 bits of dst, with the high 64 bits being
+// copied from from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
+#define _mm_shufflelo_epi16_function(a, imm)                                  \
+    __extension__({                                                           \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
+        int16x4_t lowBits = vget_low_s16(ret);                                \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3);                                              \
+        vreinterpretq_m128i_s16(ret);                                         \
+    })
+
+// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
+// Store the results in the high 64 bits of dst, with the low 64 bits being
+// copied from from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
+#define _mm_shufflehi_epi16_function(a, imm)                                   \
+    __extension__({                                                            \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
+        int16x4_t highBits = vget_high_s16(ret);                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+                             5);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+                             6);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+                             7);                                               \
+        vreinterpretq_m128i_s16(ret);                                          \
+    })
+
+/* MMX */
+
+//_mm_empty is a no-op on arm
+FORCE_INLINE void _mm_empty(void) {}
+
+/* SSE */
+
+// Add packed single-precision (32-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Add the lower single-precision (32-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of <a>.
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
+}
+
+// Compute the bitwise AND of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vbicq_s32(vreinterpretq_s32_m128(b),
+                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(
+        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
+FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for greater-than-or-equal, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
+FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmple_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
+FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
+FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
+FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
+FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-greater-than, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
+FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
+FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-less-than-or-equal, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
+FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
+FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-less-than, store the result in the lower element of dst, and copy
+// the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
+FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// to see if neither is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
+//
+// See also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
+{
+    // Note: NEON does not have ordered compare builtin
+    // Need to compare a eq a and b eq b to check for NaN
+    // Do AND of results to get final
+    uint32x4_t ceqaa =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t ceqbb =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b to see if neither is NaN, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
+FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// to see if either is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
+FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
+{
+    uint32x4_t f32a =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t f32b =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b to see if either is NaN, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
+FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for equality, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
+FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_eq_b =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_eq_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for greater-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
+FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_ge_b =
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_ge_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for greater-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
+FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_gt_b =
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_gt_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for less-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
+FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_le_b =
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_le_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for less-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_lt_b =
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_lt_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for not-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
+FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+{
+    return !_mm_comieq_ss(a, b);
+}
+
+// Convert packed signed 32-bit integers in b to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, and copy the upper 2 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
+FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
+FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
+#else
+    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
+#endif
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
+FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
+FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
+                          0);
+#else
+    float32_t data = vgetq_lane_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
+    return (int32_t) data;
+#endif
+}
+
+// Convert packed 16-bit integers in a to packed single-precision (32-bit)
+// floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
+FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
+}
+
+// Convert packed 32-bit integers in b to packed single-precision (32-bit)
+// floating-point elements, store the results in the lower 2 elements of dst,
+// and copy the upper 2 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
+FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, then convert the packed signed 32-bit integers in b to
+// single-precision (32-bit) floating-point element, and store the results in
+// the upper 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
+FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
+}
+
+// Convert the lower packed 8-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
+FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 16-bit integers, and store the results in dst. Note: this intrinsic
+// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
+// 0x7FFFFFFF.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
+FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
+{
+    return vreinterpret_m64_s16(
+        vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
+#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 8-bit integers, and store the results in lower 4 elements of dst.
+// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
+// between 0x7F and 0x7FFFFFFF.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
+FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
+{
+    return vreinterpret_m64_s8(vqmovn_s16(
+        vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
+}
+
+// Convert packed unsigned 16-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
+FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
+}
+
+// Convert the lower packed unsigned 8-bit integers in a to packed
+// single-precision (32-bit) floating-point elements, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
+FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_u32(
+        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
+#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
+
+// Convert the signed 64-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
+FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Copy the lower single-precision (32-bit) floating-point element of a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
+FORCE_INLINE float _mm_cvtss_f32(__m128 a)
+{
+    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
+#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
+FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
+#else
+    float32_t data = vgetq_lane_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
+    return (int64_t) data;
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
+FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
+{
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
+FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
+{
+    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
+#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
+#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
+FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
+{
+    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Divide packed single-precision (32-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
+    return vreinterpretq_m128_f32(
+        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#endif
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+#endif
+}
+
+// Divide the lower single-precision (32-bit) floating-point element in a by the
+// lower single-precision (32-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper 3 packed elements from a to
+// the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
+#define _mm_extract_pi16(a, imm) \
+    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
+
+// Free aligned memory that was allocated with _mm_malloc.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
+#if !defined(SSE2NEON_ALLOC_DEFINED)
+FORCE_INLINE void _mm_free(void *addr)
+{
+    free(addr);
+}
+#endif
+
+// Macro: Get the flush zero bits from the MXCSR control and status register.
+// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
+// _MM_FLUSH_ZERO_OFF
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
+FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
+}
+
+// Macro: Get the rounding mode bits from the MXCSR control and status register.
+// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
+// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    if (r.field.bit22) {
+        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
+    } else {
+        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
+    }
+}
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
+#define _mm_insert_pi16(a, b, imm)                               \
+    __extension__({                                              \
+        vreinterpret_m64_s16(                                    \
+            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
+    })
+
+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
+FORCE_INLINE __m128 _mm_load_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+//
+//   dst[31:0] := MEM[mem_addr+31:mem_addr]
+//   dst[63:32] := MEM[mem_addr+31:mem_addr]
+//   dst[95:64] := MEM[mem_addr+31:mem_addr]
+//   dst[127:96] := MEM[mem_addr+31:mem_addr]
+//
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
+#define _mm_load_ps1 _mm_load1_ps
+
+// Load a single-precision (32-bit) floating-point element from memory into the
+// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
+// aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
+FORCE_INLINE __m128 _mm_load_ss(const float *p)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
+FORCE_INLINE __m128 _mm_load1_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
+}
+
+// Load 2 single-precision (32-bit) floating-point elements from memory into the
+// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
+// mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
+FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
+}
+
+// Load 2 single-precision (32-bit) floating-point elements from memory into the
+// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
+// mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
+FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
+}
+
+// Load 4 single-precision (32-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
+FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
+{
+    float32x4_t v = vrev64q_f32(vld1q_f32(p));
+    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
+}
+
+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from memory into dst. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
+FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
+{
+    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
+    // equivalent for neon
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load unaligned 16-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
+FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
+{
+    return vreinterpretq_m128i_s16(
+        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
+}
+
+// Load unaligned 64-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
+FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+}
+
+// Allocate size bytes of memory, aligned to the alignment specified in align,
+// and return a pointer to the allocated memory. _mm_free should be used to free
+// memory that is allocated with _mm_malloc.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
+#if !defined(SSE2NEON_ALLOC_DEFINED)
+FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
+{
+    void *ptr;
+    if (align == 1)
+        return malloc(size);
+    if (align == 2 || (sizeof(void *) == 8 && align == 4))
+        align = sizeof(void *);
+    if (!posix_memalign(&ptr, align, size))
+        return ptr;
+    return NULL;
+}
+#endif
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
+FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
+{
+    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
+    __m128 b = _mm_load_ps((const float *) mem_addr);
+    int8x8_t masked =
+        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
+                vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
+    vst1_s8((int8_t *) mem_addr, masked);
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
+#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
+FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b,
+// and store packed maximum values in dst. dst does not follow the IEEE Standard
+// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
+// signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128_f32(
+        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
+FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b, store the maximum value in the lower element of dst, and copy the upper 3
+// packed elements from a to the upper element of dst. dst does not follow the
+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
+// inputs are NaN or signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
+FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
+{
+    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
+FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b,
+// and store packed minimum values in dst. dst does not follow the IEEE Standard
+// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
+// signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
+FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128_f32(
+        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
+FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b, store the minimum value in the lower element of dst, and copy the upper 3
+// packed elements from a to the upper element of dst. dst does not follow the
+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
+// inputs are NaN or signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
+FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+{
+    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Move the lower single-precision (32-bit) floating-point element from b to the
+// lower element of dst, and copy the upper 3 packed elements from a to the
+// upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
+FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
+                       vreinterpretq_f32_m128(a), 0));
+}
+
+// Move the upper 2 single-precision (32-bit) floating-point elements from b to
+// the lower 2 elements of dst, and copy the upper 2 elements from a to the
+// upper 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
+FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
+{
+#if defined(aarch64__)
+    return vreinterpretq_m128_u64(
+        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
+#else
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
+#endif
+}
+
+// Move the lower 2 single-precision (32-bit) floating-point elements from b to
+// the upper 2 elements of dst, and copy the lower 2 elements from a to the
+// lower 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
+FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
+FORCE_INLINE int _mm_movemask_pi8(__m64 a)
+{
+    uint8x8_t input = vreinterpret_u8_m64(a);
+#if defined(__aarch64__)
+    static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint8x8_t tmp = vshr_n_u8(input, 7);
+    return vaddv_u8(vshl_u8(tmp, shift));
+#else
+    // Refer the implementation of `_mm_movemask_epi8`
+    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
+    uint32x2_t paired16 =
+        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
+    uint8x8_t paired32 =
+        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
+    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
+#endif
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed single-precision (32-bit) floating-point element in a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__)
+    static const int32x4_t shift = {0, 1, 2, 3};
+    uint32x4_t tmp = vshrq_n_u32(input, 31);
+    return vaddvq_u32(vshlq_u32(tmp, shift));
+#else
+    // Uses the exact same method as _mm_movemask_epi8, see that for details.
+    // Shift out everything but the sign bits with a 32-bit unsigned shift
+    // right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired =
+        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
+}
+
+// Multiply packed single-precision (32-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
+FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Multiply the lower single-precision (32-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_mul_ps(a, b));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(vshrn_n_u32(
+        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
+}
+
+// Compute the bitwise OR of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
+FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
+#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
+#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
+#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
+#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
+#define _m_pminsw(a, b) _mm_min_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
+#define _m_pminub(a, b) _mm_min_pu8(a, b)
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
+#define _m_pmovmskb(a) _mm_movemask_pi8(a)
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
+#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
+
+// Fetch the line of data from memory that contains address p to a location in
+// the cache heirarchy specified by the locality hint i.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
+FORCE_INLINE void _mm_prefetch(char const *p, int i)
+{
+    switch (i) {
+    case _MM_HINT_NTA:
+        __builtin_prefetch(p, 0, 0);
+        break;
+    case _MM_HINT_T0:
+        __builtin_prefetch(p, 0, 3);
+        break;
+    case _MM_HINT_T1:
+        __builtin_prefetch(p, 0, 2);
+        break;
+    case _MM_HINT_T2:
+        __builtin_prefetch(p, 0, 1);
+        break;
+    }
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
+#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
+#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
+
+// Compute the approximate reciprocal of packed single-precision (32-bit)
+// floating-point elements in a, and store the results in dst. The maximum
+// relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#endif
+    return vreinterpretq_m128_f32(recip);
+}
+
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
+{
+    return _mm_move_ss(a, _mm_rcp_ps(a));
+}
+
+// Compute the approximate reciprocal square root of packed single-precision
+// (32-bit) floating-point elements in a, and store the results in dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+#if SSE2NEON_PRECISE_SQRT
+    // Additional Netwon-Raphson iteration for accuracy
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#endif
+    return vreinterpretq_m128_f32(out);
+}
+
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
+FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+{
+    uint64x1_t t = vpaddl_u32(vpaddl_u16(
+        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
+    return vreinterpret_m64_u16(
+        vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
+}
+
+// Macro: Set the flush zero bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The flush zero may contain any of the
+// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
+FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
+{
+    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+    // regardless of the value of the FZ bit.
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+// Set packed single-precision (32-bit) floating-point elements in dst with the
+// supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
+FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Broadcast single-precision (32-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
+FORCE_INLINE __m128 _mm_set_ps1(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Macro: Set the rounding mode bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The rounding mode may contain any of
+// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
+// _MM_ROUND_TOWARD_ZERO
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
+FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    switch (rounding) {
+    case _MM_ROUND_TOWARD_ZERO:
+        r.field.bit22 = 1;
+        r.field.bit23 = 1;
+        break;
+    case _MM_ROUND_DOWN:
+        r.field.bit22 = 0;
+        r.field.bit23 = 1;
+        break;
+    case _MM_ROUND_UP:
+        r.field.bit22 = 1;
+        r.field.bit23 = 0;
+        break;
+    default:  //_MM_ROUND_NEAREST
+        r.field.bit22 = 0;
+        r.field.bit23 = 0;
+    }
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+// Copy single-precision (32-bit) floating-point element a to the lower element
+// of dst, and zero the upper 3 elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
+FORCE_INLINE __m128 _mm_set_ss(float a)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
+}
+
+// Broadcast single-precision (32-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
+FORCE_INLINE __m128 _mm_set1_ps(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Set the MXCSR control and status register with the value in unsigned 32-bit
+// integer a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
+// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
+FORCE_INLINE void _mm_setcsr(unsigned int a)
+{
+    _MM_SET_ROUNDING_MODE(a);
+}
+
+// Get the unsigned 32-bit value of the MXCSR control and status register.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
+// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
+FORCE_INLINE unsigned int _mm_getcsr()
+{
+    return _MM_GET_ROUNDING_MODE();
+}
+
+// Set packed single-precision (32-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Return vector of type __m128 with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
+FORCE_INLINE __m128 _mm_setzero_ps(void)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(0));
+}
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_pi16(a, imm)                                           \
+    __extension__({                                                        \
+        vreinterpret_m64_s16(vshuffle_s16(                                 \
+            vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
+            ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)));  \
+    })
+#else
+#define _mm_shuffle_pi16(a, imm)                                               \
+    __extension__({                                                            \
+        int16x4_t ret;                                                         \
+        ret =                                                                  \
+            vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret,   \
+            1);                                                                \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret,   \
+            2);                                                                \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret,   \
+            3);                                                                \
+        vreinterpret_m64_s16(ret);                                             \
+    })
+#endif
+
+// Perform a serializing operation on all store-to-memory instructions that were
+// issued prior to this instruction. Guarantees that every store instruction
+// that precedes, in program order, is globally visible before any store
+// instruction which follows the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
+FORCE_INLINE void _mm_sfence(void)
+{
+    _sse2neon_smp_mb();
+}
+
+// Perform a serializing operation on all load-from-memory and store-to-memory
+// instructions that were issued prior to this instruction. Guarantees that
+// every memory access that precedes, in program order, the memory fence
+// instruction is globally visible before any memory instruction which follows
+// the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
+FORCE_INLINE void _mm_mfence(void)
+{
+    _sse2neon_smp_mb();
+}
+
+// Perform a serializing operation on all load-from-memory instructions that
+// were issued prior to this instruction. Guarantees that every load instruction
+// that precedes, in program order, is globally visible before any load
+// instruction which follows the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
+FORCE_INLINE void _mm_lfence(void)
+{
+    _sse2neon_smp_mb();
+}
+
+// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
+// int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_ps(a, b, imm)                                              \
+    __extension__({                                                            \
+        float32x4_t _input1 = vreinterpretq_f32_m128(a);                       \
+        float32x4_t _input2 = vreinterpretq_f32_m128(b);                       \
+        float32x4_t _shuf =                                                    \
+            vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+                          (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
+        vreinterpretq_m128_f32(_shuf);                                         \
+    })
+#else  // generic
+#define _mm_shuffle_ps(a, b, imm)                          \
+    __extension__({                                        \
+        __m128 ret;                                        \
+        switch (imm) {                                     \
+        case _MM_SHUFFLE(1, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_1032((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 3, 0, 1):                      \
+            ret = _mm_shuffle_ps_2301((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 3, 2, 1):                      \
+            ret = _mm_shuffle_ps_0321((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 1, 0, 3):                      \
+            ret = _mm_shuffle_ps_2103((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 1, 0):                      \
+            ret = _mm_movelh_ps((a), (b));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_1001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 1, 0, 1):                      \
+            ret = _mm_shuffle_ps_0101((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 1, 0):                      \
+            ret = _mm_shuffle_ps_3210((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 1, 1):                      \
+            ret = _mm_shuffle_ps_0011((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 2, 2):                      \
+            ret = _mm_shuffle_ps_0022((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 2, 0, 0):                      \
+            ret = _mm_shuffle_ps_2200((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 0, 2):                      \
+            ret = _mm_shuffle_ps_3202((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 3, 2):                      \
+            ret = _mm_movehl_ps((b), (a));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 1, 3, 3):                      \
+            ret = _mm_shuffle_ps_1133((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 1, 0):                      \
+            ret = _mm_shuffle_ps_2010((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_2001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_2032((a), (b));           \
+            break;                                         \
+        default:                                           \
+            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
+            break;                                         \
+        }                                                  \
+        ret;                                               \
+    })
+#endif
+
+// Compute the square root of packed single-precision (32-bit) floating-point
+// elements in a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+{
+#if SSE2NEON_PRECISE_SQRT
+    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+    // Test for vrsqrteq_f32(0) -> positive infinity case.
+    // Change to zero, so that s * 1/sqrt(s) result is zero too.
+    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+    const uint32x4_t div_by_zero =
+        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
+    recip = vreinterpretq_f32_u32(
+        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
+
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+
+    // sqrt(s) = s * 1/sqrt(s)
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
+#elif defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#else
+    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+    float32x4_t sq = vrecpeq_f32(recipsq);
+    return vreinterpretq_m128_f32(sq);
+#endif
+}
+
+// Compute the square root of the lower single-precision (32-bit) floating-point
+// element in a, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
+FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    vst1q_f32(p, vdupq_n_f32(a0));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
+#define _mm_store1_ps _mm_store_ps1
+
+// Store the upper 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
+FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_high_f32(a));
+}
+
+// Store the lower 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
+FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_low_f32(a));
+}
+
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
+FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
+{
+    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
+    float32x4_t rev = vextq_f32(tmp, tmp, 2);
+    vst1q_f32(p, rev);
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Stores 16-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
+FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
+{
+    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
+}
+
+// Stores 64-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
+FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
+{
+    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
+}
+
+// Store 64-bits of integer data from a into memory using a non-temporal memory
+// hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
+FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
+{
+    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
+// point elements) from a into memory using a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
+FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
+#else
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+#endif
+}
+
+// Subtract packed single-precision (32-bit) floating-point elements in b from
+// packed single-precision (32-bit) floating-point elements in a, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_sub_ps(a, b));
+}
+
+// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
+// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
+// transposed matrix in these vectors (row0 now contains column 0, etc.).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
+    do {                                                  \
+        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
+        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
+        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
+                            vget_low_f32(ROW23.val[0]));  \
+        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
+                            vget_low_f32(ROW23.val[1]));  \
+        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
+                            vget_high_f32(ROW23.val[0])); \
+        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
+                            vget_high_f32(ROW23.val[1])); \
+    } while (0)
+
+// according to the documentation, these intrinsics behave the same as the
+// non-'u' versions.  We'll just alias them here.
+#define _mm_ucomieq_ss _mm_comieq_ss
+#define _mm_ucomige_ss _mm_comige_ss
+#define _mm_ucomigt_ss _mm_comigt_ss
+#define _mm_ucomile_ss _mm_comile_ss
+#define _mm_ucomilt_ss _mm_comilt_ss
+#define _mm_ucomineq_ss _mm_comineq_ss
+
+// Return vector of type __m128i with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
+FORCE_INLINE __m128i _mm_undefined_si128(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128i a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Return vector of type __m128 with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
+FORCE_INLINE __m128 _mm_undefined_ps(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128 a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Unpack and interleave single-precision (32-bit) floating-point elements from
+// the high half a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
+FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave single-precision (32-bit) floating-point elements from
+// the low half of a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
+FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
+FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+/* SSE2 */
+
+// Add packed 16-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Add packed 32-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Add packed 64-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Add packed 8-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
+FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Add packed double-precision (64-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
+FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1] + db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add the lower double-precision (64-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper element from
+// a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
+FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_add_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add 64-bit integers a and b, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
+FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Add packed signed 16-bit integers in a and b using saturation, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
+FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Add packed signed 8-bit integers in a and b using saturation, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
+FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Add packed unsigned 16-bit integers in a and b using saturation, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
+FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Add packed unsigned 8-bit integers in a and b using saturation, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
+FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compute the bitwise AND of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
+FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
+FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
+FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
+{
+    // *NOTE* argument swap
+    return vreinterpretq_m128d_s64(
+        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
+}
+
+// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
+// AND with b, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
+FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vbicq_s32(vreinterpretq_s32_m128i(b),
+                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
+FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
+{
+    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
+                                 vreinterpretq_u16_m128i(b));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
+FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
+#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
+
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
+#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
+
+// Cast vector of type __m128d to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
+FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+{
+    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
+FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
+{
+    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
+FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
+{
+    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
+#else
+    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
+#endif
+}
+
+// Cast vector of type __m128i to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Invalidate and flush the cache line that contains p from all levels of the
+// cache hierarchy.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
+#if defined(__APPLE__)
+#include <libkern/OSCacheControl.h>
+#endif
+FORCE_INLINE void _mm_clflush(void const *p)
+{
+    (void) p;
+
+    /* sys_icache_invalidate is supported since macOS 10.5.
+     * However, it does not work on non-jailbroken iOS devices, although the
+     * compilation is successful.
+     */
+#if defined(__APPLE__)
+    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
+#elif defined(__GNUC__) || defined(__clang__)
+    uintptr_t ptr = (uintptr_t) p;
+    __builtin___clear_cache((char *) ptr,
+                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
+#else
+    /* FIXME: MSVC support */
+#endif
+}
+
+// Compare packed 16-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed 8-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
+FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
+FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
+FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than-or-equal, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
+FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed signed 16-bit integers in a and b for greater-than, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
+FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b for greater-than, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
+FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b for greater-than, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
+FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
+FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
+FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
+FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
+FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmple_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed signed 16-bit integers in a and b for less-than, and store the
+// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
+// order of the operands switched.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
+FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b for less-than, and store the
+// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
+// order of the operands switched.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
+FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b for less-than, and store the
+// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
+// order of the operands switched.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
+FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
+FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
+FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
+FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
+FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
+FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
+FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
+FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
+FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
+FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-less-than-or-equal, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
+FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
+FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-less-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
+FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// to see if neither is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
+FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    // Excluding NaNs, any two floating point numbers can be compared.
+    uint64x2_t not_nan_a =
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
+    uint64x2_t not_nan_b =
+        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
+            (*(double *) &b1) == (*(double *) &b1))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b to see if neither is NaN, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
+FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// to see if either is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
+FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    // Two NaNs are not equal in comparison operation.
+    uint64x2_t not_nan_a =
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
+    uint64x2_t not_nan_b =
+        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_s32(
+        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
+            (*(double *) &b1) == (*(double *) &b1))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b to see if either is NaN, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
+FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for greater-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
+FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 >= *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for greater-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
+FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 > *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for less-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
+FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 <= *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for less-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
+FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 < *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for equality, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
+FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
+#else
+    uint32x4_t a_not_nan =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
+    uint32x4_t b_not_nan =
+        vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_eq_b =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
+                                       vreinterpretq_u64_u32(a_eq_b));
+    return vgetq_lane_u64(and_results, 0) & 0x1;
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for not-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
+FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
+{
+    return !_mm_comieq_sd(a, b);
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
+FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
+#else
+    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
+FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
+{
+// vrnd32xq_f64 not supported on clang
+#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
+    float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
+    int64x2_t integers = vcvtq_s64_f64(rounded);
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double d0 = ((double *) &rnd)[0];
+    double d1 = ((double *) &rnd)[1];
+    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
+#endif
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
+FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
+{
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double d0 = ((double *) &rnd)[0];
+    double d1 = ((double *) &rnd)[1];
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
+    return vreinterpret_m64_s32(vld1_s32(data));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed single-precision (32-bit) floating-point elements, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
+FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+{
+#if defined(__aarch64__)
+    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
+    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+#else
+    float a0 = (float) ((double *) &a)[0];
+    float a1 = (float) ((double *) &a)[1];
+    return _mm_set_ps(0, 0, a1, a0);
+#endif
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
+FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
+#else
+    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
+    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
+// does not support! It is supported on ARMv8-A however.
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
+{
+#if defined(__ARM_FEATURE_FRINT)
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
+#elif defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    switch (_MM_GET_ROUNDING_MODE()) {
+    case _MM_ROUND_NEAREST:
+        return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
+    case _MM_ROUND_DOWN:
+        return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
+    case _MM_ROUND_UP:
+        return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
+    default:  // _MM_ROUND_TOWARD_ZERO
+        return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
+    }
+#else
+    float *f = (float *) &a;
+    switch (_MM_GET_ROUNDING_MODE()) {
+    case _MM_ROUND_NEAREST: {
+        uint32x4_t signmask = vdupq_n_u32(0x80000000);
+        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
+        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+        int32x4_t r_trunc = vcvtq_s32_f32(
+            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+        float32x4_t delta = vsubq_f32(
+            vreinterpretq_f32_m128(a),
+            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+        uint32x4_t is_delta_half =
+            vceqq_f32(delta, half); /* delta == +/- 0.5 */
+        return vreinterpretq_m128i_s32(
+            vbslq_s32(is_delta_half, r_even, r_normal));
+    }
+    case _MM_ROUND_DOWN:
+        return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
+                             floorf(f[0]));
+    case _MM_ROUND_UP:
+        return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
+                             ceilf(f[0]));
+    default:  // _MM_ROUND_TOWARD_ZERO
+        return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
+                             (int32_t) f[0]);
+    }
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed double-precision (64-bit) floating-point elements, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
+FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
+#else
+    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Copy the lower double-precision (64-bit) floating-point element of a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
+FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
+{
+#if defined(__aarch64__)
+    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
+#else
+    return ((double *) &a)[0];
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
+FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
+{
+#if defined(__aarch64__)
+    return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double ret = ((double *) &rnd)[0];
+    return (int32_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
+FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
+{
+#if defined(__aarch64__)
+    return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double ret = ((double *) &rnd)[0];
+    return (int64_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
+#define _mm_cvtsd_si64x _mm_cvtsd_si64
+
+// Convert the lower double-precision (64-bit) floating-point element in b to a
+// single-precision (32-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper 3 packed elements from a to the
+// upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
+FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsetq_lane_f32(
+        vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
+        vreinterpretq_f32_m128(a), 0));
+#else
+    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
+                                                 vreinterpretq_f32_m128(a), 0));
+#endif
+}
+
+// Copy the lower 32-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
+FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
+{
+    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
+FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
+{
+    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Convert the signed 32-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
+FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+    double bf = (double) b;
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Copy 32-bit integer a to the lower elements of dst, and zero the upper
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
+FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+{
+    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
+}
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
+FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+    double bf = (double) b;
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
+FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
+{
+    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
+}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
+#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
+#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
+
+// Convert the lower single-precision (32-bit) floating-point element in b to a
+// double-precision (64-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
+FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
+{
+    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
+FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
+{
+    double a0 = ((double *) &a)[0];
+    double a1 = ((double *) &a)[1];
+    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
+FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
+{
+    double a0 = ((double *) &a)[0];
+    double a1 = ((double *) &a)[1];
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
+    return vreinterpret_m64_s32(vld1_s32(data));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
+FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
+FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
+{
+    double ret = *((double *) &a);
+    return (int32_t) ret;
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
+FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    double ret = *((double *) &a);
+    return (int64_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
+#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
+
+// Divide packed double-precision (64-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
+FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] / db[0];
+    c[1] = da[1] / db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Divide the lower double-precision (64-bit) floating-point element in a by the
+// lower double-precision (64-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper element from a to the upper
+// element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
+FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    float64x2_t tmp =
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
+#else
+    return _mm_move_sd(a, _mm_div_pd(a, b));
+#endif
+}
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
+// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
+#define _mm_extract_epi16(a, imm) \
+    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
+// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
+//                                       __constrange(0,8) int imm)
+#define _mm_insert_epi16(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s16(                                     \
+            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
+    })
+
+// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
+FORCE_INLINE __m128d _mm_load_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64(p));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
+#define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower of dst, and zero the upper element. mem_addr does not need to be
+// aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
+FORCE_INLINE __m128d _mm_load_sd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
+// on a 16-byte boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
+FORCE_INLINE __m128d _mm_load1_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
+FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
+#else
+    return vreinterpretq_m128d_f32(vcombine_f32(
+        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
+#endif
+}
+
+// Load 64-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
+{
+    /* Load the lower 64 bits of the value pointed to by p into the
+     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
+     */
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
+FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
+#else
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vld1_f32((const float *) p),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+#endif
+}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
+{
+#if defined(__aarch64__)
+    float64x2_t v = vld1q_f64(p);
+    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+#else
+    int64x2_t v = vld1q_s64((const int64_t *) p);
+    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+#endif
+}
+
+// Loads two double-precision from unaligned memory, floating-point values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+    return _mm_load_pd(p);
+}
+
+// Load 128-bits of integer data from memory into dst. mem_addr does not need to
+// be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+    return vreinterpretq_m128i_s32(
+        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
+// 32-bit integers, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+#if defined(__aarch64__)
+    int32x4_t high =
+        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
+
+    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
+#else
+    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                               vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+#endif
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint. mem_addr does not need to be aligned
+// on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
+FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
+{
+    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
+    __m128 b = _mm_load_ps((const float *) mem_addr);
+    int8x16_t masked =
+        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
+                 vreinterpretq_s8_m128(b));
+    vst1q_s8((int8_t *) mem_addr, masked);
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
+FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
+FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed maximum values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
+FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+#if SSE2NEON_PRECISE_MINMAX
+    float64x2_t _a = vreinterpretq_f64_m128d(a);
+    float64x2_t _b = vreinterpretq_f64_m128d(b);
+    return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128d_f64(
+        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#endif
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the maximum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
+FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_max_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
+#endif
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
+FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
+FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed minimum values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
+FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+#if SSE2NEON_PRECISE_MINMAX
+    float64x2_t _a = vreinterpretq_f64_m128d(a);
+    float64x2_t _b = vreinterpretq_f64_m128d(b);
+    return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128d_f64(
+        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#endif
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the minimum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
+FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_min_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
+#endif
+}
+
+// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
+// upper element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
+FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
+}
+
+// Move the lower double-precision (64-bit) floating-point element from b to the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
+FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+}
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
+FORCE_INLINE int _mm_movemask_epi8(__m128i a)
+{
+    // Use increasingly wide shifts+adds to collect the sign bits
+    // together.
+    // Since the widening shifts would be rather confusing to follow in little
+    // endian, everything will be illustrated in big endian order instead. This
+    // has a different result - the bits would actually be reversed on a big
+    // endian machine.
+
+    // Starting input (only half the elements are shown):
+    // 89 ff 1d c0 00 10 99 33
+    uint8x16_t input = vreinterpretq_u8_m128i(a);
+
+    // Shift out everything but the sign bits with an unsigned shift right.
+    //
+    // Bytes of the vector::
+    // 89 ff 1d c0 00 10 99 33
+    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
+    //  |  |  |  |  |  |  |  |
+    // 01 01 00 01 00 00 01 00
+    //
+    // Bits of first important lane(s):
+    // 10001001 (89)
+    // \______
+    //        |
+    // 00000001 (01)
+    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+    // Merge the even lanes together with a 16-bit unsigned shift right + add.
+    // 'xx' represents garbage data which will be ignored in the final result.
+    // In the important bytes, the add functions like a binary OR.
+    //
+    // 01 01 00 01 00 00 01 00
+    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
+    //    \|    \|    \|    \|
+    // xx 03 xx 01 xx 00 xx 02
+    //
+    // 00000001 00000001 (01 01)
+    //        \_______ |
+    //                \|
+    // xxxxxxxx xxxxxx11 (xx 03)
+    uint32x4_t paired16 =
+        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+    // Repeat with a wider 32-bit shift + add.
+    // xx 03 xx 01 xx 00 xx 02
+    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
+    //     14))
+    //          \|          \|
+    // xx xx xx 0d xx xx xx 02
+    //
+    // 00000011 00000001 (03 01)
+    //        \\_____ ||
+    //         '----.\||
+    // xxxxxxxx xxxx1101 (xx 0d)
+    uint64x2_t paired32 =
+        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
+    // lanes. xx xx xx 0d xx xx xx 02
+    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
+    //            28))
+    //                      \|
+    // xx xx xx xx xx xx xx d2
+    //
+    // 00001101 00000010 (0d 02)
+    //     \   \___ |  |
+    //      '---.  \|  |
+    // xxxxxxxx 11010010 (xx d2)
+    uint8x16_t paired64 =
+        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+    // xx xx xx xx xx xx xx d2
+    //                      ||  return paired64[0]
+    //                      d2
+    // Note: Little endian would return the correct value 4b (01001011) instead.
+    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed double-precision (64-bit) floating-point element in a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
+FORCE_INLINE int _mm_movemask_pd(__m128d a)
+{
+    uint64x2_t input = vreinterpretq_u64_m128d(a);
+    uint64x2_t high_bits = vshrq_n_u64(input, 63);
+    return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+{
+    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+}
+
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+}
+
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
+// a and b, and store the unsigned 64-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+{
+    // vmull_u32 upcasts instead of masking, so we downcast.
+    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
+    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
+    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
+}
+
+// Multiply packed double-precision (64-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
+FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] * db[0];
+    c[1] = da[1] * db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Multiply the lower double-precision (64-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper element
+// from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
+FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_mul_pd(a, b));
+}
+
+// Multiply the low unsigned 32-bit integers from a and b, and store the
+// unsigned 64-bit result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
+FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u64(vget_low_u64(
+        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
+}
+
+// Multiply the packed signed 16-bit integers in a and b, producing intermediate
+// 32-bit integers, and store the high 16 bits of the intermediate integers in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+    /* FIXME: issue with large values because of result saturation */
+    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
+    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
+    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
+    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
+FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
+{
+    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
+#if defined(__aarch64__)
+    uint32x4_t ab7654 =
+        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
+                              vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r);
+#else
+    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+#endif
+}
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
+// integers, and store the low 16 bits of the intermediate integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
+FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compute the bitwise OR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
+FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
+FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using signed saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
+// using signed saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using unsigned saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Pause the processor. This is typically used in spin-wait loops and depending
+// on the x86 processor typical values are in the 40-100 cycle range. The
+// 'yield' instruction isn't a good fit because it's effectively a nop on most
+// Arm cores. Experience with several databases has shown has shown an 'isb' is
+// a reasonable approximation.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
+FORCE_INLINE void _mm_pause()
+{
+    __asm__ __volatile__("isb\n");
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+{
+    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
+}
+
+// Set packed 16-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+                                   short i6,
+                                   short i5,
+                                   short i4,
+                                   short i3,
+                                   short i2,
+                                   short i1,
+                                   short i0)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    return vreinterpretq_m128i_s16(vld1q_s16(data));
+}
+
+// Set packed 32-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+}
+
+// Set packed 8-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
+FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
+                                  signed char b14,
+                                  signed char b13,
+                                  signed char b12,
+                                  signed char b11,
+                                  signed char b10,
+                                  signed char b9,
+                                  signed char b8,
+                                  signed char b7,
+                                  signed char b6,
+                                  signed char b5,
+                                  signed char b4,
+                                  signed char b3,
+                                  signed char b2,
+                                  signed char b1,
+                                  signed char b0)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
+FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
+{
+    double ALIGN_STRUCT(16) data[2] = {e0, e1};
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
+#else
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
+#endif
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
+#define _mm_set_pd1 _mm_set1_pd
+
+// Copy double-precision (64-bit) floating-point element a to the lower element
+// of dst, and zero the upper element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
+FORCE_INLINE __m128d _mm_set_sd(double a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
+#else
+    return _mm_set_pd(0, a);
+#endif
+}
+
+// Broadcast 16-bit integer a to all all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
+FORCE_INLINE __m128i _mm_set1_epi16(short w)
+{
+    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
+}
+
+// Broadcast 32-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
+}
+
+// Broadcast 64-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
+FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
+}
+
+// Broadcast 64-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
+}
+
+// Broadcast 8-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
+FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
+{
+    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
+FORCE_INLINE __m128d _mm_set1_pd(double d)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vdupq_n_f64(d));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
+#endif
+}
+
+// Set packed 16-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
+FORCE_INLINE __m128i _mm_setr_epi16(short w0,
+                                    short w1,
+                                    short w2,
+                                    short w3,
+                                    short w4,
+                                    short w5,
+                                    short w6,
+                                    short w7)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
+    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
+}
+
+// Set packed 32-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
+FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
+FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
+{
+    return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
+}
+
+// Set packed 8-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
+FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
+                                   signed char b1,
+                                   signed char b2,
+                                   signed char b3,
+                                   signed char b4,
+                                   signed char b5,
+                                   signed char b6,
+                                   signed char b7,
+                                   signed char b8,
+                                   signed char b9,
+                                   signed char b10,
+                                   signed char b11,
+                                   signed char b12,
+                                   signed char b13,
+                                   signed char b14,
+                                   signed char b15)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
+FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
+{
+    return _mm_set_pd(e0, e1);
+}
+
+// Return vector of type __m128d with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
+FORCE_INLINE __m128d _mm_setzero_pd(void)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vdupq_n_f64(0));
+#else
+    return vreinterpretq_m128d_f32(vdupq_n_f32(0));
+#endif
+}
+
+// Return vector of type __m128i with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
+FORCE_INLINE __m128i _mm_setzero_si128(void)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
+}
+
+// Shuffle 32-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
+// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
+//                                        __constrange(0,255) int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_epi32(a, imm)                                            \
+    __extension__({                                                          \
+        int32x4_t _input = vreinterpretq_s32_m128i(a);                       \
+        int32x4_t _shuf =                                                    \
+            vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+                          ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
+        vreinterpretq_m128i_s32(_shuf);                                      \
+    })
+#else  // generic
+#define _mm_shuffle_epi32(a, imm)                        \
+    __extension__({                                      \
+        __m128i ret;                                     \
+        switch (imm) {                                   \
+        case _MM_SHUFFLE(1, 0, 3, 2):                    \
+            ret = _mm_shuffle_epi_1032((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 3, 0, 1):                    \
+            ret = _mm_shuffle_epi_2301((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 3, 2, 1):                    \
+            ret = _mm_shuffle_epi_0321((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 1, 0, 3):                    \
+            ret = _mm_shuffle_epi_2103((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 1, 0):                    \
+            ret = _mm_shuffle_epi_1010((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 0, 1):                    \
+            ret = _mm_shuffle_epi_1001((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 0, 1):                    \
+            ret = _mm_shuffle_epi_0101((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 1, 1):                    \
+            ret = _mm_shuffle_epi_2211((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 2, 2):                    \
+            ret = _mm_shuffle_epi_0122((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 2):                    \
+            ret = _mm_shuffle_epi_3332((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 0, 0, 0):                    \
+            ret = _mm_shuffle_epi32_splat((a), 0);       \
+            break;                                       \
+        case _MM_SHUFFLE(1, 1, 1, 1):                    \
+            ret = _mm_shuffle_epi32_splat((a), 1);       \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 2, 2):                    \
+            ret = _mm_shuffle_epi32_splat((a), 2);       \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 3):                    \
+            ret = _mm_shuffle_epi32_splat((a), 3);       \
+            break;                                       \
+        default:                                         \
+            ret = _mm_shuffle_epi32_default((a), (imm)); \
+            break;                                       \
+        }                                                \
+        ret;                                             \
+    })
+#endif
+
+// Shuffle double-precision (64-bit) floating-point elements using the control
+// in imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_pd(a, b, imm8)                                            \
+    vreinterpretq_m128d_s64(                                                  \
+        vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
+                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
+#else
+#define _mm_shuffle_pd(a, b, imm8)                                     \
+    _mm_castsi128_pd(_mm_set_epi64x(                                   \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
+#endif
+
+// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shufflehi_epi16(a, imm)                                           \
+    __extension__({                                                           \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);                        \
+        int16x8_t _shuf =                                                     \
+            vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
+                          (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
+                          (((imm) >> 6) & 0x3) + 4);                          \
+        vreinterpretq_m128i_s16(_shuf);                                       \
+    })
+#else  // generic
+#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
+#endif
+
+// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shufflelo_epi16(a, imm)                                  \
+    __extension__({                                                  \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
+        int16x8_t _shuf = vshuffleq_s16(                             \
+            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
+            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
+        vreinterpretq_m128i_s16(_shuf);                              \
+    })
+#else  // generic
+#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
+#endif
+
+// Shift packed 16-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
+FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16((int16_t) c);
+    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
+}
+
+// Shift packed 32-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
+FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32((int32_t) c);
+    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
+}
+
+// Shift packed 64-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
+FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~63))
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64((int64_t) c);
+    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
+}
+
+// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
+FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
+{
+    if (_sse2neon_unlikely(imm & ~15))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s16(
+        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
+}
+
+// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
+FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
+{
+    if (_sse2neon_unlikely(imm & ~31))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s32(
+        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
+}
+
+// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
+FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
+{
+    if (_sse2neon_unlikely(imm & ~63))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s64(
+        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
+}
+
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
+#define _mm_slli_si128(a, imm)                                         \
+    __extension__({                                                    \
+        int8x16_t ret;                                                 \
+        if (_sse2neon_unlikely(imm == 0))                              \
+            ret = vreinterpretq_s8_m128i(a);                           \
+        else if (_sse2neon_unlikely((imm) & ~15))                      \
+            ret = vdupq_n_s8(0);                                       \
+        else                                                           \
+            ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a),   \
+                           ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
+        vreinterpretq_m128i_s8(ret);                                   \
+    })
+
+// Compute the square root of packed double-precision (64-bit) floating-point
+// elements in a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
+FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double a0 = sqrt(((double *) &a)[0]);
+    double a1 = sqrt(((double *) &a)[1]);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Compute the square root of the lower double-precision (64-bit) floating-point
+// element in b, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
+FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_sqrt_pd(b));
+#else
+    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
+#endif
+}
+
+// Shift packed 16-bit integers in a right by count while shifting in sign bits,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
+FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_cmplt_epi16(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
+}
+
+// Shift packed 32-bit integers in a right by count while shifting in sign bits,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
+FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_cmplt_epi32(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
+}
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in sign
+// bits, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
+FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
+{
+    const int count = (imm & ~15) ? 15 : imm;
+    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
+}
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
+// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srai_epi32(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) == 0)) {                                \
+            ret = a;                                                         \
+        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) {              \
+            ret = vreinterpretq_m128i_s32(                                   \
+                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_s32(                                   \
+                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));                \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift packed 16-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
+FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
+    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
+}
+
+// Shift packed 32-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
+FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
+    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
+}
+
+// Shift packed 64-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
+FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~63))
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
+    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
+}
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
+#define _mm_srli_epi16(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~15)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u16(                                   \
+                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
+// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_epi32(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~31)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u32(                                   \
+                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
+#define _mm_srli_epi64(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~63)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u64(                                   \
+                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
+#define _mm_srli_si128(a, imm)                                       \
+    __extension__({                                                  \
+        int8x16_t ret;                                               \
+        if (_sse2neon_unlikely((imm) & ~15))                         \
+            ret = vdupq_n_s8(0);                                     \
+        else                                                         \
+            ret = vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), \
+                           (imm > 15 ? 0 : imm));                    \
+        vreinterpretq_m128i_s8(ret);                                 \
+    })
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
+FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
+FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
+    vst1q_f64((float64_t *) mem_addr,
+              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
+#else
+    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
+    vst1q_f32((float32_t *) mem_addr,
+              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
+FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
+#endif
+}
+
+// Store 128-bits of integer data from a into memory. mem_addr must be aligned
+// on a 16-byte boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
+FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
+#define _mm_store1_pd _mm_store_pd1
+
+// Store the upper double-precision (64-bit) floating-point element from a into
+// memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
+FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store 64-bit integer from the first element of a into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
+FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
+{
+    vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
+FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store 2 double-precision (64-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
+FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
+{
+    float32x4_t f = vreinterpretq_f32_m128d(a);
+    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
+FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
+{
+    _mm_store_pd(mem_addr, a);
+}
+
+// Store 128-bits of integer data from a into memory. mem_addr does not need to
+// be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
+FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Store 32-bit integer from the first element of a into memory. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
+FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
+{
+    vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory using a non-temporal memory hint. mem_addr must
+// be aligned on a 16-byte boundary or a general-protection exception may be
+// generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
+FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
+#elif defined(__aarch64__)
+    vst1q_f64(p, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
+#endif
+}
+
+// Store 128-bits of integer data from a into memory using a non-temporal memory
+// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
+// exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
+FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, p);
+#else
+    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
+#endif
+}
+
+// Store 32-bit integer a into memory using a non-temporal hint to minimize
+// cache pollution. If the cache line containing address mem_addr is already in
+// the cache, the cache will be updated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
+FORCE_INLINE void _mm_stream_si32(int *p, int a)
+{
+    vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
+}
+
+// Store 64-bit integer a into memory using a non-temporal hint to minimize
+// cache pollution. If the cache line containing address mem_addr is already in
+// the cache, the cache will be updated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
+FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
+{
+    vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
+}
+
+// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
+FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
+FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
+FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
+FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract packed double-precision (64-bit) floating-point elements in b from
+// packed double-precision (64-bit) floating-point elements in a, and store the
+// results in dst.
+//  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
+FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] - db[0];
+    c[1] = da[1] - db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Subtract the lower double-precision (64-bit) floating-point element in b from
+// the lower double-precision (64-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
+FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_sub_pd(a, b));
+}
+
+// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
+FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
+// using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
+FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
+// using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
+FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
+// integers in a using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
+FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
+// integers in a using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
+FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+#define _mm_ucomieq_sd _mm_comieq_sd
+#define _mm_ucomige_sd _mm_comige_sd
+#define _mm_ucomigt_sd _mm_comigt_sd
+#define _mm_ucomile_sd _mm_comile_sd
+#define _mm_ucomilt_sd _mm_comilt_sd
+#define _mm_ucomineq_sd _mm_comineq_sd
+
+// Return vector of type __m128d with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
+FORCE_INLINE __m128d _mm_undefined_pd(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128d a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Unpack and interleave 16-bit integers from the high half of a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
+FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 32-bit integers from the high half of a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
+FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 64-bit integers from the high half of a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
+FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s64(
+        vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
+#endif
+}
+
+// Unpack and interleave 8-bit integers from the high half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
+FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the high half of a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
+FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    return vreinterpretq_m128d_s64(
+        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
+                     vget_high_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Unpack and interleave 16-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
+FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 32-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
+FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 64-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
+FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s64(
+        vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
+#endif
+}
+
+// Unpack and interleave 8-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
+FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the low half of a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
+FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    return vreinterpretq_m128d_s64(
+        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
+                     vget_low_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
+FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
+FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+/* SSE3 */
+
+// Alternatively add and subtract packed double-precision (64-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
+FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
+{
+    _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
+                                             vreinterpretq_f64_m128d(b),
+                                             vreinterpretq_f64_m128d(mask)));
+#else
+    return _mm_add_pd(_mm_mul_pd(b, mask), a);
+#endif
+}
+
+// Alternatively add and subtract packed single-precision (32-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
+FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
+{
+    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
+#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
+    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
+                                            vreinterpretq_f32_m128(mask),
+                                            vreinterpretq_f32_m128(b)));
+#else
+    return _mm_add_ps(_mm_mul_ps(b, mask), a);
+#endif
+}
+
+// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
+FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[] = {da[0] + da[1], db[0] + db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of double-precision (64-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
+FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
+{
+#if defined(__aarch64__)
+    float64x2_t a = vreinterpretq_f64_m128d(_a);
+    float64x2_t b = vreinterpretq_f64_m128d(_b);
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
+#else
+    double *da = (double *) &_a;
+    double *db = (double *) &_b;
+    double c[] = {da[0] - da[1], db[0] - db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
+FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+{
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
+#else
+    float32x4x2_t c = vuzpq_f32(a, b);
+    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
+#endif
+}
+
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
+#define _mm_loaddup_pd _mm_load1_pd
+
+// Duplicate the low double-precision (64-bit) floating-point element from a,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
+FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_u64(
+        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
+#endif
+}
+
+// Duplicate odd-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
+FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
+#elif defined(_sse2neon_shuffle)
+    return vreinterpretq_m128_f32(vshuffleq_s32(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
+#else
+    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
+    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+// Duplicate even-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
+FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
+#elif defined(_sse2neon_shuffle)
+    return vreinterpretq_m128_f32(vshuffleq_s32(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
+#else
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
+    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+/* SSSE3 */
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
+FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
+{
+    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
+FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
+FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
+{
+    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
+FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
+{
+    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
+FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
+{
+    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
+FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
+{
+    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
+}
+
+// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 16 bytes in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
+#define _mm_alignr_epi8(a, b, imm)                                            \
+    __extension__({                                                           \
+        uint8x16_t _a = vreinterpretq_u8_m128i(a);                            \
+        uint8x16_t _b = vreinterpretq_u8_m128i(b);                            \
+        __m128i ret;                                                          \
+        if (_sse2neon_unlikely((imm) & ~31))                                  \
+            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));                      \
+        else if (imm >= 16)                                                   \
+            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0);                \
+        else                                                                  \
+            ret =                                                             \
+                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
+        ret;                                                                  \
+    })
+
+// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 8 bytes in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
+#define _mm_alignr_pi8(a, b, imm)                                           \
+    __extension__({                                                         \
+        __m64 ret;                                                          \
+        if (_sse2neon_unlikely((imm) >= 16)) {                              \
+            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
+        } else {                                                            \
+            uint8x8_t tmp_low, tmp_high;                                    \
+            if ((imm) >= 8) {                                               \
+                const int idx = (imm) -8;                                   \
+                tmp_low = vreinterpret_u8_m64(a);                           \
+                tmp_high = vdup_n_u8(0);                                    \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            } else {                                                        \
+                const int idx = (imm);                                      \
+                tmp_low = vreinterpret_u8_m64(b);                           \
+                tmp_high = vreinterpret_u8_m64(a);                          \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            }                                                               \
+        }                                                                   \
+        ret;                                                                \
+    })
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
+#else
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
+#endif
+}
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s32(
+        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
+
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_s64_s16(
+        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
+#endif
+}
+
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
+FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t res = vuzp_s16(a, b);
+    return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int16x8x2_t c = vuzpq_s16(a, b);
+    return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
+FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
+#else
+    int32x4x2_t c = vuzpq_s32(a, b);
+    return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
+FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t c = vuzp_s16(a, b);
+    return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
+FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
+#else
+    int32x2x2_t c = vuzp_s32(a, b);
+    return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
+// using saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
+FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int16x8x2_t c = vuzpq_s16(a, b);
+    return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
+// using saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
+FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t c = vuzp_s16(a, b);
+    return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
+// and pack the saturated results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
+FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+                             vmovl_s8(vget_low_s8(b)));
+    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
+                             vmovl_s8(vget_high_s8(b)));
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
+#else
+    // This would be much simpler if x86 would choose to zero extend OR sign
+    // extend, not both. This could probably be optimized better.
+    uint16x8_t a = vreinterpretq_u16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // Zero extend a
+    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
+    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
+    int16x8_t b_odd = vshrq_n_s16(b, 8);
+
+    // multiply
+    int16x8_t prod1 = vmulq_s16(a_even, b_even);
+    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
+#endif
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
+// pack the saturated results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
+FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
+{
+    uint16x4_t a = vreinterpret_u16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // Zero extend a
+    int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
+    int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
+    int16x4_t b_odd = vshr_n_s16(b, 8);
+
+    // multiply
+    int16x4_t prod1 = vmul_s16(a_even, b_even);
+    int16x4_t prod2 = vmul_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
+// the packed 16-bit integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
+FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
+{
+    // Has issues due to saturation
+    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
+
+    // Multiply
+    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    // Rounding narrowing shift right
+    // narrow = (int16_t)((mul + 16384) >> 15);
+    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+    // Join together
+    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Truncate each intermediate integer to the 18 most
+// significant bits, round by adding 1, and store bits [16:1] to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
+FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
+{
+    int32x4_t mul_extend =
+        vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
+
+    // Rounding narrowing shift right
+    return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
+FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
+{
+    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
+    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
+    uint8x16_t idx_masked =
+        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
+#elif defined(__GNUC__)
+    int8x16_t ret;
+    // %e and %f represent the even and odd D registers
+    // respectively.
+    __asm__ __volatile__(
+        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+        : [ret] "=&w"(ret)
+        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+    return vreinterpretq_m128i_s8(ret);
+#else
+    // use this line if testing on aarch64
+    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
+                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
+FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
+{
+    const int8x8_t controlMask =
+        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
+    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
+    return vreinterpret_m64_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
+#else
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
+    // 'a') based on ltMask
+    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x8_t res = vbicq_s16(masked, zeroMask);
+    return vreinterpretq_m128i_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
+    // 'a') based on ltMask
+    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x4_t res = vbicq_s32(masked, zeroMask);
+    return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+{
+    int8x16_t a = vreinterpretq_s8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
+#else
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
+    // based on ltMask
+    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x16_t res = vbicq_s8(masked, zeroMask);
+
+    return vreinterpretq_m128i_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+#else
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
+    // based on ltMask
+    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x4_t res = vbic_s16(masked, zeroMask);
+
+    return vreinterpret_m64_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
+    // based on ltMask
+    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x2_t res = vbic_s32(masked, zeroMask);
+
+    return vreinterpret_m64_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
+// in b is negative, and store the results in dst. Element in dst are zeroed out
+// when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
+{
+    int8x8_t a = vreinterpret_s8_m64(_a);
+    int8x8_t b = vreinterpret_s8_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
+    // based on ltMask
+    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x8_t res = vbic_s8(masked, zeroMask);
+
+    return vreinterpret_m64_s8(res);
+}
+
+/* SSE4.1 */
+
+// Blend packed 16-bit integers from a and b using control mask imm8, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
+// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
+//                                      __constrange(0,255) int imm)
+#define _mm_blend_epi16(a, b, imm)                                            \
+    __extension__({                                                           \
+        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
+        uint16x8_t _mask_vec = vld1q_u16(_mask);                              \
+        uint16x8_t _a = vreinterpretq_u16_m128i(a);                           \
+        uint16x8_t _b = vreinterpretq_u16_m128i(b);                           \
+        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));                \
+    })
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using control mask imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
+#define _mm_blend_pd(a, b, imm)                                \
+    __extension__({                                            \
+        const uint64_t _mask[2] = {                            \
+            ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),   \
+            ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)};  \
+        uint64x2_t _mask_vec = vld1q_u64(_mask);               \
+        uint64x2_t _a = vreinterpretq_u64_m128d(a);            \
+        uint64x2_t _b = vreinterpretq_u64_m128d(b);            \
+        vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
+    })
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
+FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
+{
+    const uint32_t ALIGN_STRUCT(16)
+        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+    uint32x4_t mask = vld1q_u32(data);
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Blend packed 8-bit integers from a and b using mask, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
+FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint8x16_t mask =
+        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    uint8x16_t b = vreinterpretq_u8_m128i(_b);
+    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
+}
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
+FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
+{
+    uint64x2_t mask =
+        vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
+#if defined(__aarch64__)
+    float64x2_t a = vreinterpretq_f64_m128d(_a);
+    float64x2_t b = vreinterpretq_f64_m128d(_b);
+    return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
+#else
+    uint64x2_t a = vreinterpretq_u64_m128d(_a);
+    uint64x2_t b = vreinterpretq_u64_m128d(_b);
+    return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
+#endif
+}
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
+FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint32x4_t mask =
+        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Round the packed double-precision (64-bit) floating-point elements in a up
+// to an integer value, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double *f = (double *) &a;
+    return _mm_set_pd(ceil(f[1]), ceil(f[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a up to
+// an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
+FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
+#else
+    float *f = (float *) &a;
+    return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b up to
+// an integer value, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
+FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_ceil_pd(b));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b up to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
+FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_ceil_ps(b));
+}
+
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+    // ARMv7 lacks vceqq_u64
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
+
+// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_s32(s32x4);
+}
+
+// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
+// integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
+FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_u32(
+        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
+}
+
+// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
+{
+    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
+FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_u64(
+        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
+}
+
+// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
+    return vreinterpretq_m128i_u16(u16x8);
+}
+
+// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
+FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_u32(u32x4);
+}
+
+// Zero extend packed unsigned 8-bit integers in the low 8 byte sof a to packed
+// 64-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Conditionally multiply the packed double-precision (64-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products, and
+// conditionally store the sum in dst using the low 4 bits of imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
+FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
+{
+    // Generate mask value from constant immediate bit value
+    const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
+    const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
+#if !SSE2NEON_PRECISE_DP
+    const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
+    const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
+#endif
+    // Conditional multiplication
+#if !SSE2NEON_PRECISE_DP
+    __m128d mul = _mm_mul_pd(a, b);
+    const __m128d mulMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
+    __m128d tmp = _mm_and_pd(mul, mulMask);
+#else
+#if defined(__aarch64__)
+    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
+                             : 0;
+    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
+                             : 0;
+#else
+    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
+    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
+#endif
+    __m128d tmp = _mm_set_pd(d1, d0);
+#endif
+    // Sum the products
+#if defined(__aarch64__)
+    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
+#else
+    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
+#endif
+    // Conditionally store the sum
+    const __m128d sumMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
+    __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
+    return res;
+}
+
+// Conditionally multiply the packed single-precision (32-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products,
+// and conditionally store the sum in dst using the low 4 bits of imm.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
+FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
+{
+#if defined(__aarch64__)
+    /* shortcuts */
+    if (imm == 0xFF) {
+        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
+    }
+    if (imm == 0x7F) {
+        float32x4_t m = _mm_mul_ps(a, b);
+        m[3] = 0;
+        return _mm_set1_ps(vaddvq_f32(m));
+    }
+#endif
+
+    float s = 0, c = 0;
+    float32x4_t f32a = vreinterpretq_f32_m128(a);
+    float32x4_t f32b = vreinterpretq_f32_m128(b);
+
+    /* To improve the accuracy of floating-point summation, Kahan algorithm
+     * is used for each operation.
+     */
+    if (imm & (1 << 4))
+        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
+    if (imm & (1 << 5))
+        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
+    if (imm & (1 << 6))
+        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
+    if (imm & (1 << 7))
+        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
+    s += c;
+
+    float32x4_t res = {
+        (imm & 0x1) ? s : 0,
+        (imm & 0x2) ? s : 0,
+        (imm & 0x4) ? s : 0,
+        (imm & 0x8) ? s : 0,
+    };
+    return vreinterpretq_m128_f32(res);
+}
+
+// Extract a 32-bit integer from a, selected with imm8, and store the result in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
+// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
+#define _mm_extract_epi32(a, imm) \
+    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
+
+// Extract a 64-bit integer from a, selected with imm8, and store the result in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
+// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
+#define _mm_extract_epi64(a, imm) \
+    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
+
+// Extract an 8-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
+// __constrange(0,16) int imm)
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
+#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
+
+// Extracts the selected single-precision (32-bit) floating-point from a.
+// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
+#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
+
+// Round the packed double-precision (64-bit) floating-point elements in a down
+// to an integer value, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
+FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double *f = (double *) &a;
+    return _mm_set_pd(floor(f[1]), floor(f[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a down
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
+FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
+#else
+    float *f = (float *) &a;
+    return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b down to
+// an integer value, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
+FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_floor_pd(b));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b down to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
+FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_floor_ps(b));
+}
+
+// Copy a to dst, and insert the 32-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
+// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
+//                                       __constrange(0,4) int imm)
+#define _mm_insert_epi32(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s32(                                     \
+            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
+    })
+
+// Copy a to dst, and insert the 64-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
+// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
+//                                       __constrange(0,2) int imm)
+#define _mm_insert_epi64(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s64(                                     \
+            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
+    })
+
+// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
+// location specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
+// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
+//                                      __constrange(0,16) int imm)
+#define _mm_insert_epi8(a, b, imm)                                 \
+    __extension__({                                                \
+        vreinterpretq_m128i_s8(                                    \
+            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
+    })
+
+// Copy a to tmp, then insert a single-precision (32-bit) floating-point
+// element from b into tmp using the control in imm8. Store tmp to dst using
+// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
+#define _mm_insert_ps(a, b, imm8)                                              \
+    __extension__({                                                            \
+        float32x4_t tmp1 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3),               \
+                           vreinterpretq_f32_m128(a), 0);                      \
+        float32x4_t tmp2 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
+                           ((imm8 >> 4) & 0x3));                               \
+        const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 1)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 2)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 3)) ? UINT32_MAX : 0};       \
+        uint32x4_t mask = vld1q_u32(data);                                     \
+        float32x4_t all_zeros = vdupq_n_f32(0);                                \
+                                                                               \
+        vreinterpretq_m128_f32(                                                \
+            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2)));         \
+    })
+
+// Compare packed signed 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
+FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
+FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
+FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
+FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
+// in a, store the minimum and index in dst, and zero the remaining bits in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
+FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
+{
+    __m128i dst;
+    uint16_t min, idx = 0;
+#if defined(__aarch64__)
+    // Find the minimum value
+    min = vminvq_u16(vreinterpretq_u16_m128i(a));
+
+    // Get the index of the minimum value
+    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint16x8_t minv = vdupq_n_u16(min);
+    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
+    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
+#else
+    // Find the minimum value
+    __m64 tmp;
+    tmp = vreinterpret_m64_u16(
+        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
+                 vget_high_u16(vreinterpretq_u16_m128i(a))));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
+    // Get the index of the minimum value
+    int i;
+    for (i = 0; i < 8; i++) {
+        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
+            idx = (uint16_t) i;
+            break;
+        }
+        a = _mm_srli_si128(a, 2);
+    }
+#endif
+    // Generate result
+    dst = _mm_setzero_si128();
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
+    return dst;
+}
+
+// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
+// 8-bit integers in a compared to those in b, and store the 16-bit results in
+// dst. Eight SADs are performed using one quadruplet from b and eight
+// quadruplets from a. One quadruplet is selected from b starting at on the
+// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
+// integers selected from a starting at the offset specified in imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
+FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
+{
+    uint8x16_t _a, _b;
+
+    switch (imm & 0x4) {
+    case 0:
+        // do nothing
+        _a = vreinterpretq_u8_m128i(a);
+        break;
+    case 4:
+        _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
+                                            vreinterpretq_u32_m128i(a), 1));
+        break;
+    default:
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_unreachable();
+#endif
+        break;
+    }
+
+    switch (imm & 0x3) {
+    case 0:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
+        break;
+    case 1:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
+        break;
+    case 2:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
+        break;
+    case 3:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
+        break;
+    default:
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_unreachable();
+#endif
+        break;
+    }
+
+    int16x8_t c04, c15, c26, c37;
+    uint8x8_t low_b = vget_low_u8(_b);
+    c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
+    uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
+    c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
+    uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
+    c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
+    uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
+    c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
+#if defined(__aarch64__)
+    // |0|4|2|6|
+    c04 = vpaddq_s16(c04, c26);
+    // |1|5|3|7|
+    c15 = vpaddq_s16(c15, c37);
+
+    int32x4_t trn1_c =
+        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
+    int32x4_t trn2_c =
+        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
+    return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
+                                              vreinterpretq_s16_s32(trn2_c)));
+#else
+    int16x4_t c01, c23, c45, c67;
+    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
+    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
+    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
+    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
+
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
+#endif
+}
+
+// Multiply the low signed 32-bit integers from each packed 64-bit element in
+// a and b, and store the signed 64-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
+{
+    // vmull_s32 upcasts instead of masking, so we downcast.
+    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
+    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
+}
+
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
+// integers, and store the low 32 bits of the intermediate integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
+FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
+// using unsigned saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Round the packed double-precision (64-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
+FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
+{
+#if defined(__aarch64__)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return _mm_floor_pd(a);
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return _mm_ceil_pd(a);
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
+    }
+#else
+    double *v_double = (double *) &a;
+
+    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
+        (rounding == _MM_FROUND_CUR_DIRECTION &&
+         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
+        double res[2], tmp;
+        for (int i = 0; i < 2; i++) {
+            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
+            double roundDown = floor(tmp);  // Round down value
+            double roundUp = ceil(tmp);     // Round up value
+            double diffDown = tmp - roundDown;
+            double diffUp = roundUp - tmp;
+            if (diffDown < diffUp) {
+                /* If it's closer to the round down value, then use it */
+                res[i] = roundDown;
+            } else if (diffDown > diffUp) {
+                /* If it's closer to the round up value, then use it */
+                res[i] = roundUp;
+            } else {
+                /* If it's equidistant between round up and round down value,
+                 * pick the one which is an even number */
+                double half = roundDown / 2;
+                if (half != floor(half)) {
+                    /* If the round down value is odd, return the round up value
+                     */
+                    res[i] = roundUp;
+                } else {
+                    /* If the round up value is odd, return the round down value
+                     */
+                    res[i] = roundDown;
+                }
+            }
+            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
+        }
+        return _mm_set_pd(res[1], res[0]);
+    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
+        return _mm_floor_pd(a);
+    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
+        return _mm_ceil_pd(a);
+    }
+    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
+                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed single-precision
+// floating-point elements in dst.
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
+FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return _mm_floor_ps(a);
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return _mm_ceil_ps(a);
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
+    }
+#else
+    float *v_float = (float *) &a;
+
+    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
+        (rounding == _MM_FROUND_CUR_DIRECTION &&
+         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
+        uint32x4_t signmask = vdupq_n_u32(0x80000000);
+        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
+        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+        int32x4_t r_trunc = vcvtq_s32_f32(
+            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+        float32x4_t delta = vsubq_f32(
+            vreinterpretq_f32_m128(a),
+            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+        uint32x4_t is_delta_half =
+            vceqq_f32(delta, half); /* delta == +/- 0.5 */
+        return vreinterpretq_m128_f32(
+            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
+    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
+        return _mm_floor_ps(a);
+    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
+        return _mm_ceil_ps(a);
+    }
+    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
+                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
+                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
+                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b using
+// the rounding parameter, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
+FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
+{
+    return _mm_move_sd(a, _mm_round_pd(b, rounding));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b using
+// the rounding parameter, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst. Rounding is done according to the
+// rounding[3:0] parameter, which can be one of:
+//     (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
+//     suppress exceptions
+//     (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and
+//     suppress exceptions
+//     (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress
+//     exceptions
+//     (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress
+//     exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
+//     _MM_SET_ROUNDING_MODE
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
+FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
+{
+    return _mm_move_ss(a, _mm_round_ps(b, rounding));
+}
+
+// Load 128-bits of integer data from memory into dst using a non-temporal
+// memory hint. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
+FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    return __builtin_nontemporal_load(p);
+#else
+    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
+#endif
+}
+
+// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
+// all 1's, and return 1 if the result is zero, otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
+FORCE_INLINE int _mm_test_all_ones(__m128i a)
+{
+    return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
+           ~(uint64_t) 0;
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and return 1 if the result is zero, otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
+FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
+{
+    int64x2_t a_and_mask =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
+    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
+// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
+// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
+// otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
+FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
+{
+    uint64x2_t zf =
+        vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
+    uint64x2_t cf =
+        vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
+    uint64x2_t result = vandq_u64(zf, cf);
+    return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the CF value.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
+FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
+{
+    int64x2_t s64 =
+        vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
+// otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
+#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the ZF value.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
+FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
+{
+    int64x2_t s64 =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+/* SSE4.2 */
+
+const static uint16_t _sse2neon_cmpestr_mask16b[8] ALIGN_STRUCT(16) = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+};
+const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+};
+
+/* specify the source data format */
+#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
+#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
+#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
+#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
+
+/* specify the comparison operation */
+#define _SIDD_CMP_EQUAL_ANY 0x00     /* compare equal any: strchr */
+#define _SIDD_CMP_RANGES 0x04        /* compare ranges */
+#define _SIDD_CMP_EQUAL_EACH 0x08    /* compare equal each: strcmp */
+#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
+
+/* specify the polarity */
+#define _SIDD_POSITIVE_POLARITY 0x00
+#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
+#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
+#define _SIDD_MASKED_NEGATIVE_POLARITY \
+    0x30 /* negate results only before end of string */
+
+/* specify the output selection in _mm_cmpXstri */
+#define _SIDD_LEAST_SIGNIFICANT 0x00
+#define _SIDD_MOST_SIGNIFICANT 0x40
+
+/* specify the output selection in _mm_cmpXstrm */
+#define _SIDD_BIT_MASK 0x00
+#define _SIDD_UNIT_MASK 0x40
+
+/* Pattern Matching for C macros.
+ * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
+ */
+
+/* catenate */
+#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
+#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
+
+#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
+/* run the 2nd parameter */
+#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
+/* run the 1st parameter */
+#define SSE2NEON_IIF_1(t, ...) t
+
+#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
+#define SSE2NEON_COMPL_0 1
+#define SSE2NEON_COMPL_1 0
+
+#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
+#define SSE2NEON_DEC_1 0
+#define SSE2NEON_DEC_2 1
+#define SSE2NEON_DEC_3 2
+#define SSE2NEON_DEC_4 3
+#define SSE2NEON_DEC_5 4
+#define SSE2NEON_DEC_6 5
+#define SSE2NEON_DEC_7 6
+#define SSE2NEON_DEC_8 7
+#define SSE2NEON_DEC_9 8
+#define SSE2NEON_DEC_10 9
+#define SSE2NEON_DEC_11 10
+#define SSE2NEON_DEC_12 11
+#define SSE2NEON_DEC_13 12
+#define SSE2NEON_DEC_14 13
+#define SSE2NEON_DEC_15 14
+#define SSE2NEON_DEC_16 15
+
+/* detection */
+#define SSE2NEON_CHECK_N(x, n, ...) n
+#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
+#define SSE2NEON_PROBE(x) x, 1,
+
+#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
+#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
+
+#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
+#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
+
+#define SSE2NEON_EAT(...)
+#define SSE2NEON_EXPAND(...) __VA_ARGS__
+#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
+
+/* recursion */
+/* deferred expression */
+#define SSE2NEON_EMPTY()
+#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
+#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
+#define SSE2NEON_EXPAND(...) __VA_ARGS__
+
+#define SSE2NEON_EVAL(...) \
+    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
+#define SSE2NEON_EVAL1(...) \
+    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
+#define SSE2NEON_EVAL2(...) \
+    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
+#define SSE2NEON_EVAL3(...) __VA_ARGS__
+
+#define SSE2NEON_REPEAT(count, macro, ...)                         \
+    SSE2NEON_WHEN(count)                                           \
+    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()(                \
+        SSE2NEON_DEC(count), macro,                                \
+        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
+                                              __VA_ARGS__))
+#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
+
+#define SSE2NEON_SIZE_OF_byte 8
+#define SSE2NEON_NUMBER_OF_LANES_byte 16
+#define SSE2NEON_SIZE_OF_word 16
+#define SSE2NEON_NUMBER_OF_LANES_word 8
+
+#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type)                         \
+    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type(                          \
+        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
+        vreinterpretq_##type##_m128i(a)));
+
+#define SSE2NEON_FILL_LANE(i, type) \
+    vec_b[i] =                      \
+        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
+
+#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size,        \
+                       number_of_lanes, byte_or_word)                         \
+    do {                                                                      \
+        SSE2NEON_CAT(                                                         \
+            data_type_prefix,                                                 \
+            SSE2NEON_CAT(size,                                                \
+                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
+        vec_b[number_of_lanes];                                               \
+        __m128i mask = SSE2NEON_IIF(byte_or_word)(                            \
+            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)),                       \
+            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff)));                    \
+        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE,    \
+                                      SSE2NEON_CAT(type_prefix, size)))       \
+        for (int i = 0; i < number_of_lanes; i++) {                           \
+            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u,                      \
+                                  size)(SSE2NEON_CAT(vbslq_u, size)(          \
+                SSE2NEON_CAT(vreinterpretq_u,                                 \
+                             SSE2NEON_CAT(size, _m128i))(mask),               \
+                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))(        \
+                    vec_b[i],                                                 \
+                    SSE2NEON_CAT(                                             \
+                        vreinterpretq_,                                       \
+                        SSE2NEON_CAT(type_prefix,                             \
+                                     SSE2NEON_CAT(size, _m128i(a))))),        \
+                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))(        \
+                    vec_b[i],                                                 \
+                    SSE2NEON_CAT(                                             \
+                        vreinterpretq_,                                       \
+                        SSE2NEON_CAT(type_prefix,                             \
+                                     SSE2NEON_CAT(size, _m128i(a)))))));      \
+        }                                                                     \
+    } while (0)
+
+#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes)                         \
+    do {                                                                     \
+        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes,                       \
+                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
+                                      SSE2NEON_CAT(u, size)))                \
+    } while (0)
+
+#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
+    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
+                                                int lb)                       \
+    {                                                                         \
+        __m128i mtx[16];                                                      \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
+        return SSE2NEON_CAT(                                                  \
+            _sse2neon_aggregate_equal_any_,                                   \
+            SSE2NEON_CAT(                                                     \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
+                                             type))))(la, lb, mtx);           \
+    }
+
+#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
+    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
+                                                 int lb)                       \
+    {                                                                          \
+        __m128i mtx[16];                                                       \
+        PCMPSTR_RANGES(                                                        \
+            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
+        return SSE2NEON_CAT(                                                   \
+            _sse2neon_aggregate_ranges_,                                       \
+            SSE2NEON_CAT(                                                      \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
+                                             type))))(la, lb, mtx);            \
+    }
+
+#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                  \
+    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,         \
+                                                    __m128i b, int lb)         \
+    {                                                                          \
+        __m128i mtx[16];                                                       \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),           \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));             \
+        return SSE2NEON_CAT(                                                   \
+            _sse2neon_aggregate_equal_ordered_,                                \
+            SSE2NEON_CAT(                                                      \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
+                SSE2NEON_CAT(x,                                                \
+                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);       \
+    }
+
+static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u8(
+            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u8(
+            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
+        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
+        res |= (tmp << j);
+    }
+    return res;
+}
+
+static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint16x8_t vec =
+        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u16(
+            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
+        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
+        res |= (tmp << j);
+    }
+    return res;
+}
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
+    prefix##IMPL(byte) \
+    prefix##IMPL(word)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
+
+static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint16x8_t vec =
+        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u16(
+            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
+        __m128i tmp = vreinterpretq_m128i_u32(
+            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
+        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
+                                       vreinterpretq_u32_m128i(tmp));
+#if defined(__aarch64__)
+        int t = vaddvq_u32(vec_res) ? 1 : 0;
+#else
+        uint64x2_t sumh = vpaddlq_u32(vec_res);
+        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+        res |= (t << j);
+    }
+    return res;
+}
+
+static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u8(
+            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u8(
+            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
+        __m128i tmp = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
+        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
+                                       vreinterpretq_u16_m128i(tmp));
+        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
+        res |= (t << j);
+    }
+    return res;
+}
+
+#define SSE2NEON_CMP_RANGES_IS_BYTE 1
+#define SSE2NEON_CMP_RANGES_IS_WORD 0
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_RANGES(prefix)             \
+    prefix##IMPL(byte, uint, u, prefix##IS_BYTE)         \
+    prefix##IMPL(byte, int, s, prefix##IS_BYTE)          \
+    prefix##IMPL(word, uint, u, prefix##IS_WORD)         \
+    prefix##IMPL(word, int, s, prefix##IS_WORD)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
+
+#undef SSE2NEON_CMP_RANGES_IS_BYTE
+#undef SSE2NEON_CMP_RANGES_IS_WORD
+
+static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
+{
+    uint8x16_t mtx =
+        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
+    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    int m1 = 0x10000 - (1 << la);
+    int tb = 0x10000 - (1 << lb);
+    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
+    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
+    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
+    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
+    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
+    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
+    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
+    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
+
+    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
+    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
+    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
+    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
+    res_lo = vand_u8(res_lo, vec_mask);
+    res_hi = vand_u8(res_hi, vec_mask);
+
+    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
+    return res;
+}
+
+static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
+{
+    uint16x8_t mtx =
+        vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    int m1 = 0x100 - (1 << la);
+    int tb = 0x100 - (1 << lb);
+    uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
+    uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
+    uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
+    uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
+    mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
+    mtx = vbslq_u16(vec1, tmp, mtx);
+    mtx = vandq_u16(mtx, vec_mask);
+    return _sse2neon_vaddvq_u16(mtx);
+}
+
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
+
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
+    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
+        int bound, int la, int lb, __m128i mtx[16])                            \
+    {                                                                          \
+        int res = 0;                                                           \
+        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
+        uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
+            vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
+            vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
+        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
+            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
+                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
+            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
+        uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
+        uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
+        for (int j = 0; j < lb; j++) {                                         \
+            mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size(                \
+                vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j])));   \
+        }                                                                      \
+        for (int j = lb; j < bound; j++) {                                     \
+            mtx[j] = vreinterpretq_m128i_u##size(                              \
+                vbslq_u##size(vec1, vec_minusone, vec_zero));                  \
+        }                                                                      \
+        unsigned SSE2NEON_IIF(data_type)(char, short) *ptr =                   \
+            (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx;             \
+        for (int i = 0; i < bound; i++) {                                      \
+            int val = 1;                                                       \
+            for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
+                val &= ptr[k * bound + j];                                     \
+            res += val << i;                                                   \
+        }                                                                      \
+        return res;                                                            \
+    }
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
+    prefix##IMPL(8, 16, prefix##IS_UBYTE)               \
+    prefix##IMPL(16, 8, prefix##IS_UWORD)
+/* clang-format on */
+
+SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
+
+#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
+#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
+    prefix##IMPL(byte)                              \
+    prefix##IMPL(word)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
+
+#define SSE2NEON_CMPESTR_LIST                          \
+    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
+    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any)         \
+    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
+    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any)         \
+    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges)              \
+    _(CMP_UWORD_RANGES, cmp_uword_ranges)              \
+    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges)              \
+    _(CMP_SWORD_RANGES, cmp_sword_ranges)              \
+    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
+    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each)       \
+    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
+    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each)       \
+    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
+    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
+
+enum {
+#define _(name, func_suffix) name,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
+static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
+#define _(name, func_suffix) _sse2neon_##func_suffix,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+
+FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
+{
+    switch (imm8 & 0x30) {
+    case _SIDD_NEGATIVE_POLARITY:
+        res ^= 0xffffffff;
+        break;
+    case _SIDD_MASKED_NEGATIVE_POLARITY:
+        res ^= (1 << lb) - 1;
+        break;
+    default:
+        break;
+    }
+
+    return res & ((bound == 8) ? 0xFF : 0xFFFF);
+}
+
+FORCE_INLINE int _sse2neon_clz(unsigned int x)
+{
+#if _MSC_VER
+    DWORD cnt = 0;
+    if (_BitScanForward(&cnt, x))
+        return cnt;
+    return 32;
+#else
+    return x != 0 ? __builtin_clz(x) : 32;
+#endif
+}
+
+FORCE_INLINE int _sse2neon_ctz(unsigned int x)
+{
+#if _MSC_VER
+    DWORD cnt = 0;
+    if (_BitScanReverse(&cnt, x))
+        return 31 - cnt;
+    return 32;
+#else
+    return x != 0 ? __builtin_ctz(x) : 32;
+#endif
+}
+
+FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
+{
+#if _MSC_VER
+    unsigned long cnt;
+#ifdef defined(SSE2NEON_HAS_BITSCAN64)
+    (defined(_M_AMD64) || defined(__x86_64__))
+        if((_BitScanForward64(&cnt, x))
+            return (int)(cnt);
+#else
+    if (_BitScanForward(&cnt, (unsigned long) (x)))
+        return (int) cnt;
+    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
+        return (int) (cnt + 32);
+#endif
+    return 64;
+#else
+    return x != 0 ? __builtin_ctzll(x) : 64;
+#endif
+}
+
+#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
+
+#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
+    const int var = (imm & 0x01) ? 8 : 16
+
+#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
+    int tmp1 = la ^ (la >> 31);                  \
+    la = tmp1 - (la >> 31);                      \
+    int tmp2 = lb ^ (lb >> 31);                  \
+    lb = tmp2 - (lb >> 31);                      \
+    la = SSE2NEON_MIN(la, bound);                \
+    lb = SSE2NEON_MIN(lb, bound)
+
+// Compare all pairs of character in string a and b,
+// then aggregate the result.
+// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
+// length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
+// string a and b.
+#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
+    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
+    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
+    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
+
+#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
+    return (r2 == 0) ? bound                                     \
+                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
+                                      : _sse2neon_ctz(r2))
+
+#define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
+    __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
+    if (imm8 & 0x40) {                                                         \
+        if (bound == 8) {                                                      \
+            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
+                                       vld1q_u16(_sse2neon_cmpestr_mask16b));  \
+            dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
+                tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
+        } else {                                                               \
+            uint8x16_t vec_r2 =                                                \
+                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
+            uint8x16_t tmp =                                                   \
+                vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
+            dst = vreinterpretq_m128i_u8(                                      \
+                vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst)));   \
+        }                                                                      \
+    } else {                                                                   \
+        if (bound == 16) {                                                     \
+            dst = vreinterpretq_m128i_u16(                                     \
+                vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
+        } else {                                                               \
+            dst = vreinterpretq_m128i_u8(                                      \
+                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
+        }                                                                      \
+    }                                                                          \
+    return dst
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and returns 1 if b did not contain a null character and the
+// resulting mask was zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
+FORCE_INLINE int _mm_cmpestra(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    int lb_cpy = lb;
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return !r2 & (lb_cpy > bound);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
+FORCE_INLINE int _mm_cmpestrc(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return r2 != 0;
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and store the generated index in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
+FORCE_INLINE int _mm_cmpestri(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and store the generated mask in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
+FORCE_INLINE __m128i
+_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns bit 0 of the resulting bit mask.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
+FORCE_INLINE int _mm_cmpestro(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return r2 & 1;
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns 1 if any character in a was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
+FORCE_INLINE int _mm_cmpestrs(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    return la <= (bound - 1);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns 1 if any character in b was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
+FORCE_INLINE int _mm_cmpestrz(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    return lb <= (bound - 1);
+}
+
+#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
+    do {                                                                 \
+        if (imm8 & 0x01) {                                               \
+            uint16x8_t equal_mask_##str =                                \
+                vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
+            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
+            uint64_t matches_##str =                                     \
+                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
+            len = _sse2neon_ctzll(matches_##str) >> 3;                   \
+        } else {                                                         \
+            uint16x8_t equal_mask_##str = vreinterpretq_u16_u8(          \
+                vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0)));   \
+            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
+            uint64_t matches_##str =                                     \
+                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
+            len = _sse2neon_ctzll(matches_##str) >> 2;                   \
+        }                                                                \
+    } while (0)
+
+#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
+    int la, lb;                                  \
+    do {                                         \
+        SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);   \
+        SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);   \
+    } while (0)
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if b did not contain a null character and the resulting
+// mask was zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
+FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return !r2 & (lb >= bound);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
+FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return r2 != 0;
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and store the generated index in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
+FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and store the generated mask in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
+FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns bit 0 of the resulting bit mask.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
+FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return r2 & 1;
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if any character in a was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
+FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    int la;
+    SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
+    return la <= (bound - 1);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if any character in b was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
+FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    int lb;
+    SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
+    return lb <= (bound - 1);
+}
+
+// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
+// in b for greater than.
+FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    return vreinterpretq_m128i_s64(vshrq_n_s64(
+        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
+        63));
+#endif
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 16-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
+FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32ch(crc, v);
+#else
+    crc = _mm_crc32_u8(crc, v & 0xff);
+    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 32-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
+FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32cw(crc, v);
+#else
+    crc = _mm_crc32_u16(crc, v & 0xffff);
+    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 64-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
+FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#else
+    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32cb(crc, v);
+#else
+    crc ^= v;
+    for (int bit = 0; bit < 8; bit++) {
+        if (crc & 1)
+            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
+        else
+            crc = (crc >> 1);
+    }
+#endif
+    return crc;
+}
+
+/* AES */
+
+#if !defined(__ARM_FEATURE_CRYPTO)
+/* clang-format off */
+#define SSE2NEON_AES_SBOX(w)                                           \
+    {                                                                  \
+        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
+        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
+        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
+        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
+        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
+        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
+        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
+        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
+        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
+        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
+        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
+        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
+        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
+        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
+        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
+        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
+        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
+        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
+        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
+        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
+        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
+        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
+        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
+        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
+        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
+        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
+        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
+        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
+        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
+        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
+        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
+        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
+        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
+        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
+        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
+        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
+        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
+    }
+#define SSE2NEON_AES_RSBOX(w)                                          \
+    {                                                                  \
+        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
+        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
+        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
+        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
+        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
+        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
+        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
+        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
+        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
+        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
+        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
+        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
+        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
+        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
+        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
+        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
+        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
+        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
+        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
+        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
+        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
+        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
+        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
+        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
+        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
+        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
+        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
+        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
+        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
+        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
+        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
+        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
+        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
+        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
+        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
+        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
+        w(0x55), w(0x21), w(0x0c), w(0x7d)                             \
+    }
+/* clang-format on */
+
+/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
+#define SSE2NEON_AES_H0(x) (x)
+static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
+static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
+#undef SSE2NEON_AES_H0
+
+/* x_time function and matrix multiply function */
+#if !defined(__aarch64__)
+#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
+#define SSE2NEON_MULTIPLY(x, y)                                  \
+    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
+     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
+     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
+     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
+#endif
+
+// In the absence of crypto extensions, implement aesenc using regular NEON
+// intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
+// for more information.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t shift_rows[] = {
+        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+    };
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    /* shift rows */
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    /* sub bytes */
+    // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
+    // look up each of the table. After each lookup, we load the next table
+    // which locates at the next 64-bytes. In the meantime, the index in the
+    // table would be smaller than it was, so the index parameters of
+    // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
+    // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
+
+    /* mix columns */
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    /* add round key */
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A implementation for a table-based AES */
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
+    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
+     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
+// muliplying 'x' by 2 in GF(2^8)
+#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
+// muliplying 'x' by 3 in GF(2^8)
+#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
+#define SSE2NEON_AES_U0(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
+#define SSE2NEON_AES_U1(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
+#define SSE2NEON_AES_U2(p) \
+    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
+#define SSE2NEON_AES_U3(p) \
+    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
+
+    // this generates a table containing every possible permutation of
+    // shift_rows() and sub_bytes() with mix_columns().
+    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
+    };
+#undef SSE2NEON_AES_B2W
+#undef SSE2NEON_AES_F2
+#undef SSE2NEON_AES_F3
+#undef SSE2NEON_AES_U0
+#undef SSE2NEON_AES_U1
+#undef SSE2NEON_AES_U2
+#undef SSE2NEON_AES_U3
+
+    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
+    uint32_t x1 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
+    uint32_t x2 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
+    uint32_t x3 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]
+
+    // finish the modulo addition step in mix_columns()
+    __m128i out = _mm_set_epi32(
+        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
+         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
+        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
+         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
+        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
+         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
+        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
+         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
+
+    return _mm_xor_si128(out, RoundKey);
+#endif
+}
+
+// Perform one round of an AES decryption flow on data (state) in a using the
+// round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
+FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t inv_shift_rows[] = {
+        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
+        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
+    };
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // inverse shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
+
+    // inverse sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
+
+    // inverse mix columns
+    // muliplying 'v' by 4 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
+    v ^= w;
+    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
+
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
+                                 0x1b);  // muliplying 'v' by 2 in GF(2^8)
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    // add round key
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+    /* FIXME: optimized for NEON */
+    uint8_t i, e, f, g, h, v[4][4];
+    uint8_t *_a = (uint8_t *) &a;
+    for (i = 0; i < 16; ++i) {
+        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
+    }
+
+    // inverse mix columns
+    for (i = 0; i < 4; ++i) {
+        e = v[i][0];
+        f = v[i][1];
+        g = v[i][2];
+        h = v[i][3];
+
+        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
+                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
+        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
+                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
+        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
+                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
+        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
+                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+#endif
+}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t shift_rows[] = {
+        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    // sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
+
+    // add round key
+    return vreinterpretq_m128i_u8(v) ^ RoundKey;
+
+#else /* ARMv7-A implementation */
+    uint8_t v[16] = {
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
+    };
+
+    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
+#endif
+}
+
+// Perform the last round of an AES decryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t inv_shift_rows[] = {
+        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
+        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // inverse shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
+
+    // inverse sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
+
+    // add round key
+    return vreinterpretq_m128i_u8(v) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+    /* FIXME: optimized for NEON */
+    uint8_t v[4][4];
+    uint8_t *_a = (uint8_t *) &a;
+    for (int i = 0; i < 16; ++i) {
+        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+#endif
+}
+
+// Perform the InvMixColumns transformation on a and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
+{
+#if defined(__aarch64__)
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+    uint8x16_t v = vreinterpretq_u8_m128i(a);
+    uint8x16_t w;
+
+    // multiplying 'v' by 4 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
+    v ^= w;
+    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
+
+    // multiplying 'v' by 2 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+    return vreinterpretq_m128i_u8(w);
+
+#else /* ARMv7-A NEON implementation */
+    uint8_t i, e, f, g, h, v[4][4];
+    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
+    for (i = 0; i < 4; ++i) {
+        e = v[i][0];
+        f = v[i][1];
+        g = v[i][2];
+        h = v[i][3];
+
+        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
+                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
+        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
+                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
+        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
+                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
+        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
+                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
+#endif
+}
+
+// Assist in expanding the AES cipher key by computing steps towards generating
+// a round key for encryption cipher using data from a and an 8-bit round
+// constant specified in imm8, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
+//
+// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
+// This instruction generates a round key for AES encryption. See
+// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
+// for details.
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+#if defined(__aarch64__)
+    uint8x16_t _a = vreinterpretq_u8_m128i(a);
+    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
+
+    uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
+    uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
+    uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
+
+    return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
+
+#else /* ARMv7-A NEON implementation */
+    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
+    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
+    for (int i = 0; i < 4; ++i) {
+        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
+        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
+    }
+    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
+                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
+#endif
+}
+#undef SSE2NEON_AES_SBOX
+#undef SSE2NEON_AES_RSBOX
+
+#if defined(__aarch64__)
+#undef SSE2NEON_XT
+#undef SSE2NEON_MULTIPLY
+#endif
+
+#else /* __ARM_FEATURE_CRYPTO */
+// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
+// AESMC and then manually applying the real key as an xor operation. This
+// unfortunately means an additional xor op; the compiler should be able to
+// optimize this away for repeated calls however. See
+// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
+// for more details.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
+        vreinterpretq_u8_m128i(b));
+}
+
+// Perform one round of an AES decryption flow on data (state) in a using the
+// round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
+FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
+{
+    return vreinterpretq_m128i_u8(veorq_u8(
+        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+        vreinterpretq_u8_m128i(RoundKey)));
+}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
+                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+                         RoundKey);
+}
+
+// Perform the last round of an AES decryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
+{
+    return vreinterpretq_m128i_u8(
+        vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^
+        vreinterpretq_u8_m128i(RoundKey));
+}
+
+// Perform the InvMixColumns transformation on a and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
+{
+    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
+}
+
+// Assist in expanding the AES cipher key by computing steps towards generating
+// a round key for encryption cipher using data from a and an 8-bit round
+// constant specified in imm8, and store the result in dst."
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+    // AESE does ShiftRows and SubBytes on A
+    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
+
+    uint8x16_t dest = {
+        // Undo ShiftRows step from AESE and extract X1 and X3
+        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
+        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
+        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
+        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
+    };
+    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
+    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
+}
+#endif
+
+/* Others */
+
+// Perform a carry-less multiplication of two 64-bit integers, selected from a
+// and b according to imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
+FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
+{
+    uint64x2_t a = vreinterpretq_u64_m128i(_a);
+    uint64x2_t b = vreinterpretq_u64_m128i(_b);
+    switch (imm & 0x11) {
+    case 0x00:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
+    case 0x01:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
+    case 0x10:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
+    case 0x11:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
+    default:
+        abort();
+    }
+}
+
+FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
+}
+
+// Count the number of bits set to 1 in unsigned 32-bit integer a, and
+// return that count in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
+FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcount)
+    return __builtin_popcount(a);
+#else
+    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
+#endif
+#else
+    uint32_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+
+    vst1_u32(&count, count32x2_val);
+    return count;
+#endif
+}
+
+// Count the number of bits set to 1 in unsigned 64-bit integer a, and
+// return that count in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
+FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcountll)
+    return __builtin_popcountll(a);
+#else
+    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
+#endif
+#else
+    uint64_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+    uint64x1_t count64x1_val;
+
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+    count64x1_val = vpaddl_u32(count32x2_val);
+    vst1_u64(&count, count64x1_val);
+    return count;
+#endif
+}
+
+FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
+{
+    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+    // regardless of the value of the FZ bit.
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+// Return the current 64-bit value of the processor's time-stamp counter.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
+FORCE_INLINE uint64_t _rdtsc(void)
+{
+#if defined(__aarch64__)
+    uint64_t val;
+
+    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
+     * system counter is at least 56 bits wide; from Armv8.6, the counter
+     * must be 64 bits wide.  So the system counter could be less than 64
+     * bits wide and it is attributed with the flag 'cap_user_time_short'
+     * is true.
+     */
+    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
+
+    return val;
+#else
+    uint32_t pmccntr, pmuseren, pmcntenset;
+    // Read the user mode Performance Monitoring Unit (PMU)
+    // User Enable Register (PMUSERENR) access permissions.
+    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
+    if (pmuseren & 1) {  // Allows reading PMUSERENR for user mode code.
+        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
+        if (pmcntenset & 0x80000000UL) {  // Is it counting?
+            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
+            // The counter is set up to count every 64th cycle
+            return (uint64_t) (pmccntr) << 6;
+        }
+    }
+
+    // Fallback to syscall as we can't enable PMUSERENR in user mode.
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
+#endif
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma pop_macro("ALIGN_STRUCT")
+#pragma pop_macro("FORCE_INLINE")
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
+
+#endif
\ No newline at end of file
diff --git a/share/cmake/utils/CheckSupportARMNeon.cmake b/share/cmake/utils/CheckSupportARMNeon.cmake
new file mode 100644
index 0000000000..1123e75f52
--- /dev/null
+++ b/share/cmake/utils/CheckSupportARMNeon.cmake
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+include(CheckCXXSourceCompiles)
+
+set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
+set(CMAKE_REQUIRED_FLAGS "-march=armv8-a+fp+simd+crypto+crc")
+
+check_cxx_source_compiles ("
+    #include <arm_neon.h>
+    int main()
+    {
+        float32x4_t v = vdupq_n_f32(0);
+        return 0;
+    }"
+    HAVE_NEON)
+
+set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
+unset(_cmake_required_flags_old)
+
+mark_as_advanced(HAVE_NEON)
diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index f30bbb763c..5abd707cdb 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -4,6 +4,7 @@
 include(CheckCXXSourceCompiles)
 
 set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
+set(_cmake_required_libraries_old "${CMAKE_REQUIRED_LIBRARIES}")
 
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger
@@ -16,20 +17,30 @@ if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     endif()
 endif()
 
-check_cxx_source_compiles ("
-    #include <emmintrin.h>
-    int main ()
-    {
-        __m128d a, b;
-        double vals[2] = {0};
-        a = _mm_loadu_pd (vals);
-        b = _mm_add_pd (a,a);
-        _mm_storeu_pd (vals,b);
-        return (0);
-    }"
-    HAVE_SSE2)
+set(_SSE2_HEADER "#include <emmintrin.h>")
+if (HAVE_NEON)
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -march=armv8-a+fp+simd+crypto+crc")
+    set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+    set(_SSE2_HEADER "#include <sse2neon.h>")
+endif()
+
+set(_SSE2_TEST_SOURCE_CODE "
+${_SSE2_HEADER}
+int main ()
+{
+    __m128d a, b;
+    double vals[2] = {0};
+    a = _mm_loadu_pd (vals);
+    b = _mm_add_pd (a,a);
+    _mm_storeu_pd (vals,b);
+    return (0);
+}")
+
+check_cxx_source_compiles ("${_SSE2_TEST_SOURCE_CODE}" HAVE_SSE2)
 
 set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
+set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_old}")
 unset(_cmake_required_flags_old)
+unset(_cmake_required_libraries_old)
 
 mark_as_advanced(HAVE_SSE2)
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index b11722f847..dc84eddf70 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -8,6 +8,20 @@
 set(PLATFORM_COMPILE_OPTIONS "")
 set(PLATFORM_LINK_OPTIONS "")
 
+###############################################################################
+# Define if SSE2 can be used.
+# Check for SSE2 first since some compile flags need to be set on Apple ARM.
+
+include(CheckSupportSSE2)
+
+if(NOT HAVE_SSE2)
+    message(STATUS "Disabling SSE optimizations, as the target doesn't support them")
+    set(OCIO_USE_SSE OFF)
+endif(NOT HAVE_SSE2)
+
+###############################################################################
+# Compile flags
+
 if(USE_MSVC)
 
     set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/DUSE_MSVC")
@@ -40,7 +54,9 @@ elseif(USE_CLANG)
 
     # Use of 'register' specifier must be removed for C++17 support.
     set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-Wno-deprecated-register")
-
+    if (HAVE_SSE2 AND HAVE_NEON)
+        set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS};-march=armv8-a+fp+simd+crypto+crc")
+    endif()
 elseif(USE_GCC)
 
     set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-DUSE_GCC")
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 6c2693db2f..bcfd356d53 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -292,6 +292,13 @@ target_link_libraries(OpenColorIO
         MINIZIP::minizip-ng
 )
 
+if(OCIO_USE_SSE AND HAVE_NEON)
+    target_link_libraries(OpenColorIO
+        PRIVATE
+            "$<BUILD_INTERFACE:sse2neon>"
+    )
+endif()
+
 if(APPLE)
     target_link_libraries(OpenColorIO
         PRIVATE
@@ -327,6 +334,13 @@ if(OCIO_USE_SSE)
         PRIVATE
             USE_SSE
     )
+
+    if(HAVE_NEON)
+        target_compile_definitions(OpenColorIO
+            PRIVATE
+                USE_SSE2NEON
+        )
+    endif()
 endif()
 
 if(MSVC AND BUILD_TYPE_DEBUG AND BUILD_SHARED_LIBS)
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index a903120ec7..71b7faf6f9 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -8,11 +8,13 @@
 
 #ifdef USE_SSE
 
-
-#include <emmintrin.h>
+#ifndef USE_SSE2NEON
+    #include <emmintrin.h>
+#else
+    #include <sse2neon.h>
+#endif
 #include <stdio.h>
 
-
 #include <OpenColorIO/OpenColorIO.h>
 
 
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index 431d570f4e..3d6cbc7506 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -26,6 +26,13 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
             xxHash
     )
 
+    if(OCIO_USE_SSE AND HAVE_NEON)
+        target_link_libraries(${TEST_BINARY}
+            PRIVATE
+                sse2neon
+        )
+    endif()
+
     if(APPLE)
         # Frameworks needed to access the ICC monitor profile. 
         target_link_libraries(${TEST_BINARY}
@@ -43,12 +50,21 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
                 "${CMAKE_BINARY_DIR}/generated_include"
         )
     endif(PRIVATE_INCLUDES)
+
     if(OCIO_USE_SSE)
         target_compile_definitions(${TEST_BINARY}
             PRIVATE
                 USE_SSE
         )
+
+        if(HAVE_NEON)
+            target_compile_definitions(${TEST_BINARY}
+                PRIVATE
+                    USE_SSE2NEON
+            )
+        endif()
     endif(OCIO_USE_SSE)
+
     if(WIN32)
         # A windows application linking to eXpat static libraries must
         # have the global macro XML_STATIC defined
@@ -66,6 +82,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
             )
         endif()
     endif(WIN32)
+
     set_target_properties(${TEST_BINARY} PROPERTIES
         COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
         LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"

From e674b5a83b301acb1ba8969c518ef55d56fd0890 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 9 Feb 2023 10:48:28 -0500
Subject: [PATCH 18/81] Overwriting sse2neon implementation for SSE2 _mm_max_ps
 and _mm_min_ps to fix issues with NaN handling. Fixing issues in the unit
 tests for some tests. For some tests, the neon implementation is matching the
 C++ implementation instead of the SSE2 implementation.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 src/OpenColorIO/SSE.h                | 29 ++++++++++++++++++++++++++++
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 20 ++++++++++++++-----
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 71b7faf6f9..567c5a1c64 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -22,6 +22,35 @@
 namespace OCIO_NAMESPACE
 {
 
+// Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since
+// it is redefining two of the functions from sse2neon.
+#ifdef USE_SSE2NEON
+    // Overwrite the translation of _mm_max_ps and _mm_min_ps.
+    // Using vmaxnmq_f32 and vminnmq_f32 instead.
+
+    // Compare packed single-precision (32-bit) floating-point elements in a and b,
+    // and store packed maximum values in dst. dst does not follow the IEEE Standard
+    // for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
+    // signed-zero values.
+    // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
+    static inline __m128 _mm_max_ps(__m128 a, __m128 b)
+    {
+        return vreinterpretq_m128_f32(
+            vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    }
+
+    // Compare packed single-precision (32-bit) floating-point elements in a and b,
+    // and store packed minimum values in dst. dst does not follow the IEEE Standard
+    // for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
+    // signed-zero values.
+    // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
+    static inline __m128 _mm_min_ps(__m128 a, __m128 b)
+    {
+        return vreinterpretq_m128_f32(
+            vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    }
+#endif
+
 // Macros for alignment declarations
 #define OCIO_SIMD_BYTES 16
 #if defined( _MSC_VER )
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 79649a23ae..7ac48ed1f6 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -130,7 +130,9 @@ void TestAntiLog(float logBase)
         OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f));
     }
 #ifdef USE_SSE
-    OCIO_CHECK_EQUAL(rgba[8], inf);
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_EQUAL(rgba[8], inf);
+    #endif
 #else
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
 #endif
@@ -138,7 +140,9 @@ void TestAntiLog(float logBase)
     OCIO_CHECK_CLOSE(rgba[12], 1.0f, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 #ifdef USE_SSE
-    OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
+    #endif
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
@@ -269,7 +273,9 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     const float res0 = ComputeLog2LinEval(0.0f, redP);
 
 #ifdef USE_SSE
-    OCIO_CHECK_EQUAL(rgba[8], inf);
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_EQUAL(rgba[8], inf);
+    #endif
 #else
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
 #endif
@@ -280,7 +286,9 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
 #ifdef USE_SSE
-    OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
+    #endif
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
@@ -579,7 +587,9 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO_CHECK_CLOSE(rgba[6],  1.16f, 10.0f * error);
     OCIO_CHECK_EQUAL(rgba[8], -inf);
 #ifdef USE_SSE
-    OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
+    #endif
 #else
     OCIO_CHECK_EQUAL(rgba[9], inf);
 #endif

From 7bd946ea78c1bb8bfbb3eb7e6b90526214f86db5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 10 Feb 2023 14:36:26 -0500
Subject: [PATCH 19/81] Changed how we handle sse2neon to match other
 dependencies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                                |   10 +-
 ext/sse2neon/CMakeLists.txt                   |    9 -
 ext/sse2neon/src/include/sse2neon.h           | 9079 -----------------
 share/cmake/modules/Findsse2neon.cmake        |   58 +
 .../modules/install/Installsse2neon.cmake     |   36 +
 src/OpenColorIO/CMakeLists.txt                |    2 +-
 6 files changed, 104 insertions(+), 9090 deletions(-)
 delete mode 100644 ext/sse2neon/CMakeLists.txt
 delete mode 100644 ext/sse2neon/src/include/sse2neon.h
 create mode 100644 share/cmake/modules/Findsse2neon.cmake
 create mode 100644 share/cmake/modules/install/Installsse2neon.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a05df702b8..abe881c627 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -195,7 +195,15 @@ include(CheckSupportARMNeon)
 # Add sse2neon to the build since CompilerFlags needs to know if SSE2 is supported.
 
 if(HAVE_NEON)
-    add_subdirectory(ext/sse2neon)
+    # Install sse2nenon. Please note that sse2neon is downloaded during the configure step as it is
+    # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
+
+    # Could use ocio_handle_dependency once this is rebased from main (once the CMake min and 
+    # recommended version branch is merged).
+    find_package(sse2neon QUIET)
+    if (NOT sse2neon_FOUND)
+        include(Installsse2neon)
+    endif()
 endif()
 
 
diff --git a/ext/sse2neon/CMakeLists.txt b/ext/sse2neon/CMakeLists.txt
deleted file mode 100644
index 7fd5fcb302..0000000000
--- a/ext/sse2neon/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright Contributors to the OpenColorIO Project.
-
-# sse2neon (modified)
-# https://github.com/DLTcollab/sse2neon
-add_library(sse2neon INTERFACE IMPORTED GLOBAL)
-set_target_properties(sse2neon PROPERTIES
-    INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/src/include"
-)
diff --git a/ext/sse2neon/src/include/sse2neon.h b/ext/sse2neon/src/include/sse2neon.h
deleted file mode 100644
index 164c6c3387..0000000000
--- a/ext/sse2neon/src/include/sse2neon.h
+++ /dev/null
@@ -1,9079 +0,0 @@
-#ifndef SSE2NEON_H
-#define SSE2NEON_H
-
-// This header file provides a simple API translation layer
-// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
-//
-// Contributors to this work are:
-//   John W. Ratcliff <jratcliffscarab@gmail.com>
-//   Brandon Rowlett <browlett@nvidia.com>
-//   Ken Fast <kfast@gdeb.com>
-//   Eric van Beurden <evanbeurden@nvidia.com>
-//   Alexander Potylitsin <apotylitsin@nvidia.com>
-//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
-//   Jim Huang <jserv@ccns.ncku.edu.tw>
-//   Mark Cheng <marktwtn@gmail.com>
-//   Malcolm James MacLeod <malcolm@gulden.com>
-//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
-//   Sebastian Pop <spop@amazon.com>
-//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
-//   Danila Kutenin <danilak@google.com>
-//   François Turban (JishinMaster) <francois.turban@gmail.com>
-//   Pei-Hsuan Hung <afcidk@gmail.com>
-//   Yang-Hao Yuan <yuanyanghau@gmail.com>
-//   Syoyo Fujita <syoyo@lighttransport.com>
-//   Brecht Van Lommel <brecht@blender.org>
-//   Jonathan Hue <jhue@adobe.com>
-//   Cuda Chen <clh960524@gmail.com>
-//   Aymen Qader <aymen.qader@arm.com>
-
-/*
- * sse2neon is freely redistributable under the MIT License.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/* Tunable configurations */
-
-/* Enable precise implementation of math operations
- * This would slow down the computation a bit, but gives consistent result with
- * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
- */
-/* _mm_min|max_ps|ss|pd|sd */
-#ifndef SSE2NEON_PRECISE_MINMAX
-#define SSE2NEON_PRECISE_MINMAX (0)
-#endif
-/* _mm_rcp_ps and _mm_div_ps */
-#ifndef SSE2NEON_PRECISE_DIV
-#define SSE2NEON_PRECISE_DIV (0)
-#endif
-/* _mm_sqrt_ps and _mm_rsqrt_ps */
-#ifndef SSE2NEON_PRECISE_SQRT
-#define SSE2NEON_PRECISE_SQRT (0)
-#endif
-/* _mm_dp_pd */
-#ifndef SSE2NEON_PRECISE_DP
-#define SSE2NEON_PRECISE_DP (0)
-#endif
-
-/* compiler specific definitions */
-#if defined(__GNUC__) || defined(__clang__)
-#pragma push_macro("FORCE_INLINE")
-#pragma push_macro("ALIGN_STRUCT")
-#define FORCE_INLINE static inline __attribute__((always_inline))
-#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
-#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
-#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
-#else /* non-GNU / non-clang compilers */
-#warning "Macro name collisions may happen with unsupported compiler."
-#ifndef FORCE_INLINE
-#define FORCE_INLINE static inline
-#endif
-#ifndef ALIGN_STRUCT
-#define ALIGN_STRUCT(x) __declspec(align(x))
-#endif
-#define _sse2neon_likely(x) (x)
-#define _sse2neon_unlikely(x) (x)
-#endif
-
-/* C language does not allow initializing a variable with a function call. */
-#ifdef __cplusplus
-#define _sse2neon_const static const
-#else
-#define _sse2neon_const const
-#endif
-
-#include <stdint.h>
-#include <stdlib.h>
-
-#if defined(_WIN32)
-/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
- * from both MinGW-w64 and MSVC.
- */
-#define SSE2NEON_ALLOC_DEFINED
-#endif
-
-/* If using MSVC */
-#ifdef _MSC_VER
-#include <intrin.h>
-#if (defined(_M_AMD64) || defined(__x86_64__)) || \
-    (defined(_M_ARM) || defined(__arm__))
-#define SSE2NEON_HAS_BITSCAN64
-#endif
-#endif
-
-/* Compiler barrier */
-#define SSE2NEON_BARRIER()                     \
-    do {                                       \
-        __asm__ __volatile__("" ::: "memory"); \
-        (void) 0;                              \
-    } while (0)
-
-/* Memory barriers
- * __atomic_thread_fence does not include a compiler barrier; instead,
- * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
- * semantics.
- */
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
-#include <stdatomic.h>
-#endif
-
-FORCE_INLINE void _sse2neon_smp_mb(void)
-{
-    SSE2NEON_BARRIER();
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
-    !defined(__STDC_NO_ATOMICS__)
-    atomic_thread_fence(memory_order_seq_cst);
-#elif defined(__GNUC__) || defined(__clang__)
-    __atomic_thread_fence(__ATOMIC_SEQ_CST);
-#else
-    /* FIXME: MSVC support */
-#endif
-}
-
-/* Architecture-specific build options */
-/* FIXME: #pragma GCC push_options is only available on GCC */
-#if defined(__GNUC__)
-#if defined(__arm__) && __ARM_ARCH == 7
-/* According to ARM C Language Extensions Architecture specification,
- * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
- * architecture supported.
- */
-#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
-#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
-#endif
-#if !defined(__clang__)
-#pragma GCC push_options
-#pragma GCC target("fpu=neon")
-#endif
-#elif defined(__aarch64__)
-#if !defined(__clang__)
-#pragma GCC push_options
-#pragma GCC target("+simd")
-#endif
-#elif __ARM_ARCH == 8
-#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
-#error \
-    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
-#endif
-#if !defined(__clang__)
-#pragma GCC push_options
-#endif
-#else
-#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
-#endif
-#endif
-
-#include <arm_neon.h>
-#if !defined(__aarch64__) && (__ARM_ARCH == 8)
-#if defined __has_include && __has_include(<arm_acle.h>)
-#include <arm_acle.h>
-#endif
-#endif
-
-/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD
- * and other Arm microarchtectures use.
- * From sysctl -a on Apple M1:
- * hw.cachelinesize: 128
- */
-#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
-#define SSE2NEON_CACHELINE_SIZE 128
-#else
-#define SSE2NEON_CACHELINE_SIZE 64
-#endif
-
-/* Rounding functions require either Aarch64 instructions or libm failback */
-#if !defined(__aarch64__)
-#include <math.h>
-#endif
-
-/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
- * or even not accessible in user mode.
- * To write or access to these registers in user mode,
- * we have to perform syscall instead.
- */
-#if !defined(__aarch64__)
-#include <sys/time.h>
-#endif
-
-/* "__has_builtin" can be used to query support for built-in functions
- * provided by gcc/clang and other compilers that support it.
- */
-#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
-/* Compatibility with gcc <= 9 */
-#if defined(__GNUC__) && (__GNUC__ <= 9)
-#define __has_builtin(x) HAS##x
-#define HAS__builtin_popcount 1
-#define HAS__builtin_popcountll 1
-
-// __builtin_shuffle introduced in GCC 4.7.0
-#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
-#define HAS__builtin_shuffle 1
-#else
-#define HAS__builtin_shuffle 0
-#endif
-
-#define HAS__builtin_shufflevector 0
-#define HAS__builtin_nontemporal_store 0
-#else
-#define __has_builtin(x) 0
-#endif
-#endif
-
-/**
- * MACRO for shuffle parameter for _mm_shuffle_ps().
- * Argument fp3 is a digit[0123] that represents the fp from argument "b"
- * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
- * for fp2 in result. fp1 is a digit[0123] that represents the fp from
- * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
- * fp0 is the same for fp0 of result.
- */
-#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
-    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
-
-#if __has_builtin(__builtin_shufflevector)
-#define _sse2neon_shuffle(type, a, b, ...) \
-    __builtin_shufflevector(a, b, __VA_ARGS__)
-#elif __has_builtin(__builtin_shuffle)
-#define _sse2neon_shuffle(type, a, b, ...) \
-    __extension__({                        \
-        type tmp = {__VA_ARGS__};          \
-        __builtin_shuffle(a, b, tmp);      \
-    })
-#endif
-
-#ifdef _sse2neon_shuffle
-#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
-#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
-#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
-#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
-#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
-#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
-#endif
-
-/* Rounding mode macros. */
-#define _MM_FROUND_TO_NEAREST_INT 0x00
-#define _MM_FROUND_TO_NEG_INF 0x01
-#define _MM_FROUND_TO_POS_INF 0x02
-#define _MM_FROUND_TO_ZERO 0x03
-#define _MM_FROUND_CUR_DIRECTION 0x04
-#define _MM_FROUND_NO_EXC 0x08
-#define _MM_FROUND_RAISE_EXC 0x00
-#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
-#define _MM_ROUND_NEAREST 0x0000
-#define _MM_ROUND_DOWN 0x2000
-#define _MM_ROUND_UP 0x4000
-#define _MM_ROUND_TOWARD_ZERO 0x6000
-/* Flush zero mode macros. */
-#define _MM_FLUSH_ZERO_MASK 0x8000
-#define _MM_FLUSH_ZERO_ON 0x8000
-#define _MM_FLUSH_ZERO_OFF 0x0000
-/* Denormals are zeros mode macros. */
-#define _MM_DENORMALS_ZERO_MASK 0x0040
-#define _MM_DENORMALS_ZERO_ON 0x0040
-#define _MM_DENORMALS_ZERO_OFF 0x0000
-
-/* indicate immediate constant argument in a given range */
-#define __constrange(a, b) const
-
-/* A few intrinsics accept traditional data types like ints or floats, but
- * most operate on data types that are specific to SSE.
- * If a vector type ends in d, it contains doubles, and if it does not have
- * a suffix, it contains floats. An integer vector type can contain any type
- * of integer, from chars to shorts to unsigned long longs.
- */
-typedef int64x1_t __m64;
-typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
-// On ARM 32-bit architecture, the float64x2_t is not supported.
-// The data type __m128d should be represented in a different way for related
-// intrinsic conversion.
-#if defined(__aarch64__)
-typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
-#else
-typedef float32x4_t __m128d;
-#endif
-typedef int64x2_t __m128i; /* 128-bit vector containing integers */
-
-// __int64 is defined in the Intrinsics Guide which maps to different datatype
-// in different data model
-#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
-#if (defined(__x86_64__) || defined(__i386__))
-#define __int64 long long
-#else
-#define __int64 int64_t
-#endif
-#endif
-
-/* type-safe casting between types */
-
-#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
-#define vreinterpretq_m128_f32(x) (x)
-#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
-
-#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
-#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
-#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
-#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
-
-#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
-#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
-#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
-
-#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
-#define vreinterpretq_f32_m128(x) (x)
-#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
-
-#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
-#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
-#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
-#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
-
-#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
-#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
-#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
-#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
-
-#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
-#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
-#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
-#define vreinterpretq_m128i_s64(x) (x)
-
-#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
-#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
-#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
-#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
-
-#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
-#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
-
-#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
-#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
-#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
-#define vreinterpretq_s64_m128i(x) (x)
-
-#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
-#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
-#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
-#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
-
-#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
-#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
-#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
-#define vreinterpret_m64_s64(x) (x)
-
-#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
-#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
-#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
-#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
-
-#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
-#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
-#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
-
-#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
-#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
-#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
-#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
-
-#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
-#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
-#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
-#define vreinterpret_s64_m64(x) (x)
-
-#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
-
-#if defined(__aarch64__)
-#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
-#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
-
-#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
-
-#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
-#define vreinterpretq_m128d_f64(x) (x)
-
-#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
-
-#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
-#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
-
-#define vreinterpretq_f64_m128d(x) (x)
-#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
-#else
-#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
-
-#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
-#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
-
-#define vreinterpretq_m128d_f32(x) (x)
-
-#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
-
-#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
-#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
-
-#define vreinterpretq_f32_m128d(x) (x)
-#endif
-
-// A struct is defined in this header file called 'SIMDVec' which can be used
-// by applications which attempt to access the contents of an __m128 struct
-// directly.  It is important to note that accessing the __m128 struct directly
-// is bad coding practice by Microsoft: @see:
-// https://learn.microsoft.com/en-us/cpp/cpp/m128
-//
-// However, some legacy source code may try to access the contents of an __m128
-// struct directly so the developer can use the SIMDVec as an alias for it.  Any
-// casting must be done manually by the developer, as you cannot cast or
-// otherwise alias the base NEON data type for intrinsic operations.
-//
-// union intended to allow direct access to an __m128 variable using the names
-// that the MSVC compiler provides.  This union should really only be used when
-// trying to access the members of the vector as integer values.  GCC/clang
-// allow native access to the float members through a simple array access
-// operator (in C since 4.6, in C++ since 4.8).
-//
-// Ideally direct accesses to SIMD vectors should not be used since it can cause
-// a performance hit.  If it really is needed however, the original __m128
-// variable can be aliased with a pointer to this union and used to access
-// individual components.  The use of this union should be hidden behind a macro
-// that is used throughout the codebase to access the members instead of always
-// declaring this type of variable.
-typedef union ALIGN_STRUCT(16) SIMDVec {
-    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
-    int8_t m128_i8[16];    // as signed 8-bit integers.
-    int16_t m128_i16[8];   // as signed 16-bit integers.
-    int32_t m128_i32[4];   // as signed 32-bit integers.
-    int64_t m128_i64[2];   // as signed 64-bit integers.
-    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
-    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
-    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
-    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
-} SIMDVec;
-
-// casting using SIMDVec
-#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
-#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
-#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
-
-/* SSE macros */
-#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
-#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
-#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
-#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
-
-// Function declaration
-// SSE
-FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
-FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
-FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
-FORCE_INLINE __m128 _mm_set_ps1(float);
-FORCE_INLINE __m128 _mm_setzero_ps(void);
-// SSE2
-FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
-FORCE_INLINE __m128i _mm_castps_si128(__m128);
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
-FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
-FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
-FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
-FORCE_INLINE __m128d _mm_set_pd(double, double);
-FORCE_INLINE __m128i _mm_set1_epi32(int);
-FORCE_INLINE __m128i _mm_setzero_si128();
-// SSE4.1
-FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
-FORCE_INLINE __m128 _mm_ceil_ps(__m128);
-FORCE_INLINE __m128d _mm_floor_pd(__m128d);
-FORCE_INLINE __m128 _mm_floor_ps(__m128);
-FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
-FORCE_INLINE __m128 _mm_round_ps(__m128, int);
-// SSE4.2
-FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
-
-/* Backwards compatibility for compilers with lack of specific type support */
-
-// Older gcc does not define vld1q_u8_x4 type
-#if defined(__GNUC__) && !defined(__clang__) &&                        \
-    ((__GNUC__ <= 12 && defined(__arm__)) ||                           \
-     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
-     (__GNUC__ <= 9 && defined(__aarch64__)))
-FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
-{
-    uint8x16x4_t ret;
-    ret.val[0] = vld1q_u8(p + 0);
-    ret.val[1] = vld1q_u8(p + 16);
-    ret.val[2] = vld1q_u8(p + 32);
-    ret.val[3] = vld1q_u8(p + 48);
-    return ret;
-}
-#else
-// Wraps vld1q_u8_x4
-FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
-{
-    return vld1q_u8_x4(p);
-}
-#endif
-
-#if !defined(__aarch64__)
-/* emulate vaddv u8 variant */
-FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
-{
-    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
-    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
-}
-#else
-// Wraps vaddv_u8
-FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
-{
-    return vaddv_u8(v8);
-}
-#endif
-
-#if !defined(__aarch64__)
-/* emulate vaddvq u8 variant */
-FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
-{
-    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
-    uint8_t res = 0;
-    for (int i = 0; i < 8; ++i)
-        res += tmp[i];
-    return res;
-}
-#else
-// Wraps vaddvq_u8
-FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
-{
-    return vaddvq_u8(a);
-}
-#endif
-
-#if !defined(__aarch64__)
-/* emulate vaddvq u16 variant */
-FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
-{
-    uint32x4_t m = vpaddlq_u16(a);
-    uint64x2_t n = vpaddlq_u32(m);
-    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
-
-    return vget_lane_u32((uint32x2_t) o, 0);
-}
-#else
-// Wraps vaddvq_u16
-FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
-{
-    return vaddvq_u16(a);
-}
-#endif
-
-/* Function Naming Conventions
- * The naming convention of SSE intrinsics is straightforward. A generic SSE
- * intrinsic function is given as follows:
- *   _mm_<name>_<data_type>
- *
- * The parts of this format are given as follows:
- * 1. <name> describes the operation performed by the intrinsic
- * 2. <data_type> identifies the data type of the function's primary arguments
- *
- * This last part, <data_type>, is a little complicated. It identifies the
- * content of the input values, and can be set to any of the following values:
- * + ps - vectors contain floats (ps stands for packed single-precision)
- * + pd - vectors cantain doubles (pd stands for packed double-precision)
- * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- *                            signed integers
- * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- *                            unsigned integers
- * + si128 - unspecified 128-bit vector or 256-bit vector
- * + m128/m128i/m128d - identifies input vector types when they are different
- *                      than the type of the returned vector
- *
- * For example, _mm_setzero_ps. The _mm implies that the function returns
- * a 128-bit vector. The _ps at the end implies that the argument vectors
- * contain floats.
- *
- * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
- *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
- *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
- *   // Set packed 8-bit integers
- *   // 128 bits, 16 chars, per 8 bits
- *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
- *                                  4, 5, 12, 13, 6, 7, 14, 15);
- *   // Shuffle packed 8-bit integers
- *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
- */
-
-/* Constants for use with _mm_prefetch. */
-enum _mm_hint {
-    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
-    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
-    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
-    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
-};
-
-// The bit field mapping to the FPCR(floating-point control register)
-typedef struct {
-    uint16_t res0;
-    uint8_t res1 : 6;
-    uint8_t bit22 : 1;
-    uint8_t bit23 : 1;
-    uint8_t bit24 : 1;
-    uint8_t res2 : 7;
-#if defined(__aarch64__)
-    uint32_t res3;
-#endif
-} fpcr_bitfield;
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of b and places it into the high end of the result.
-FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
-{
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in high
-// end of result takes the higher two 32 bit values from b and swaps them and
-// places in low end of result.
-FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
-{
-    float32x2_t a21 = vget_high_f32(
-        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b03 = vget_low_f32(
-        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
-{
-    float32x2_t a03 = vget_low_f32(
-        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b21 = vget_high_f32(
-        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
-}
-
-// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
-// high
-FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
-{
-    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
-{
-    float32x2_t a22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
-{
-    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
-{
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32x2_t a22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
-{
-    float32x2_t a33 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
-{
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
-}
-
-// Kahan summation for accurate summation of floating-point numbers.
-// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
-FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
-{
-    y -= *c;
-    float t = *sum + y;
-    *c = (t - *sum) - y;
-    *sum = t;
-}
-
-#if defined(__ARM_FEATURE_CRYPTO) && \
-    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
-// Wraps vmull_p64
-FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
-    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
-    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
-    return vreinterpretq_u64_p128(vmull_p64(a, b));
-}
-#else  // ARMv7 polyfill
-// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
-//
-// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
-// 64-bit->128-bit polynomial multiply.
-//
-// It needs some work and is somewhat slow, but it is still faster than all
-// known scalar methods.
-//
-// Algorithm adapted to C from
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
-// from "Fast Software Polynomial Multiplication on ARM Processors Using the
-// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
-// (https://hal.inria.fr/hal-01506572)
-static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
-    poly8x8_t a = vreinterpret_p8_u64(_a);
-    poly8x8_t b = vreinterpret_p8_u64(_b);
-
-    // Masks
-    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
-                                    vcreate_u8(0x00000000ffffffff));
-    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
-                                    vcreate_u8(0x0000000000000000));
-
-    // Do the multiplies, rotating with vext to get all combinations
-    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
-    uint8x16_t e =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
-    uint8x16_t f =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
-    uint8x16_t g =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
-    uint8x16_t h =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
-    uint8x16_t i =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
-    uint8x16_t j =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
-    uint8x16_t k =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
-
-    // Add cross products
-    uint8x16_t l = veorq_u8(e, f);  // L = E + F
-    uint8x16_t m = veorq_u8(g, h);  // M = G + H
-    uint8x16_t n = veorq_u8(i, j);  // N = I + J
-
-    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
-    // instructions.
-#if defined(__aarch64__)
-    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
-        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
-        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
-        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
-        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-#else
-    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
-    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
-    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
-    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
-#endif
-    // t0 = (L) (P0 + P1) << 8
-    // t1 = (M) (P2 + P3) << 16
-    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
-    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
-    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
-
-    // t2 = (N) (P4 + P5) << 24
-    // t3 = (K) (P6 + P7) << 32
-    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
-    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
-    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
-
-    // De-interleave
-#if defined(__aarch64__)
-    uint8x16_t t0 = vreinterpretq_u8_u64(
-        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-    uint8x16_t t1 = vreinterpretq_u8_u64(
-        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-    uint8x16_t t2 = vreinterpretq_u8_u64(
-        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-    uint8x16_t t3 = vreinterpretq_u8_u64(
-        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-#else
-    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
-    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
-    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
-    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
-#endif
-    // Shift the cross products
-    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
-    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
-    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
-    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
-
-    // Accumulate the products
-    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
-    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
-    uint8x16_t mix = veorq_u8(d, cross1);
-    uint8x16_t r = veorq_u8(mix, cross2);
-    return vreinterpretq_u64_u8(r);
-}
-#endif  // ARMv7 polyfill
-
-// C equivalent:
-//   __m128i _mm_shuffle_epi32_default(__m128i a,
-//                                     __constrange(0, 255) int imm) {
-//       __m128i ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
-//       return ret;
-//   }
-#define _mm_shuffle_epi32_default(a, imm)                                   \
-    __extension__({                                                         \
-        int32x4_t ret;                                                      \
-        ret = vmovq_n_s32(                                                  \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
-            ret, 1);                                                        \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
-            ret, 2);                                                        \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
-            ret, 3);                                                        \
-        vreinterpretq_m128i_s32(ret);                                       \
-    })
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of a and places it into the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
-{
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in low end
-// of result takes the higher two 32 bit values from a and swaps them and places
-// in high end of result.
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
-}
-
-// rotates the least significant 32 bits into the most significant 32 bits, and
-// shifts the rest down
-FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
-{
-    return vreinterpretq_m128i_s32(
-        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
-}
-
-// rotates the most significant 32 bits into the least significant 32 bits, and
-// shifts the rest up
-FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
-{
-    return vreinterpretq_m128i_s32(
-        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
-}
-
-// gets the lower 64 bits of a, and places it in the upper 64 bits
-// gets the lower 64 bits of a and places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
-{
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
-// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
-// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
-// places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
-{
-    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
-{
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
-{
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
-}
-
-// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
-// int imm)
-#if defined(__aarch64__)
-#define _mm_shuffle_epi32_splat(a, imm)                          \
-    __extension__({                                              \
-        vreinterpretq_m128i_s32(                                 \
-            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
-    })
-#else
-#define _mm_shuffle_epi32_splat(a, imm)                                      \
-    __extension__({                                                          \
-        vreinterpretq_m128i_s32(                                             \
-            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
-    })
-#endif
-
-// NEON does not support a general purpose permute intrinsic.
-// Shuffle single-precision (32-bit) floating-point elements in a using the
-// control in imm8, and store the results in dst.
-//
-// C equivalent:
-//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
-//                                 __constrange(0, 255) int imm) {
-//       __m128 ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
-//       return ret;
-//   }
-//
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
-#define _mm_shuffle_ps_default(a, b, imm)                                  \
-    __extension__({                                                        \
-        float32x4_t ret;                                                   \
-        ret = vmovq_n_f32(                                                 \
-            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
-            ret, 1);                                                       \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
-            ret, 2);                                                       \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
-            ret, 3);                                                       \
-        vreinterpretq_m128_f32(ret);                                       \
-    })
-
-// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
-// Store the results in the low 64 bits of dst, with the high 64 bits being
-// copied from from a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
-#define _mm_shufflelo_epi16_function(a, imm)                                  \
-    __extension__({                                                           \
-        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
-        int16x4_t lowBits = vget_low_s16(ret);                                \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
-                             1);                                              \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
-                             2);                                              \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
-                             3);                                              \
-        vreinterpretq_m128i_s16(ret);                                         \
-    })
-
-// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
-// Store the results in the high 64 bits of dst, with the low 64 bits being
-// copied from from a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
-#define _mm_shufflehi_epi16_function(a, imm)                                   \
-    __extension__({                                                            \
-        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
-        int16x4_t highBits = vget_high_s16(ret);                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
-                             5);                                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
-                             6);                                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
-                             7);                                               \
-        vreinterpretq_m128i_s16(ret);                                          \
-    })
-
-/* MMX */
-
-//_mm_empty is a no-op on arm
-FORCE_INLINE void _mm_empty(void) {}
-
-/* SSE */
-
-// Add packed single-precision (32-bit) floating-point elements in a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
-FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Add the lower single-precision (32-bit) floating-point element in a and b,
-// store the result in the lower element of dst, and copy the upper 3 packed
-// elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
-FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
-{
-    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
-    // the upper values in the result must be the remnants of <a>.
-    return vreinterpretq_m128_f32(vaddq_f32(a, value));
-}
-
-// Compute the bitwise AND of packed single-precision (32-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
-FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
-// elements in a and then AND with b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
-FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        vbicq_s32(vreinterpretq_s32_m128(b),
-                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
-}
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
-FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u16(
-        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
-FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u8(
-        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for equality, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
-FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for equality, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
-FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
-FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for greater-than-or-equal, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
-FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
-FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for greater-than, store the result in the lower element of dst, and copy
-// the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
-FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
-FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for less-than-or-equal, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
-FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmple_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
-FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for less-than, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
-FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
-FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-equal, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
-FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
-FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-greater-than-or-equal, store the result in the lower element of
-// dst, and copy the upper 3 packed elements from a to the upper elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
-FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
-FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-greater-than, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
-FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
-FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-less-than-or-equal, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
-FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
-FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-less-than, store the result in the lower element of dst, and copy
-// the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
-FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// to see if neither is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
-//
-// See also:
-// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
-// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
-FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
-{
-    // Note: NEON does not have ordered compare builtin
-    // Need to compare a eq a and b eq b to check for NaN
-    // Do AND of results to get final
-    uint32x4_t ceqaa =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t ceqbb =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b to see if neither is NaN, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
-FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// to see if either is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
-FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
-{
-    uint32x4_t f32a =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t f32b =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b to see if either is NaN, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
-FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for equality, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
-FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_eq_b =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_eq_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for greater-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
-FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_ge_b =
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_ge_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for greater-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
-FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_gt_b =
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_gt_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for less-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
-FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_le_b =
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_le_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for less-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
-FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_lt_b =
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_lt_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for not-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
-FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
-{
-    return !_mm_comieq_ss(a, b);
-}
-
-// Convert packed signed 32-bit integers in b to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, and copy the upper 2 packed elements from a to the upper elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
-FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
-                     vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
-FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vreinterpret_m64_s32(
-        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
-#else
-    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
-        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
-#endif
-}
-
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
-FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
-{
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
-FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
-                          0);
-#else
-    float32_t data = vgetq_lane_f32(
-        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
-    return (int32_t) data;
-#endif
-}
-
-// Convert packed 16-bit integers in a to packed single-precision (32-bit)
-// floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
-FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(
-        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
-}
-
-// Convert packed 32-bit integers in b to packed single-precision (32-bit)
-// floating-point elements, store the results in the lower 2 elements of dst,
-// and copy the upper 2 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
-FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
-                     vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert packed signed 32-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, then convert the packed signed 32-bit integers in b to
-// single-precision (32-bit) floating-point element, and store the results in
-// the upper 2 elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
-FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(
-        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
-}
-
-// Convert the lower packed 8-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
-FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(
-        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 16-bit integers, and store the results in dst. Note: this intrinsic
-// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
-// 0x7FFFFFFF.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
-FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
-{
-    return vreinterpret_m64_s16(
-        vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
-#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 8-bit integers, and store the results in lower 4 elements of dst.
-// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
-// between 0x7F and 0x7FFFFFFF.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
-FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
-{
-    return vreinterpret_m64_s8(vqmovn_s16(
-        vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
-}
-
-// Convert packed unsigned 16-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
-FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(
-        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
-}
-
-// Convert the lower packed unsigned 8-bit integers in a to packed
-// single-precision (32-bit) floating-point elements, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
-FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_u32(
-        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
-}
-
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
-#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
-
-// Convert the signed 64-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
-FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
-{
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
-}
-
-// Copy the lower single-precision (32-bit) floating-point element of a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
-FORCE_INLINE float _mm_cvtss_f32(__m128 a)
-{
-    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
-#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
-FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
-#else
-    float32_t data = vgetq_lane_f32(
-        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
-    return (int64_t) data;
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
-FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
-{
-    return vreinterpret_m64_s32(
-        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
-FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
-{
-    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
-#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
-#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
-FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
-{
-    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-}
-
-// Divide packed single-precision (32-bit) floating-point elements in a by
-// packed elements in b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
-FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
-    return vreinterpretq_m128_f32(
-        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
-#if SSE2NEON_PRECISE_DIV
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
-#endif
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
-#endif
-}
-
-// Divide the lower single-precision (32-bit) floating-point element in a by the
-// lower single-precision (32-bit) floating-point element in b, store the result
-// in the lower element of dst, and copy the upper 3 packed elements from a to
-// the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
-FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-{
-    float32_t value =
-        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Extract a 16-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
-#define _mm_extract_pi16(a, imm) \
-    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
-
-// Free aligned memory that was allocated with _mm_malloc.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
-#if !defined(SSE2NEON_ALLOC_DEFINED)
-FORCE_INLINE void _mm_free(void *addr)
-{
-    free(addr);
-}
-#endif
-
-// Macro: Get the flush zero bits from the MXCSR control and status register.
-// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
-// _MM_FLUSH_ZERO_OFF
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
-FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
-}
-
-// Macro: Get the rounding mode bits from the MXCSR control and status register.
-// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
-// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
-FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    if (r.field.bit22) {
-        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
-    } else {
-        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
-    }
-}
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
-#define _mm_insert_pi16(a, b, imm)                               \
-    __extension__({                                              \
-        vreinterpret_m64_s16(                                    \
-            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
-    })
-
-// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
-FORCE_INLINE __m128 _mm_load_ps(const float *p)
-{
-    return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Load a single-precision (32-bit) floating-point element from memory into all
-// elements of dst.
-//
-//   dst[31:0] := MEM[mem_addr+31:mem_addr]
-//   dst[63:32] := MEM[mem_addr+31:mem_addr]
-//   dst[95:64] := MEM[mem_addr+31:mem_addr]
-//   dst[127:96] := MEM[mem_addr+31:mem_addr]
-//
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
-#define _mm_load_ps1 _mm_load1_ps
-
-// Load a single-precision (32-bit) floating-point element from memory into the
-// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
-// aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
-FORCE_INLINE __m128 _mm_load_ss(const float *p)
-{
-    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
-}
-
-// Load a single-precision (32-bit) floating-point element from memory into all
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
-FORCE_INLINE __m128 _mm_load1_ps(const float *p)
-{
-    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
-}
-
-// Load 2 single-precision (32-bit) floating-point elements from memory into the
-// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
-// mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
-FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
-}
-
-// Load 2 single-precision (32-bit) floating-point elements from memory into the
-// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
-// mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
-FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
-}
-
-// Load 4 single-precision (32-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
-FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
-{
-    float32x4_t v = vrev64q_f32(vld1q_f32(p));
-    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
-}
-
-// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from memory into dst. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
-FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
-{
-    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
-    // equivalent for neon
-    return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Load unaligned 16-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
-FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
-{
-    return vreinterpretq_m128i_s16(
-        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
-}
-
-// Load unaligned 64-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
-FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
-{
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
-}
-
-// Allocate size bytes of memory, aligned to the alignment specified in align,
-// and return a pointer to the allocated memory. _mm_free should be used to free
-// memory that is allocated with _mm_malloc.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
-#if !defined(SSE2NEON_ALLOC_DEFINED)
-FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
-{
-    void *ptr;
-    if (align == 1)
-        return malloc(size);
-    if (align == 2 || (sizeof(void *) == 8 && align == 4))
-        align = sizeof(void *);
-    if (!posix_memalign(&ptr, align, size))
-        return ptr;
-    return NULL;
-}
-#endif
-
-// Conditionally store 8-bit integer elements from a into memory using mask
-// (elements are not stored when the highest bit is not set in the corresponding
-// element) and a non-temporal memory hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
-FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
-{
-    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
-    __m128 b = _mm_load_ps((const float *) mem_addr);
-    int8x8_t masked =
-        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
-                vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
-    vst1_s8((int8_t *) mem_addr, masked);
-}
-
-// Conditionally store 8-bit integer elements from a into memory using mask
-// (elements are not stored when the highest bit is not set in the corresponding
-// element) and a non-temporal memory hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
-#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
-FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s16(
-        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b,
-// and store packed maximum values in dst. dst does not follow the IEEE Standard
-// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
-// signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
-FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
-{
-#if SSE2NEON_PRECISE_MINMAX
-    float32x4_t _a = vreinterpretq_f32_m128(a);
-    float32x4_t _b = vreinterpretq_f32_m128(b);
-    return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128_f32(
-        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
-FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u8(
-        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b, store the maximum value in the lower element of dst, and copy the upper 3
-// packed elements from a to the upper element of dst. dst does not follow the
-// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
-// inputs are NaN or signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
-FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
-FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s16(
-        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b,
-// and store packed minimum values in dst. dst does not follow the IEEE Standard
-// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
-// signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
-FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
-{
-#if SSE2NEON_PRECISE_MINMAX
-    float32x4_t _a = vreinterpretq_f32_m128(a);
-    float32x4_t _b = vreinterpretq_f32_m128(b);
-    return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128_f32(
-        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
-FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u8(
-        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b, store the minimum value in the lower element of dst, and copy the upper 3
-// packed elements from a to the upper element of dst. dst does not follow the
-// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
-// inputs are NaN or signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
-FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Move the lower single-precision (32-bit) floating-point element from b to the
-// lower element of dst, and copy the upper 3 packed elements from a to the
-// upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
-FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
-                       vreinterpretq_f32_m128(a), 0));
-}
-
-// Move the upper 2 single-precision (32-bit) floating-point elements from b to
-// the lower 2 elements of dst, and copy the upper 2 elements from a to the
-// upper 2 elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
-FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
-{
-#if defined(aarch64__)
-    return vreinterpretq_m128_u64(
-        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
-#else
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
-#endif
-}
-
-// Move the lower 2 single-precision (32-bit) floating-point elements from b to
-// the upper 2 elements of dst, and copy the lower 2 elements from a to the
-// lower 2 elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
-FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-// Create mask from the most significant bit of each 8-bit element in a, and
-// store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
-FORCE_INLINE int _mm_movemask_pi8(__m64 a)
-{
-    uint8x8_t input = vreinterpret_u8_m64(a);
-#if defined(__aarch64__)
-    static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-    uint8x8_t tmp = vshr_n_u8(input, 7);
-    return vaddv_u8(vshl_u8(tmp, shift));
-#else
-    // Refer the implementation of `_mm_movemask_epi8`
-    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
-    uint32x2_t paired16 =
-        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
-    uint8x8_t paired32 =
-        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
-    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
-#endif
-}
-
-// Set each bit of mask dst based on the most significant bit of the
-// corresponding packed single-precision (32-bit) floating-point element in a.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
-FORCE_INLINE int _mm_movemask_ps(__m128 a)
-{
-    uint32x4_t input = vreinterpretq_u32_m128(a);
-#if defined(__aarch64__)
-    static const int32x4_t shift = {0, 1, 2, 3};
-    uint32x4_t tmp = vshrq_n_u32(input, 31);
-    return vaddvq_u32(vshlq_u32(tmp, shift));
-#else
-    // Uses the exact same method as _mm_movemask_epi8, see that for details.
-    // Shift out everything but the sign bits with a 32-bit unsigned shift
-    // right.
-    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
-    // Merge the two pairs together with a 64-bit unsigned shift right + add.
-    uint8x16_t paired =
-        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
-    // Extract the result.
-    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
-#endif
-}
-
-// Multiply packed single-precision (32-bit) floating-point elements in a and b,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Multiply the lower single-precision (32-bit) floating-point element in a and
-// b, store the result in the lower element of dst, and copy the upper 3 packed
-// elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
-FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_mul_ps(a, b));
-}
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
-FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u16(vshrn_n_u32(
-        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
-}
-
-// Compute the bitwise OR of packed single-precision (32-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
-FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
-#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
-#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
-
-// Extract a 16-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
-#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
-#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
-#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
-#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
-#define _m_pminsw(a, b) _mm_min_pi16(a, b)
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
-#define _m_pminub(a, b) _mm_min_pu8(a, b)
-
-// Create mask from the most significant bit of each 8-bit element in a, and
-// store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
-#define _m_pmovmskb(a) _mm_movemask_pi8(a)
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
-#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
-
-// Fetch the line of data from memory that contains address p to a location in
-// the cache heirarchy specified by the locality hint i.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
-FORCE_INLINE void _mm_prefetch(char const *p, int i)
-{
-    switch (i) {
-    case _MM_HINT_NTA:
-        __builtin_prefetch(p, 0, 0);
-        break;
-    case _MM_HINT_T0:
-        __builtin_prefetch(p, 0, 3);
-        break;
-    case _MM_HINT_T1:
-        __builtin_prefetch(p, 0, 2);
-        break;
-    case _MM_HINT_T2:
-        __builtin_prefetch(p, 0, 1);
-        break;
-    }
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
-#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
-
-// Shuffle 16-bit integers in a using the control in imm8, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
-#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
-
-// Compute the approximate reciprocal of packed single-precision (32-bit)
-// floating-point elements in a, and store the results in dst. The maximum
-// relative error for this approximation is less than 1.5*2^-12.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
-FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
-{
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-#if SSE2NEON_PRECISE_DIV
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-#endif
-    return vreinterpretq_m128_f32(recip);
-}
-
-// Compute the approximate reciprocal of the lower single-precision (32-bit)
-// floating-point element in a, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst. The
-// maximum relative error for this approximation is less than 1.5*2^-12.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
-FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
-{
-    return _mm_move_ss(a, _mm_rcp_ps(a));
-}
-
-// Compute the approximate reciprocal square root of packed single-precision
-// (32-bit) floating-point elements in a, and store the results in dst. The
-// maximum relative error for this approximation is less than 1.5*2^-12.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
-FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
-{
-    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-#if SSE2NEON_PRECISE_SQRT
-    // Additional Netwon-Raphson iteration for accuracy
-    out = vmulq_f32(
-        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
-    out = vmulq_f32(
-        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
-#endif
-    return vreinterpretq_m128_f32(out);
-}
-
-// Compute the approximate reciprocal square root of the lower single-precision
-// (32-bit) floating-point element in a, store the result in the lower element
-// of dst, and copy the upper 3 packed elements from a to the upper elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
-FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
-{
-    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
-FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
-{
-    uint64x1_t t = vpaddl_u32(vpaddl_u16(
-        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
-    return vreinterpret_m64_u16(
-        vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
-}
-
-// Macro: Set the flush zero bits of the MXCSR control and status register to
-// the value in unsigned 32-bit integer a. The flush zero may contain any of the
-// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
-FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
-{
-    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
-    // regardless of the value of the FZ bit.
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
-}
-
-// Set packed single-precision (32-bit) floating-point elements in dst with the
-// supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
-FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
-{
-    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Broadcast single-precision (32-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
-FORCE_INLINE __m128 _mm_set_ps1(float _w)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Macro: Set the rounding mode bits of the MXCSR control and status register to
-// the value in unsigned 32-bit integer a. The rounding mode may contain any of
-// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
-// _MM_ROUND_TOWARD_ZERO
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
-FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    switch (rounding) {
-    case _MM_ROUND_TOWARD_ZERO:
-        r.field.bit22 = 1;
-        r.field.bit23 = 1;
-        break;
-    case _MM_ROUND_DOWN:
-        r.field.bit22 = 0;
-        r.field.bit23 = 1;
-        break;
-    case _MM_ROUND_UP:
-        r.field.bit22 = 1;
-        r.field.bit23 = 0;
-        break;
-    default:  //_MM_ROUND_NEAREST
-        r.field.bit22 = 0;
-        r.field.bit23 = 0;
-    }
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
-}
-
-// Copy single-precision (32-bit) floating-point element a to the lower element
-// of dst, and zero the upper 3 elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
-FORCE_INLINE __m128 _mm_set_ss(float a)
-{
-    return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
-}
-
-// Broadcast single-precision (32-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
-FORCE_INLINE __m128 _mm_set1_ps(float _w)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Set the MXCSR control and status register with the value in unsigned 32-bit
-// integer a.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
-// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
-FORCE_INLINE void _mm_setcsr(unsigned int a)
-{
-    _MM_SET_ROUNDING_MODE(a);
-}
-
-// Get the unsigned 32-bit value of the MXCSR control and status register.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
-// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
-FORCE_INLINE unsigned int _mm_getcsr()
-{
-    return _MM_GET_ROUNDING_MODE();
-}
-
-// Set packed single-precision (32-bit) floating-point elements in dst with the
-// supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
-FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
-{
-    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Return vector of type __m128 with all elements set to zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
-FORCE_INLINE __m128 _mm_setzero_ps(void)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(0));
-}
-
-// Shuffle 16-bit integers in a using the control in imm8, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_pi16(a, imm)                                           \
-    __extension__({                                                        \
-        vreinterpret_m64_s16(vshuffle_s16(                                 \
-            vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
-            ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)));  \
-    })
-#else
-#define _mm_shuffle_pi16(a, imm)                                               \
-    __extension__({                                                            \
-        int16x4_t ret;                                                         \
-        ret =                                                                  \
-            vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
-        ret = vset_lane_s16(                                                   \
-            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret,   \
-            1);                                                                \
-        ret = vset_lane_s16(                                                   \
-            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret,   \
-            2);                                                                \
-        ret = vset_lane_s16(                                                   \
-            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret,   \
-            3);                                                                \
-        vreinterpret_m64_s16(ret);                                             \
-    })
-#endif
-
-// Perform a serializing operation on all store-to-memory instructions that were
-// issued prior to this instruction. Guarantees that every store instruction
-// that precedes, in program order, is globally visible before any store
-// instruction which follows the fence in program order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
-FORCE_INLINE void _mm_sfence(void)
-{
-    _sse2neon_smp_mb();
-}
-
-// Perform a serializing operation on all load-from-memory and store-to-memory
-// instructions that were issued prior to this instruction. Guarantees that
-// every memory access that precedes, in program order, the memory fence
-// instruction is globally visible before any memory instruction which follows
-// the fence in program order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
-FORCE_INLINE void _mm_mfence(void)
-{
-    _sse2neon_smp_mb();
-}
-
-// Perform a serializing operation on all load-from-memory instructions that
-// were issued prior to this instruction. Guarantees that every load instruction
-// that precedes, in program order, is globally visible before any load
-// instruction which follows the fence in program order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
-FORCE_INLINE void _mm_lfence(void)
-{
-    _sse2neon_smp_mb();
-}
-
-// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
-// int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_ps(a, b, imm)                                              \
-    __extension__({                                                            \
-        float32x4_t _input1 = vreinterpretq_f32_m128(a);                       \
-        float32x4_t _input2 = vreinterpretq_f32_m128(b);                       \
-        float32x4_t _shuf =                                                    \
-            vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
-                          (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
-        vreinterpretq_m128_f32(_shuf);                                         \
-    })
-#else  // generic
-#define _mm_shuffle_ps(a, b, imm)                          \
-    __extension__({                                        \
-        __m128 ret;                                        \
-        switch (imm) {                                     \
-        case _MM_SHUFFLE(1, 0, 3, 2):                      \
-            ret = _mm_shuffle_ps_1032((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 3, 0, 1):                      \
-            ret = _mm_shuffle_ps_2301((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 3, 2, 1):                      \
-            ret = _mm_shuffle_ps_0321((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 1, 0, 3):                      \
-            ret = _mm_shuffle_ps_2103((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(1, 0, 1, 0):                      \
-            ret = _mm_movelh_ps((a), (b));                 \
-            break;                                         \
-        case _MM_SHUFFLE(1, 0, 0, 1):                      \
-            ret = _mm_shuffle_ps_1001((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 1, 0, 1):                      \
-            ret = _mm_shuffle_ps_0101((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(3, 2, 1, 0):                      \
-            ret = _mm_shuffle_ps_3210((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 0, 1, 1):                      \
-            ret = _mm_shuffle_ps_0011((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 0, 2, 2):                      \
-            ret = _mm_shuffle_ps_0022((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 2, 0, 0):                      \
-            ret = _mm_shuffle_ps_2200((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(3, 2, 0, 2):                      \
-            ret = _mm_shuffle_ps_3202((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(3, 2, 3, 2):                      \
-            ret = _mm_movehl_ps((b), (a));                 \
-            break;                                         \
-        case _MM_SHUFFLE(1, 1, 3, 3):                      \
-            ret = _mm_shuffle_ps_1133((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 0, 1, 0):                      \
-            ret = _mm_shuffle_ps_2010((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 0, 0, 1):                      \
-            ret = _mm_shuffle_ps_2001((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 0, 3, 2):                      \
-            ret = _mm_shuffle_ps_2032((a), (b));           \
-            break;                                         \
-        default:                                           \
-            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
-            break;                                         \
-        }                                                  \
-        ret;                                               \
-    })
-#endif
-
-// Compute the square root of packed single-precision (32-bit) floating-point
-// elements in a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
-FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
-{
-#if SSE2NEON_PRECISE_SQRT
-    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-
-    // Test for vrsqrteq_f32(0) -> positive infinity case.
-    // Change to zero, so that s * 1/sqrt(s) result is zero too.
-    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
-    const uint32x4_t div_by_zero =
-        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
-    recip = vreinterpretq_f32_u32(
-        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
-
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-        recip);
-    recip = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-        recip);
-
-    // sqrt(s) = s * 1/sqrt(s)
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
-#elif defined(__aarch64__)
-    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
-#else
-    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-    float32x4_t sq = vrecpeq_f32(recipsq);
-    return vreinterpretq_m128_f32(sq);
-#endif
-}
-
-// Compute the square root of the lower single-precision (32-bit) floating-point
-// element in a, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
-FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
-{
-    float32_t value =
-        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-// or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
-FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
-{
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Store the lower single-precision (32-bit) floating-point element from a into
-// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
-FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
-{
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    vst1q_f32(p, vdupq_n_f32(a0));
-}
-
-// Store the lower single-precision (32-bit) floating-point element from a into
-// memory. mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
-FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
-{
-    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
-}
-
-// Store the lower single-precision (32-bit) floating-point element from a into
-// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
-#define _mm_store1_ps _mm_store_ps1
-
-// Store the upper 2 single-precision (32-bit) floating-point elements from a
-// into memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
-FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
-{
-    *p = vreinterpret_m64_f32(vget_high_f32(a));
-}
-
-// Store the lower 2 single-precision (32-bit) floating-point elements from a
-// into memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
-FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
-{
-    *p = vreinterpret_m64_f32(vget_low_f32(a));
-}
-
-// Store 4 single-precision (32-bit) floating-point elements from a into memory
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
-FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
-{
-    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
-    float32x4_t rev = vextq_f32(tmp, tmp, 2);
-    vst1q_f32(p, rev);
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from a into memory. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
-FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
-{
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores 16-bits of integer data a at the address p.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
-FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
-{
-    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
-}
-
-// Stores 64-bits of integer data a at the address p.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
-FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
-{
-    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
-}
-
-// Store 64-bits of integer data from a into memory using a non-temporal memory
-// hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
-FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
-{
-    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
-// point elements) from a into memory using a non-temporal memory hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
-FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, (float32x4_t *) p);
-#else
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-#endif
-}
-
-// Subtract packed single-precision (32-bit) floating-point elements in b from
-// packed single-precision (32-bit) floating-point elements in a, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
-FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Subtract the lower single-precision (32-bit) floating-point element in b from
-// the lower single-precision (32-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper 3 packed elements from
-// a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
-FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_sub_ps(a, b));
-}
-
-// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
-// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
-// transposed matrix in these vectors (row0 now contains column 0, etc.).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
-#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
-    do {                                                  \
-        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
-        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
-        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
-                            vget_low_f32(ROW23.val[0]));  \
-        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
-                            vget_low_f32(ROW23.val[1]));  \
-        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
-                            vget_high_f32(ROW23.val[0])); \
-        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
-                            vget_high_f32(ROW23.val[1])); \
-    } while (0)
-
-// according to the documentation, these intrinsics behave the same as the
-// non-'u' versions.  We'll just alias them here.
-#define _mm_ucomieq_ss _mm_comieq_ss
-#define _mm_ucomige_ss _mm_comige_ss
-#define _mm_ucomigt_ss _mm_comigt_ss
-#define _mm_ucomile_ss _mm_comile_ss
-#define _mm_ucomilt_ss _mm_comilt_ss
-#define _mm_ucomineq_ss _mm_comineq_ss
-
-// Return vector of type __m128i with undefined elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
-FORCE_INLINE __m128i _mm_undefined_si128(void)
-{
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-    __m128i a;
-    return a;
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-// Return vector of type __m128 with undefined elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
-FORCE_INLINE __m128 _mm_undefined_ps(void)
-{
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-    __m128 a;
-    return a;
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-// Unpack and interleave single-precision (32-bit) floating-point elements from
-// the high half a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
-FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
-    float32x2x2_t result = vzip_f32(a1, b1);
-    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave single-precision (32-bit) floating-point elements from
-// the low half of a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
-FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
-    float32x2x2_t result = vzip_f32(a1, b1);
-    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
-FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-/* SSE2 */
-
-// Add packed 16-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
-FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Add packed 32-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
-FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Add packed 64-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
-FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s64(
-        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-}
-
-// Add packed 8-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
-FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Add packed double-precision (64-bit) floating-point elements in a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
-FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1] + db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Add the lower double-precision (64-bit) floating-point element in a and b,
-// store the result in the lower element of dst, and copy the upper element from
-// a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
-FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_add_pd(a, b));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Add 64-bit integers a and b, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
-FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s64(
-        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
-}
-
-// Add packed signed 16-bit integers in a and b using saturation, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
-FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Add packed signed 8-bit integers in a and b using saturation, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
-FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Add packed unsigned 16-bit integers in a and b using saturation, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
-FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Add packed unsigned 8-bit integers in a and b using saturation, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
-FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Compute the bitwise AND of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
-FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_s64(
-        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
-FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
-// elements in a and then AND with b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
-FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
-{
-    // *NOTE* argument swap
-    return vreinterpretq_m128d_s64(
-        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
-}
-
-// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
-// AND with b, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
-FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vbicq_s32(vreinterpretq_s32_m128i(b),
-                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
-}
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
-FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
-{
-    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
-                                 vreinterpretq_u16_m128i(b));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
-FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Shift a left by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
-#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
-
-// Shift a right by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
-#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
-
-// Cast vector of type __m128d to type __m128. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
-FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
-{
-    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
-}
-
-// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
-FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
-{
-    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
-}
-
-// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
-FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
-{
-    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
-}
-
-// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
-FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
-{
-    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
-}
-
-// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
-FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
-#else
-    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
-#endif
-}
-
-// Cast vector of type __m128i to type __m128. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
-FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
-{
-    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
-}
-
-// Invalidate and flush the cache line that contains p from all levels of the
-// cache hierarchy.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
-#if defined(__APPLE__)
-#include <libkern/OSCacheControl.h>
-#endif
-FORCE_INLINE void _mm_clflush(void const *p)
-{
-    (void) p;
-
-    /* sys_icache_invalidate is supported since macOS 10.5.
-     * However, it does not work on non-jailbroken iOS devices, although the
-     * compilation is successful.
-     */
-#if defined(__APPLE__)
-    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
-#elif defined(__GNUC__) || defined(__clang__)
-    uintptr_t ptr = (uintptr_t) p;
-    __builtin___clear_cache((char *) ptr,
-                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
-#else
-    /* FIXME: MSVC support */
-#endif
-}
-
-// Compare packed 16-bit integers in a and b for equality, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
-FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed 32-bit integers in a and b for equality, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed 8-bit integers in a and b for equality, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
-FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for equality, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
-FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for equality, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
-FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
-FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for greater-than-or-equal, store the result in the lower element of dst,
-// and copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
-FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
-#else
-    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed signed 16-bit integers in a and b for greater-than, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
-FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed signed 32-bit integers in a and b for greater-than, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
-FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b for greater-than, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
-FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
-FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for greater-than, store the result in the lower element of dst, and copy
-// the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
-FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
-#else
-    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
-FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for less-than-or-equal, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
-FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmple_pd(a, b));
-#else
-    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed signed 16-bit integers in a and b for less-than, and store the
-// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
-// order of the operands switched.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
-FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed signed 32-bit integers in a and b for less-than, and store the
-// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
-// order of the operands switched.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
-FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b for less-than, and store the
-// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
-// order of the operands switched.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
-FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
-FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for less-than, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
-FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
-FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
-#else
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-equal, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
-FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
-FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-greater-than-or-equal, store the result in the lower element of
-// dst, and copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
-FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
-FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-greater-than, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
-FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
-FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-less-than-or-equal, store the result in the lower element of dst,
-// and copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
-FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
-FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-less-than, store the result in the lower element of dst, and copy
-// the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
-FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// to see if neither is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
-FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    // Excluding NaNs, any two floating point numbers can be compared.
-    uint64x2_t not_nan_a =
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
-    uint64x2_t not_nan_b =
-        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
-    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b to see if neither is NaN, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
-FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// to see if either is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
-FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    // Two NaNs are not equal in comparison operation.
-    uint64x2_t not_nan_a =
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
-    uint64x2_t not_nan_b =
-        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
-    return vreinterpretq_m128d_s32(
-        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b to see if either is NaN, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
-FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for greater-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
-FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 >= *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for greater-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
-FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 > *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for less-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
-FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 <= *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for less-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
-FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 < *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for equality, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
-FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
-#else
-    uint32x4_t a_not_nan =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
-    uint32x4_t b_not_nan =
-        vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
-    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-    uint32x4_t a_eq_b =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
-    uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
-                                       vreinterpretq_u64_u32(a_eq_b));
-    return vgetq_lane_u64(and_results, 0) & 0x1;
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for not-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
-FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
-{
-    return !_mm_comieq_sd(a, b);
-}
-
-// Convert packed signed 32-bit integers in a to packed double-precision
-// (64-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
-FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
-#else
-    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
-    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Convert packed signed 32-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
-FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
-FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
-{
-// vrnd32xq_f64 not supported on clang
-#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
-    float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
-    int64x2_t integers = vcvtq_s64_f64(rounded);
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
-#else
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double d0 = ((double *) &rnd)[0];
-    double d1 = ((double *) &rnd)[1];
-    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
-#endif
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
-FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
-{
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double d0 = ((double *) &rnd)[0];
-    double d1 = ((double *) &rnd)[1];
-    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
-    return vreinterpret_m64_s32(vld1_s32(data));
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed single-precision (32-bit) floating-point elements, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
-FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
-{
-#if defined(__aarch64__)
-    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
-    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
-#else
-    float a0 = (float) ((double *) &a)[0];
-    float a1 = (float) ((double *) &a)[1];
-    return _mm_set_ps(0, 0, a1, a0);
-#endif
-}
-
-// Convert packed signed 32-bit integers in a to packed double-precision
-// (64-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
-FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
-#else
-    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
-    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
-// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
-// does not support! It is supported on ARMv8-A however.
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
-{
-#if defined(__ARM_FEATURE_FRINT)
-    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
-#elif defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    switch (_MM_GET_ROUNDING_MODE()) {
-    case _MM_ROUND_NEAREST:
-        return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
-    case _MM_ROUND_DOWN:
-        return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
-    case _MM_ROUND_UP:
-        return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
-    default:  // _MM_ROUND_TOWARD_ZERO
-        return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
-    }
-#else
-    float *f = (float *) &a;
-    switch (_MM_GET_ROUNDING_MODE()) {
-    case _MM_ROUND_NEAREST: {
-        uint32x4_t signmask = vdupq_n_u32(0x80000000);
-        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
-                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
-        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
-            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
-        int32x4_t r_trunc = vcvtq_s32_f32(
-            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
-        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
-            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
-        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
-                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
-        float32x4_t delta = vsubq_f32(
-            vreinterpretq_f32_m128(a),
-            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
-        uint32x4_t is_delta_half =
-            vceqq_f32(delta, half); /* delta == +/- 0.5 */
-        return vreinterpretq_m128i_s32(
-            vbslq_s32(is_delta_half, r_even, r_normal));
-    }
-    case _MM_ROUND_DOWN:
-        return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
-                             floorf(f[0]));
-    case _MM_ROUND_UP:
-        return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
-                             ceilf(f[0]));
-    default:  // _MM_ROUND_TOWARD_ZERO
-        return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
-                             (int32_t) f[0]);
-    }
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed double-precision (64-bit) floating-point elements, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
-FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
-#else
-    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Copy the lower double-precision (64-bit) floating-point element of a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
-FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
-{
-#if defined(__aarch64__)
-    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
-#else
-    return ((double *) &a)[0];
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
-FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
-{
-#if defined(__aarch64__)
-    return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
-    return (int32_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
-FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
-{
-#if defined(__aarch64__)
-    return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
-    return (int64_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
-#define _mm_cvtsd_si64x _mm_cvtsd_si64
-
-// Convert the lower double-precision (64-bit) floating-point element in b to a
-// single-precision (32-bit) floating-point element, store the result in the
-// lower element of dst, and copy the upper 3 packed elements from a to the
-// upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
-FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(vsetq_lane_f32(
-        vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
-        vreinterpretq_f32_m128(a), 0));
-#else
-    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
-                                                 vreinterpretq_f32_m128(a), 0));
-#endif
-}
-
-// Copy the lower 32-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
-FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
-{
-    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
-FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
-{
-    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
-#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
-// Convert the signed 32-bit integer b to a double-precision (64-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
-FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
-#else
-    double bf = (double) b;
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
-#endif
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
-#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
-// Copy 32-bit integer a to the lower elements of dst, and zero the upper
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
-FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
-{
-    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
-}
-
-// Convert the signed 64-bit integer b to a double-precision (64-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
-FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
-#else
-    double bf = (double) b;
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
-#endif
-}
-
-// Copy 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
-FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
-{
-    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
-}
-
-// Copy 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
-#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
-
-// Convert the signed 64-bit integer b to a double-precision (64-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
-#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
-
-// Convert the lower single-precision (32-bit) floating-point element in b to a
-// double-precision (64-bit) floating-point element, store the result in the
-// lower element of dst, and copy the upper element from a to the upper element
-// of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
-FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
-{
-    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
-#else
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
-#endif
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
-FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
-{
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
-    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
-FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
-{
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
-    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
-    return vreinterpret_m64_s32(vld1_s32(data));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
-FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
-{
-    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
-FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
-{
-    double ret = *((double *) &a);
-    return (int32_t) ret;
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
-FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
-    double ret = *((double *) &a);
-    return (int64_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
-#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
-
-// Divide packed double-precision (64-bit) floating-point elements in a by
-// packed elements in b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
-FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] / db[0];
-    c[1] = da[1] / db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Divide the lower double-precision (64-bit) floating-point element in a by the
-// lower double-precision (64-bit) floating-point element in b, store the result
-// in the lower element of dst, and copy the upper element from a to the upper
-// element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
-FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    float64x2_t tmp =
-        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
-#else
-    return _mm_move_sd(a, _mm_div_pd(a, b));
-#endif
-}
-
-// Extract a 16-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
-// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
-#define _mm_extract_epi16(a, imm) \
-    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
-// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
-//                                       __constrange(0,8) int imm)
-#define _mm_insert_epi16(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s16(                                     \
-            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
-    })
-
-// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
-FORCE_INLINE __m128d _mm_load_pd(const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_f64(p));
-#else
-    const float *fp = (const float *) p;
-    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
-    return vreinterpretq_m128d_f32(vld1q_f32(data));
-#endif
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
-#define _mm_load_pd1 _mm_load1_pd
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower of dst, and zero the upper element. mem_addr does not need to be
-// aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
-FORCE_INLINE __m128d _mm_load_sd(const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
-#else
-    const float *fp = (const float *) p;
-    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
-    return vreinterpretq_m128d_f32(vld1q_f32(data));
-#endif
-}
-
-// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
-// on a 16-byte boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
-FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
-{
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
-FORCE_INLINE __m128d _mm_load1_pd(const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
-#else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
-#endif
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// upper element of dst, and copy the lower element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
-FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
-#else
-    return vreinterpretq_m128d_f32(vcombine_f32(
-        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
-#endif
-}
-
-// Load 64-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
-FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
-{
-    /* Load the lower 64 bits of the value pointed to by p into the
-     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
-     */
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower element of dst, and copy the upper element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
-FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
-#else
-    return vreinterpretq_m128d_f32(
-        vcombine_f32(vld1_f32((const float *) p),
-                     vget_high_f32(vreinterpretq_f32_m128d(a))));
-#endif
-}
-
-// Load 2 double-precision (64-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
-FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
-{
-#if defined(__aarch64__)
-    float64x2_t v = vld1q_f64(p);
-    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
-#else
-    int64x2_t v = vld1q_s64((const int64_t *) p);
-    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
-#endif
-}
-
-// Loads two double-precision from unaligned memory, floating-point values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
-FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
-{
-    return _mm_load_pd(p);
-}
-
-// Load 128-bits of integer data from memory into dst. mem_addr does not need to
-// be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
-FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
-{
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-}
-
-// Load unaligned 32-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
-FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
-{
-    return vreinterpretq_m128i_s32(
-        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
-// 32-bit integers, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
-FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
-{
-    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-                              vget_low_s16(vreinterpretq_s16_m128i(b)));
-#if defined(__aarch64__)
-    int32x4_t high =
-        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
-
-    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
-#else
-    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
-                               vget_high_s16(vreinterpretq_s16_m128i(b)));
-
-    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
-    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
-
-    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
-#endif
-}
-
-// Conditionally store 8-bit integer elements from a into memory using mask
-// (elements are not stored when the highest bit is not set in the corresponding
-// element) and a non-temporal memory hint. mem_addr does not need to be aligned
-// on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
-FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
-{
-    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
-    __m128 b = _mm_load_ps((const float *) mem_addr);
-    int8x16_t masked =
-        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
-                 vreinterpretq_s8_m128(b));
-    vst1q_s8((int8_t *) mem_addr, masked);
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
-FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
-FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b,
-// and store packed maximum values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
-FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-#if SSE2NEON_PRECISE_MINMAX
-    float64x2_t _a = vreinterpretq_f64_m128d(a);
-    float64x2_t _b = vreinterpretq_f64_m128d(b);
-    return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128d_f64(
-        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#endif
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b, store the maximum value in the lower element of dst, and copy the upper
-// element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
-FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_max_pd(a, b));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
-    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
-#endif
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
-FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
-FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b,
-// and store packed minimum values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
-FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-#if SSE2NEON_PRECISE_MINMAX
-    float64x2_t _a = vreinterpretq_f64_m128d(a);
-    float64x2_t _b = vreinterpretq_f64_m128d(b);
-    return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128d_f64(
-        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#endif
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b, store the minimum value in the lower element of dst, and copy the upper
-// element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
-FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_min_pd(a, b));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
-    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
-#endif
-}
-
-// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
-// upper element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
-FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
-{
-    return vreinterpretq_m128i_s64(
-        vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
-}
-
-// Move the lower double-precision (64-bit) floating-point element from b to the
-// lower element of dst, and copy the upper element from a to the upper element
-// of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
-FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_f32(
-        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
-                     vget_high_f32(vreinterpretq_f32_m128d(a))));
-}
-
-// Create mask from the most significant bit of each 8-bit element in a, and
-// store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
-FORCE_INLINE int _mm_movemask_epi8(__m128i a)
-{
-    // Use increasingly wide shifts+adds to collect the sign bits
-    // together.
-    // Since the widening shifts would be rather confusing to follow in little
-    // endian, everything will be illustrated in big endian order instead. This
-    // has a different result - the bits would actually be reversed on a big
-    // endian machine.
-
-    // Starting input (only half the elements are shown):
-    // 89 ff 1d c0 00 10 99 33
-    uint8x16_t input = vreinterpretq_u8_m128i(a);
-
-    // Shift out everything but the sign bits with an unsigned shift right.
-    //
-    // Bytes of the vector::
-    // 89 ff 1d c0 00 10 99 33
-    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
-    //  |  |  |  |  |  |  |  |
-    // 01 01 00 01 00 00 01 00
-    //
-    // Bits of first important lane(s):
-    // 10001001 (89)
-    // \______
-    //        |
-    // 00000001 (01)
-    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
-
-    // Merge the even lanes together with a 16-bit unsigned shift right + add.
-    // 'xx' represents garbage data which will be ignored in the final result.
-    // In the important bytes, the add functions like a binary OR.
-    //
-    // 01 01 00 01 00 00 01 00
-    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
-    //    \|    \|    \|    \|
-    // xx 03 xx 01 xx 00 xx 02
-    //
-    // 00000001 00000001 (01 01)
-    //        \_______ |
-    //                \|
-    // xxxxxxxx xxxxxx11 (xx 03)
-    uint32x4_t paired16 =
-        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
-
-    // Repeat with a wider 32-bit shift + add.
-    // xx 03 xx 01 xx 00 xx 02
-    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
-    //     14))
-    //          \|          \|
-    // xx xx xx 0d xx xx xx 02
-    //
-    // 00000011 00000001 (03 01)
-    //        \\_____ ||
-    //         '----.\||
-    // xxxxxxxx xxxx1101 (xx 0d)
-    uint64x2_t paired32 =
-        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
-
-    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
-    // lanes. xx xx xx 0d xx xx xx 02
-    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
-    //            28))
-    //                      \|
-    // xx xx xx xx xx xx xx d2
-    //
-    // 00001101 00000010 (0d 02)
-    //     \   \___ |  |
-    //      '---.  \|  |
-    // xxxxxxxx 11010010 (xx d2)
-    uint8x16_t paired64 =
-        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
-
-    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
-    // xx xx xx xx xx xx xx d2
-    //                      ||  return paired64[0]
-    //                      d2
-    // Note: Little endian would return the correct value 4b (01001011) instead.
-    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
-}
-
-// Set each bit of mask dst based on the most significant bit of the
-// corresponding packed double-precision (64-bit) floating-point element in a.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
-FORCE_INLINE int _mm_movemask_pd(__m128d a)
-{
-    uint64x2_t input = vreinterpretq_u64_m128d(a);
-    uint64x2_t high_bits = vshrq_n_u64(input, 63);
-    return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
-FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
-{
-    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
-}
-
-// Copy the 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
-FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
-{
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
-}
-
-// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
-// a and b, and store the unsigned 64-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
-FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
-{
-    // vmull_u32 upcasts instead of masking, so we downcast.
-    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
-    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
-    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
-}
-
-// Multiply packed double-precision (64-bit) floating-point elements in a and b,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
-FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] * db[0];
-    c[1] = da[1] * db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Multiply the lower double-precision (64-bit) floating-point element in a and
-// b, store the result in the lower element of dst, and copy the upper element
-// from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
-FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_mul_pd(a, b));
-}
-
-// Multiply the low unsigned 32-bit integers from a and b, and store the
-// unsigned 64-bit result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
-FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u64(vget_low_u64(
-        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
-}
-
-// Multiply the packed signed 16-bit integers in a and b, producing intermediate
-// 32-bit integers, and store the high 16 bits of the intermediate integers in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
-FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
-{
-    /* FIXME: issue with large values because of result saturation */
-    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
-    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
-    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
-    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
-    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
-    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
-    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
-    uint16x8x2_t r =
-        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
-    return vreinterpretq_m128i_u16(r.val[1]);
-}
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
-FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
-{
-    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
-    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
-    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
-#if defined(__aarch64__)
-    uint32x4_t ab7654 =
-        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
-    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
-                              vreinterpretq_u16_u32(ab7654));
-    return vreinterpretq_m128i_u16(r);
-#else
-    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
-    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
-    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
-    uint16x8x2_t r =
-        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
-    return vreinterpretq_m128i_u16(r.val[1]);
-#endif
-}
-
-// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
-// integers, and store the low 16 bits of the intermediate integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
-FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compute the bitwise OR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
-FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_s64(
-        vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
-// and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
-FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-// using signed saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
-FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
-                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
-}
-
-// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-// using signed saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
-FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
-                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
-}
-
-// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-// using unsigned saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
-FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
-                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
-}
-
-// Pause the processor. This is typically used in spin-wait loops and depending
-// on the x86 processor typical values are in the 40-100 cycle range. The
-// 'yield' instruction isn't a good fit because it's effectively a nop on most
-// Arm cores. Experience with several databases has shown has shown an 'isb' is
-// a reasonable approximation.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
-FORCE_INLINE void _mm_pause()
-{
-    __asm__ __volatile__("isb\n");
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce two
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of 64-bit elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
-FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
-{
-    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
-    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
-}
-
-// Set packed 16-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
-FORCE_INLINE __m128i _mm_set_epi16(short i7,
-                                   short i6,
-                                   short i5,
-                                   short i4,
-                                   short i3,
-                                   short i2,
-                                   short i1,
-                                   short i0)
-{
-    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
-    return vreinterpretq_m128i_s16(vld1q_s16(data));
-}
-
-// Set packed 32-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
-FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
-{
-    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Set packed 64-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
-FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
-{
-    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
-}
-
-// Set packed 64-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
-{
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
-}
-
-// Set packed 8-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
-FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
-                                  signed char b14,
-                                  signed char b13,
-                                  signed char b12,
-                                  signed char b11,
-                                  signed char b10,
-                                  signed char b9,
-                                  signed char b8,
-                                  signed char b7,
-                                  signed char b6,
-                                  signed char b5,
-                                  signed char b4,
-                                  signed char b3,
-                                  signed char b2,
-                                  signed char b1,
-                                  signed char b0)
-{
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
-    return (__m128i) vld1q_s8(data);
-}
-
-// Set packed double-precision (64-bit) floating-point elements in dst with the
-// supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
-FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
-{
-    double ALIGN_STRUCT(16) data[2] = {e0, e1};
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
-#else
-    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
-#endif
-}
-
-// Broadcast double-precision (64-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
-#define _mm_set_pd1 _mm_set1_pd
-
-// Copy double-precision (64-bit) floating-point element a to the lower element
-// of dst, and zero the upper element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
-FORCE_INLINE __m128d _mm_set_sd(double a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
-#else
-    return _mm_set_pd(0, a);
-#endif
-}
-
-// Broadcast 16-bit integer a to all all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
-FORCE_INLINE __m128i _mm_set1_epi16(short w)
-{
-    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
-}
-
-// Broadcast 32-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
-FORCE_INLINE __m128i _mm_set1_epi32(int _i)
-{
-    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
-}
-
-// Broadcast 64-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
-FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
-{
-    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
-}
-
-// Broadcast 64-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
-FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
-{
-    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
-}
-
-// Broadcast 8-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
-FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
-{
-    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
-}
-
-// Broadcast double-precision (64-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
-FORCE_INLINE __m128d _mm_set1_pd(double d)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vdupq_n_f64(d));
-#else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
-#endif
-}
-
-// Set packed 16-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
-FORCE_INLINE __m128i _mm_setr_epi16(short w0,
-                                    short w1,
-                                    short w2,
-                                    short w3,
-                                    short w4,
-                                    short w5,
-                                    short w6,
-                                    short w7)
-{
-    int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
-    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
-}
-
-// Set packed 32-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
-FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
-{
-    int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Set packed 64-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
-FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
-{
-    return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
-}
-
-// Set packed 8-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
-FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
-                                   signed char b1,
-                                   signed char b2,
-                                   signed char b3,
-                                   signed char b4,
-                                   signed char b5,
-                                   signed char b6,
-                                   signed char b7,
-                                   signed char b8,
-                                   signed char b9,
-                                   signed char b10,
-                                   signed char b11,
-                                   signed char b12,
-                                   signed char b13,
-                                   signed char b14,
-                                   signed char b15)
-{
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
-    return (__m128i) vld1q_s8(data);
-}
-
-// Set packed double-precision (64-bit) floating-point elements in dst with the
-// supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
-FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
-{
-    return _mm_set_pd(e0, e1);
-}
-
-// Return vector of type __m128d with all elements set to zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
-FORCE_INLINE __m128d _mm_setzero_pd(void)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vdupq_n_f64(0));
-#else
-    return vreinterpretq_m128d_f32(vdupq_n_f32(0));
-#endif
-}
-
-// Return vector of type __m128i with all elements set to zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
-FORCE_INLINE __m128i _mm_setzero_si128(void)
-{
-    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
-}
-
-// Shuffle 32-bit integers in a using the control in imm8, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
-// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
-//                                        __constrange(0,255) int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_epi32(a, imm)                                            \
-    __extension__({                                                          \
-        int32x4_t _input = vreinterpretq_s32_m128i(a);                       \
-        int32x4_t _shuf =                                                    \
-            vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
-                          ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
-        vreinterpretq_m128i_s32(_shuf);                                      \
-    })
-#else  // generic
-#define _mm_shuffle_epi32(a, imm)                        \
-    __extension__({                                      \
-        __m128i ret;                                     \
-        switch (imm) {                                   \
-        case _MM_SHUFFLE(1, 0, 3, 2):                    \
-            ret = _mm_shuffle_epi_1032((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 3, 0, 1):                    \
-            ret = _mm_shuffle_epi_2301((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 3, 2, 1):                    \
-            ret = _mm_shuffle_epi_0321((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 1, 0, 3):                    \
-            ret = _mm_shuffle_epi_2103((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(1, 0, 1, 0):                    \
-            ret = _mm_shuffle_epi_1010((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(1, 0, 0, 1):                    \
-            ret = _mm_shuffle_epi_1001((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 1, 0, 1):                    \
-            ret = _mm_shuffle_epi_0101((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 2, 1, 1):                    \
-            ret = _mm_shuffle_epi_2211((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 1, 2, 2):                    \
-            ret = _mm_shuffle_epi_0122((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(3, 3, 3, 2):                    \
-            ret = _mm_shuffle_epi_3332((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 0, 0, 0):                    \
-            ret = _mm_shuffle_epi32_splat((a), 0);       \
-            break;                                       \
-        case _MM_SHUFFLE(1, 1, 1, 1):                    \
-            ret = _mm_shuffle_epi32_splat((a), 1);       \
-            break;                                       \
-        case _MM_SHUFFLE(2, 2, 2, 2):                    \
-            ret = _mm_shuffle_epi32_splat((a), 2);       \
-            break;                                       \
-        case _MM_SHUFFLE(3, 3, 3, 3):                    \
-            ret = _mm_shuffle_epi32_splat((a), 3);       \
-            break;                                       \
-        default:                                         \
-            ret = _mm_shuffle_epi32_default((a), (imm)); \
-            break;                                       \
-        }                                                \
-        ret;                                             \
-    })
-#endif
-
-// Shuffle double-precision (64-bit) floating-point elements using the control
-// in imm8, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_pd(a, b, imm8)                                            \
-    vreinterpretq_m128d_s64(                                                  \
-        vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
-                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
-#else
-#define _mm_shuffle_pd(a, b, imm8)                                     \
-    _mm_castsi128_pd(_mm_set_epi64x(                                   \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
-#endif
-
-// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
-//                                          __constrange(0,255) int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shufflehi_epi16(a, imm)                                           \
-    __extension__({                                                           \
-        int16x8_t _input = vreinterpretq_s16_m128i(a);                        \
-        int16x8_t _shuf =                                                     \
-            vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
-                          (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
-                          (((imm) >> 6) & 0x3) + 4);                          \
-        vreinterpretq_m128i_s16(_shuf);                                       \
-    })
-#else  // generic
-#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
-#endif
-
-// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
-//                                          __constrange(0,255) int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shufflelo_epi16(a, imm)                                  \
-    __extension__({                                                  \
-        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
-        int16x8_t _shuf = vshuffleq_s16(                             \
-            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
-            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
-        vreinterpretq_m128i_s16(_shuf);                              \
-    })
-#else  // generic
-#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
-#endif
-
-// Shift packed 16-bit integers in a left by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
-FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~15))
-        return _mm_setzero_si128();
-
-    int16x8_t vc = vdupq_n_s16((int16_t) c);
-    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
-}
-
-// Shift packed 32-bit integers in a left by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
-FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~31))
-        return _mm_setzero_si128();
-
-    int32x4_t vc = vdupq_n_s32((int32_t) c);
-    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
-}
-
-// Shift packed 64-bit integers in a left by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
-FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~63))
-        return _mm_setzero_si128();
-
-    int64x2_t vc = vdupq_n_s64((int64_t) c);
-    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
-}
-
-// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
-FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~15))
-        return _mm_setzero_si128();
-    return vreinterpretq_m128i_s16(
-        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
-}
-
-// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
-FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~31))
-        return _mm_setzero_si128();
-    return vreinterpretq_m128i_s32(
-        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
-}
-
-// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
-FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~63))
-        return _mm_setzero_si128();
-    return vreinterpretq_m128i_s64(
-        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
-}
-
-// Shift a left by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
-#define _mm_slli_si128(a, imm)                                         \
-    __extension__({                                                    \
-        int8x16_t ret;                                                 \
-        if (_sse2neon_unlikely(imm == 0))                              \
-            ret = vreinterpretq_s8_m128i(a);                           \
-        else if (_sse2neon_unlikely((imm) & ~15))                      \
-            ret = vdupq_n_s8(0);                                       \
-        else                                                           \
-            ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a),   \
-                           ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
-        vreinterpretq_m128i_s8(ret);                                   \
-    })
-
-// Compute the square root of packed double-precision (64-bit) floating-point
-// elements in a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
-FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
-#else
-    double a0 = sqrt(((double *) &a)[0]);
-    double a1 = sqrt(((double *) &a)[1]);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Compute the square root of the lower double-precision (64-bit) floating-point
-// element in b, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
-FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_sqrt_pd(b));
-#else
-    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
-#endif
-}
-
-// Shift packed 16-bit integers in a right by count while shifting in sign bits,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
-FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
-{
-    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-    if (_sse2neon_unlikely(c & ~15))
-        return _mm_cmplt_epi16(a, _mm_setzero_si128());
-    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
-}
-
-// Shift packed 32-bit integers in a right by count while shifting in sign bits,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
-FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
-{
-    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-    if (_sse2neon_unlikely(c & ~31))
-        return _mm_cmplt_epi32(a, _mm_setzero_si128());
-    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
-}
-
-// Shift packed 16-bit integers in a right by imm8 while shifting in sign
-// bits, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
-FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
-{
-    const int count = (imm & ~15) ? 15 : imm;
-    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
-}
-
-// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
-// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srai_epi32(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) == 0)) {                                \
-            ret = a;                                                         \
-        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) {              \
-            ret = vreinterpretq_m128i_s32(                                   \
-                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_s32(                                   \
-                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));                \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift packed 16-bit integers in a right by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
-FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~15))
-        return _mm_setzero_si128();
-
-    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
-    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
-}
-
-// Shift packed 32-bit integers in a right by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
-FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~31))
-        return _mm_setzero_si128();
-
-    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
-    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
-}
-
-// Shift packed 64-bit integers in a right by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
-FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~63))
-        return _mm_setzero_si128();
-
-    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
-    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
-}
-
-// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) & ~15)) {                               \
-            ret = _mm_setzero_si128();                                       \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_u16(                                   \
-                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
-// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_epi32(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) & ~31)) {                               \
-            ret = _mm_setzero_si128();                                       \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_u32(                                   \
-                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
-#define _mm_srli_epi64(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) & ~63)) {                               \
-            ret = _mm_setzero_si128();                                       \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_u64(                                   \
-                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift a right by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
-#define _mm_srli_si128(a, imm)                                       \
-    __extension__({                                                  \
-        int8x16_t ret;                                               \
-        if (_sse2neon_unlikely((imm) & ~15))                         \
-            ret = vdupq_n_s8(0);                                     \
-        else                                                         \
-            ret = vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), \
-                           (imm > 15 ? 0 : imm));                    \
-        vreinterpretq_m128i_s8(ret);                                 \
-    })
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-// or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
-FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
-#else
-    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
-#endif
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
-FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
-    vst1q_f64((float64_t *) mem_addr,
-              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
-#else
-    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
-    vst1q_f32((float32_t *) mem_addr,
-              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
-#endif
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// memory. mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
-FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
-#else
-    vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
-#endif
-}
-
-// Store 128-bits of integer data from a into memory. mem_addr must be aligned
-// on a 16-byte boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
-FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
-{
-    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
-#define _mm_store1_pd _mm_store_pd1
-
-// Store the upper double-precision (64-bit) floating-point element from a into
-// memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
-FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
-#else
-    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
-#endif
-}
-
-// Store 64-bit integer from the first element of a into memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
-FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
-{
-    vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
-FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
-#else
-    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
-#endif
-}
-
-// Store 2 double-precision (64-bit) floating-point elements from a into memory
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
-FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
-{
-    float32x4_t f = vreinterpretq_f32_m128d(a);
-    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
-}
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
-FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
-{
-    _mm_store_pd(mem_addr, a);
-}
-
-// Store 128-bits of integer data from a into memory. mem_addr does not need to
-// be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
-FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
-{
-    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-}
-
-// Store 32-bit integer from the first element of a into memory. mem_addr does
-// not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
-FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
-{
-    vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
-}
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory using a non-temporal memory hint. mem_addr must
-// be aligned on a 16-byte boundary or a general-protection exception may be
-// generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
-FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, (float32x4_t *) p);
-#elif defined(__aarch64__)
-    vst1q_f64(p, vreinterpretq_f64_m128d(a));
-#else
-    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
-#endif
-}
-
-// Store 128-bits of integer data from a into memory using a non-temporal memory
-// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
-// exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
-FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, p);
-#else
-    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
-#endif
-}
-
-// Store 32-bit integer a into memory using a non-temporal hint to minimize
-// cache pollution. If the cache line containing address mem_addr is already in
-// the cache, the cache will be updated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
-FORCE_INLINE void _mm_stream_si32(int *p, int a)
-{
-    vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
-}
-
-// Store 64-bit integer a into memory using a non-temporal hint to minimize
-// cache pollution. If the cache line containing address mem_addr is already in
-// the cache, the cache will be updated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
-FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
-{
-    vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
-}
-
-// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
-FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
-FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
-FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s64(
-        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-}
-
-// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
-FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Subtract packed double-precision (64-bit) floating-point elements in b from
-// packed double-precision (64-bit) floating-point elements in a, and store the
-// results in dst.
-//  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
-FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] - db[0];
-    c[1] = da[1] - db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Subtract the lower double-precision (64-bit) floating-point element in b from
-// the lower double-precision (64-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
-FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_sub_pd(a, b));
-}
-
-// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
-FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s64(
-        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
-}
-
-// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
-// using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
-FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
-// using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
-FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
-// integers in a using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
-FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
-// integers in a using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
-FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-#define _mm_ucomieq_sd _mm_comieq_sd
-#define _mm_ucomige_sd _mm_comige_sd
-#define _mm_ucomigt_sd _mm_comigt_sd
-#define _mm_ucomile_sd _mm_comile_sd
-#define _mm_ucomilt_sd _mm_comilt_sd
-#define _mm_ucomineq_sd _mm_comineq_sd
-
-// Return vector of type __m128d with undefined elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
-FORCE_INLINE __m128d _mm_undefined_pd(void)
-{
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-    __m128d a;
-    return a;
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-// Unpack and interleave 16-bit integers from the high half of a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
-FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
-    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
-    int16x4x2_t result = vzip_s16(a1, b1);
-    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 32-bit integers from the high half of a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
-FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-#else
-    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
-    int32x2x2_t result = vzip_s32(a1, b1);
-    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 64-bit integers from the high half of a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
-FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s64(
-        vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
-    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
-    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
-#endif
-}
-
-// Unpack and interleave 8-bit integers from the high half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
-FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(
-        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
-    int8x8_t a1 =
-        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
-    int8x8_t b1 =
-        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
-    int8x8x2_t result = vzip_s8(a1, b1);
-    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave double-precision (64-bit) floating-point elements from
-// the high half of a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
-FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    return vreinterpretq_m128d_s64(
-        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
-                     vget_high_s64(vreinterpretq_s64_m128d(b))));
-#endif
-}
-
-// Unpack and interleave 16-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
-FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
-    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
-    int16x4x2_t result = vzip_s16(a1, b1);
-    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 32-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
-FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-#else
-    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
-    int32x2x2_t result = vzip_s32(a1, b1);
-    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 64-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
-FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s64(
-        vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
-    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
-    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
-#endif
-}
-
-// Unpack and interleave 8-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
-FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(
-        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
-    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
-    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
-    int8x8x2_t result = vzip_s8(a1, b1);
-    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave double-precision (64-bit) floating-point elements from
-// the low half of a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
-FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    return vreinterpretq_m128d_s64(
-        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
-                     vget_low_s64(vreinterpretq_s64_m128d(b))));
-#endif
-}
-
-// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
-FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_s64(
-        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
-// and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
-FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-/* SSE3 */
-
-// Alternatively add and subtract packed double-precision (64-bit)
-// floating-point elements in a to/from packed elements in b, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
-FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
-{
-    _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
-                                             vreinterpretq_f64_m128d(b),
-                                             vreinterpretq_f64_m128d(mask)));
-#else
-    return _mm_add_pd(_mm_mul_pd(b, mask), a);
-#endif
-}
-
-// Alternatively add and subtract packed single-precision (32-bit)
-// floating-point elements in a to/from packed elements in b, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
-FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
-{
-    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
-#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
-    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
-                                            vreinterpretq_f32_m128(mask),
-                                            vreinterpretq_f32_m128(b)));
-#else
-    return _mm_add_ps(_mm_mul_ps(b, mask), a);
-#endif
-}
-
-// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
-// elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
-FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[] = {da[0] + da[1], db[0] + db[1]};
-    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
-#endif
-}
-
-// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
-// elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
-FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of double-precision (64-bit)
-// floating-point elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
-FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
-{
-#if defined(__aarch64__)
-    float64x2_t a = vreinterpretq_f64_m128d(_a);
-    float64x2_t b = vreinterpretq_f64_m128d(_b);
-    return vreinterpretq_m128d_f64(
-        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
-#else
-    double *da = (double *) &_a;
-    double *db = (double *) &_b;
-    double c[] = {da[0] - da[1], db[0] - db[1]};
-    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of single-precision (32-bit)
-// floating-point elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
-FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
-{
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
-#else
-    float32x4x2_t c = vuzpq_f32(a, b);
-    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
-#endif
-}
-
-// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
-// may perform better than _mm_loadu_si128 when the data crosses a cache line
-// boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
-#define _mm_lddqu_si128 _mm_loadu_si128
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
-#define _mm_loaddup_pd _mm_load1_pd
-
-// Duplicate the low double-precision (64-bit) floating-point element from a,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
-FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
-#else
-    return vreinterpretq_m128d_u64(
-        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
-#endif
-}
-
-// Duplicate odd-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
-FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
-#elif defined(_sse2neon_shuffle)
-    return vreinterpretq_m128_f32(vshuffleq_s32(
-        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
-#else
-    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
-    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
-    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-#endif
-}
-
-// Duplicate even-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
-FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
-#elif defined(_sse2neon_shuffle)
-    return vreinterpretq_m128_f32(vshuffleq_s32(
-        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
-#else
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
-    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-#endif
-}
-
-/* SSSE3 */
-
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
-FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
-{
-    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
-FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
-{
-    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
-FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
-{
-    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
-FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
-{
-    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
-}
-
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
-FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
-{
-    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
-}
-
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
-FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
-{
-    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
-}
-
-// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
-// the result right by imm8 bytes, and store the low 16 bytes in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
-#define _mm_alignr_epi8(a, b, imm)                                            \
-    __extension__({                                                           \
-        uint8x16_t _a = vreinterpretq_u8_m128i(a);                            \
-        uint8x16_t _b = vreinterpretq_u8_m128i(b);                            \
-        __m128i ret;                                                          \
-        if (_sse2neon_unlikely((imm) & ~31))                                  \
-            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));                      \
-        else if (imm >= 16)                                                   \
-            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0);                \
-        else                                                                  \
-            ret =                                                             \
-                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
-        ret;                                                                  \
-    })
-
-// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
-// the result right by imm8 bytes, and store the low 8 bytes in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
-#define _mm_alignr_pi8(a, b, imm)                                           \
-    __extension__({                                                         \
-        __m64 ret;                                                          \
-        if (_sse2neon_unlikely((imm) >= 16)) {                              \
-            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
-        } else {                                                            \
-            uint8x8_t tmp_low, tmp_high;                                    \
-            if ((imm) >= 8) {                                               \
-                const int idx = (imm) -8;                                   \
-                tmp_low = vreinterpret_u8_m64(a);                           \
-                tmp_high = vdup_n_u8(0);                                    \
-                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
-            } else {                                                        \
-                const int idx = (imm);                                      \
-                tmp_low = vreinterpret_u8_m64(b);                           \
-                tmp_high = vreinterpret_u8_m64(a);                          \
-                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
-            }                                                               \
-        }                                                                   \
-        ret;                                                                \
-    })
-
-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
-FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
-#else
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
-                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
-#endif
-}
-
-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
-FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
-{
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
-#else
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
-                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
-#endif
-}
-
-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
-FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s16(
-        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
-FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s32(
-        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
-}
-
-// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
-// saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
-FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-    return vreinterpretq_s64_s16(
-        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-    // Interleave using vshrn/vmovn
-    // [a0|a2|a4|a6|b0|b2|b4|b6]
-    // [a1|a3|a5|a7|b1|b3|b5|b7]
-    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
-    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
-    // Saturated add
-    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
-#endif
-}
-
-// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
-// saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
-FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
-#else
-    int16x4x2_t res = vuzp_s16(a, b);
-    return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
-// the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
-FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int16x8x2_t c = vuzpq_s16(a, b);
-    return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
-// the signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
-FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
-{
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
-#else
-    int32x4x2_t c = vuzpq_s32(a, b);
-    return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
-// the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
-FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
-#else
-    int16x4x2_t c = vuzp_s16(a, b);
-    return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
-// the signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
-FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
-{
-    int32x2_t a = vreinterpret_s32_m64(_a);
-    int32x2_t b = vreinterpret_s32_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
-#else
-    int32x2x2_t c = vuzp_s32(a, b);
-    return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
-// using saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
-FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int16x8x2_t c = vuzpq_s16(a, b);
-    return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
-// using saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
-FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
-#else
-    int16x4x2_t c = vuzp_s16(a, b);
-    return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Vertically multiply each unsigned 8-bit integer from a with the corresponding
-// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
-// and pack the saturated results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
-FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
-    uint8x16_t a = vreinterpretq_u8_m128i(_a);
-    int8x16_t b = vreinterpretq_s8_m128i(_b);
-    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
-                             vmovl_s8(vget_low_s8(b)));
-    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
-                             vmovl_s8(vget_high_s8(b)));
-    return vreinterpretq_m128i_s16(
-        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
-#else
-    // This would be much simpler if x86 would choose to zero extend OR sign
-    // extend, not both. This could probably be optimized better.
-    uint16x8_t a = vreinterpretq_u16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-
-    // Zero extend a
-    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
-    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
-
-    // Sign extend by shifting left then shifting right.
-    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
-    int16x8_t b_odd = vshrq_n_s16(b, 8);
-
-    // multiply
-    int16x8_t prod1 = vmulq_s16(a_even, b_even);
-    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
-
-    // saturated add
-    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
-#endif
-}
-
-// Vertically multiply each unsigned 8-bit integer from a with the corresponding
-// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
-// pack the saturated results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
-FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
-{
-    uint16x4_t a = vreinterpret_u16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-
-    // Zero extend a
-    int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
-    int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
-
-    // Sign extend by shifting left then shifting right.
-    int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
-    int16x4_t b_odd = vshr_n_s16(b, 8);
-
-    // multiply
-    int16x4_t prod1 = vmul_s16(a_even, b_even);
-    int16x4_t prod2 = vmul_s16(a_odd, b_odd);
-
-    // saturated add
-    return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
-// the packed 16-bit integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
-FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
-{
-    // Has issues due to saturation
-    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
-
-    // Multiply
-    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
-    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
-                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
-
-    // Rounding narrowing shift right
-    // narrow = (int16_t)((mul + 16384) >> 15);
-    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
-    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
-
-    // Join together
-    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Truncate each intermediate integer to the 18 most
-// significant bits, round by adding 1, and store bits [16:1] to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
-FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
-{
-    int32x4_t mul_extend =
-        vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
-
-    // Rounding narrowing shift right
-    return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
-}
-
-// Shuffle packed 8-bit integers in a according to shuffle control mask in the
-// corresponding 8-bit element of b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
-FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
-{
-    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
-    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
-    uint8x16_t idx_masked =
-        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
-#elif defined(__GNUC__)
-    int8x16_t ret;
-    // %e and %f represent the even and odd D registers
-    // respectively.
-    __asm__ __volatile__(
-        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
-        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
-        : [ret] "=&w"(ret)
-        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
-    return vreinterpretq_m128i_s8(ret);
-#else
-    // use this line if testing on aarch64
-    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
-    return vreinterpretq_m128i_s8(
-        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
-                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
-#endif
-}
-
-// Shuffle packed 8-bit integers in a according to shuffle control mask in the
-// corresponding 8-bit element of b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
-FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
-{
-    const int8x8_t controlMask =
-        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
-    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
-    return vreinterpret_m64_s8(res);
-}
-
-// Negate packed 16-bit integers in a when the corresponding signed
-// 16-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
-FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFF : 0
-    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
-    // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__)
-    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
-#else
-    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
-    // 'a') based on ltMask
-    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
-    // res = masked & (~zeroMask)
-    int16x8_t res = vbicq_s16(masked, zeroMask);
-    return vreinterpretq_m128i_s16(res);
-}
-
-// Negate packed 32-bit integers in a when the corresponding signed
-// 32-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
-FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
-{
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFFFFFF : 0
-    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
-
-    // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__)
-    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
-#else
-    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
-    // 'a') based on ltMask
-    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
-    // res = masked & (~zeroMask)
-    int32x4_t res = vbicq_s32(masked, zeroMask);
-    return vreinterpretq_m128i_s32(res);
-}
-
-// Negate packed 8-bit integers in a when the corresponding signed
-// 8-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
-FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
-{
-    int8x16_t a = vreinterpretq_s8_m128i(_a);
-    int8x16_t b = vreinterpretq_s8_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFF : 0
-    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
-
-    // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__)
-    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
-#else
-    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
-    // based on ltMask
-    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
-    // res = masked & (~zeroMask)
-    int8x16_t res = vbicq_s8(masked, zeroMask);
-
-    return vreinterpretq_m128i_s8(res);
-}
-
-// Negate packed 16-bit integers in a when the corresponding signed 16-bit
-// integer in b is negative, and store the results in dst. Element in dst are
-// zeroed out when the corresponding element in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
-FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFF : 0
-    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
-
-    // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__)
-    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
-#else
-    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
-    // based on ltMask
-    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
-    // res = masked & (~zeroMask)
-    int16x4_t res = vbic_s16(masked, zeroMask);
-
-    return vreinterpret_m64_s16(res);
-}
-
-// Negate packed 32-bit integers in a when the corresponding signed 32-bit
-// integer in b is negative, and store the results in dst. Element in dst are
-// zeroed out when the corresponding element in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
-FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
-{
-    int32x2_t a = vreinterpret_s32_m64(_a);
-    int32x2_t b = vreinterpret_s32_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFFFFFF : 0
-    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
-
-    // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__)
-    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
-#else
-    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
-    // based on ltMask
-    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
-    // res = masked & (~zeroMask)
-    int32x2_t res = vbic_s32(masked, zeroMask);
-
-    return vreinterpret_m64_s32(res);
-}
-
-// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
-// in b is negative, and store the results in dst. Element in dst are zeroed out
-// when the corresponding element in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
-FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
-{
-    int8x8_t a = vreinterpret_s8_m64(_a);
-    int8x8_t b = vreinterpret_s8_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFF : 0
-    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
-
-    // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__)
-    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
-#else
-    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
-    // based on ltMask
-    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
-    // res = masked & (~zeroMask)
-    int8x8_t res = vbic_s8(masked, zeroMask);
-
-    return vreinterpret_m64_s8(res);
-}
-
-/* SSE4.1 */
-
-// Blend packed 16-bit integers from a and b using control mask imm8, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
-// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
-//                                      __constrange(0,255) int imm)
-#define _mm_blend_epi16(a, b, imm)                                            \
-    __extension__({                                                           \
-        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
-        uint16x8_t _mask_vec = vld1q_u16(_mask);                              \
-        uint16x8_t _a = vreinterpretq_u16_m128i(a);                           \
-        uint16x8_t _b = vreinterpretq_u16_m128i(b);                           \
-        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));                \
-    })
-
-// Blend packed double-precision (64-bit) floating-point elements from a and b
-// using control mask imm8, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
-#define _mm_blend_pd(a, b, imm)                                \
-    __extension__({                                            \
-        const uint64_t _mask[2] = {                            \
-            ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),   \
-            ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)};  \
-        uint64x2_t _mask_vec = vld1q_u64(_mask);               \
-        uint64x2_t _a = vreinterpretq_u64_m128d(a);            \
-        uint64x2_t _b = vreinterpretq_u64_m128d(b);            \
-        vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
-    })
-
-// Blend packed single-precision (32-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
-FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
-{
-    const uint32_t ALIGN_STRUCT(16)
-        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
-    uint32x4_t mask = vld1q_u32(data);
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
-}
-
-// Blend packed 8-bit integers from a and b using mask, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
-FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
-{
-    // Use a signed shift right to create a mask with the sign bit
-    uint8x16_t mask =
-        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
-    uint8x16_t a = vreinterpretq_u8_m128i(_a);
-    uint8x16_t b = vreinterpretq_u8_m128i(_b);
-    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
-}
-
-// Blend packed double-precision (64-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
-FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
-{
-    uint64x2_t mask =
-        vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
-#if defined(__aarch64__)
-    float64x2_t a = vreinterpretq_f64_m128d(_a);
-    float64x2_t b = vreinterpretq_f64_m128d(_b);
-    return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
-#else
-    uint64x2_t a = vreinterpretq_u64_m128d(_a);
-    uint64x2_t b = vreinterpretq_u64_m128d(_b);
-    return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
-#endif
-}
-
-// Blend packed single-precision (32-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
-FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
-{
-    // Use a signed shift right to create a mask with the sign bit
-    uint32x4_t mask =
-        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
-}
-
-// Round the packed double-precision (64-bit) floating-point elements in a up
-// to an integer value, and store the results as packed double-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
-FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
-#else
-    double *f = (double *) &a;
-    return _mm_set_pd(ceil(f[1]), ceil(f[0]));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a up to
-// an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
-FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
-#else
-    float *f = (float *) &a;
-    return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
-#endif
-}
-
-// Round the lower double-precision (64-bit) floating-point element in b up to
-// an integer value, store the result as a double-precision floating-point
-// element in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
-FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_ceil_pd(b));
-}
-
-// Round the lower single-precision (32-bit) floating-point element in b up to
-// an integer value, store the result as a single-precision floating-point
-// element in the lower element of dst, and copy the upper 3 packed elements
-// from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
-FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_ceil_ps(b));
-}
-
-// Compare packed 64-bit integers in a and b for equality, and store the results
-// in dst
-FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_u64(
-        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
-#else
-    // ARMv7 lacks vceqq_u64
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
-#endif
-}
-
-// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
-FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
-{
-    return vreinterpretq_m128i_s32(
-        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
-}
-
-// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
-FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
-{
-    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
-    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
-FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
-{
-    return vreinterpretq_m128i_s64(
-        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
-}
-
-// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
-FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
-{
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
-    return vreinterpretq_m128i_s16(s16x8);
-}
-
-// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
-FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
-{
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
-    return vreinterpretq_m128i_s32(s32x4);
-}
-
-// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
-// integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
-FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
-{
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
-    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
-FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
-{
-    return vreinterpretq_m128i_u32(
-        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
-}
-
-// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
-FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
-{
-    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
-    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
-FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
-{
-    return vreinterpretq_m128i_u64(
-        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
-}
-
-// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
-FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
-{
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
-    return vreinterpretq_m128i_u16(u16x8);
-}
-
-// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
-FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
-{
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
-    return vreinterpretq_m128i_u32(u32x4);
-}
-
-// Zero extend packed unsigned 8-bit integers in the low 8 byte sof a to packed
-// 64-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
-FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
-{
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
-    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Conditionally multiply the packed double-precision (64-bit) floating-point
-// elements in a and b using the high 4 bits in imm8, sum the four products, and
-// conditionally store the sum in dst using the low 4 bits of imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
-FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
-{
-    // Generate mask value from constant immediate bit value
-    const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
-    const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
-#if !SSE2NEON_PRECISE_DP
-    const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
-    const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
-#endif
-    // Conditional multiplication
-#if !SSE2NEON_PRECISE_DP
-    __m128d mul = _mm_mul_pd(a, b);
-    const __m128d mulMask =
-        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
-    __m128d tmp = _mm_and_pd(mul, mulMask);
-#else
-#if defined(__aarch64__)
-    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
-                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
-                             : 0;
-    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
-                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
-                             : 0;
-#else
-    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
-    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
-#endif
-    __m128d tmp = _mm_set_pd(d1, d0);
-#endif
-    // Sum the products
-#if defined(__aarch64__)
-    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
-#else
-    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
-#endif
-    // Conditionally store the sum
-    const __m128d sumMask =
-        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
-    __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
-    return res;
-}
-
-// Conditionally multiply the packed single-precision (32-bit) floating-point
-// elements in a and b using the high 4 bits in imm8, sum the four products,
-// and conditionally store the sum in dst using the low 4 bits of imm.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
-FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
-{
-#if defined(__aarch64__)
-    /* shortcuts */
-    if (imm == 0xFF) {
-        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
-    }
-    if (imm == 0x7F) {
-        float32x4_t m = _mm_mul_ps(a, b);
-        m[3] = 0;
-        return _mm_set1_ps(vaddvq_f32(m));
-    }
-#endif
-
-    float s = 0, c = 0;
-    float32x4_t f32a = vreinterpretq_f32_m128(a);
-    float32x4_t f32b = vreinterpretq_f32_m128(b);
-
-    /* To improve the accuracy of floating-point summation, Kahan algorithm
-     * is used for each operation.
-     */
-    if (imm & (1 << 4))
-        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
-    if (imm & (1 << 5))
-        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
-    if (imm & (1 << 6))
-        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
-    if (imm & (1 << 7))
-        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
-    s += c;
-
-    float32x4_t res = {
-        (imm & 0x1) ? s : 0,
-        (imm & 0x2) ? s : 0,
-        (imm & 0x4) ? s : 0,
-        (imm & 0x8) ? s : 0,
-    };
-    return vreinterpretq_m128_f32(res);
-}
-
-// Extract a 32-bit integer from a, selected with imm8, and store the result in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
-// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
-#define _mm_extract_epi32(a, imm) \
-    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
-
-// Extract a 64-bit integer from a, selected with imm8, and store the result in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
-// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
-#define _mm_extract_epi64(a, imm) \
-    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
-
-// Extract an 8-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
-// __constrange(0,16) int imm)
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
-#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
-
-// Extracts the selected single-precision (32-bit) floating-point from a.
-// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
-#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
-
-// Round the packed double-precision (64-bit) floating-point elements in a down
-// to an integer value, and store the results as packed double-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
-FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
-#else
-    double *f = (double *) &a;
-    return _mm_set_pd(floor(f[1]), floor(f[0]));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a down
-// to an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
-FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
-#else
-    float *f = (float *) &a;
-    return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
-#endif
-}
-
-// Round the lower double-precision (64-bit) floating-point element in b down to
-// an integer value, store the result as a double-precision floating-point
-// element in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
-FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_floor_pd(b));
-}
-
-// Round the lower single-precision (32-bit) floating-point element in b down to
-// an integer value, store the result as a single-precision floating-point
-// element in the lower element of dst, and copy the upper 3 packed elements
-// from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
-FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_floor_ps(b));
-}
-
-// Copy a to dst, and insert the 32-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
-// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
-//                                       __constrange(0,4) int imm)
-#define _mm_insert_epi32(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s32(                                     \
-            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
-    })
-
-// Copy a to dst, and insert the 64-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
-// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
-//                                       __constrange(0,2) int imm)
-#define _mm_insert_epi64(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s64(                                     \
-            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
-    })
-
-// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
-// location specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
-// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
-//                                      __constrange(0,16) int imm)
-#define _mm_insert_epi8(a, b, imm)                                 \
-    __extension__({                                                \
-        vreinterpretq_m128i_s8(                                    \
-            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
-    })
-
-// Copy a to tmp, then insert a single-precision (32-bit) floating-point
-// element from b into tmp using the control in imm8. Store tmp to dst using
-// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
-#define _mm_insert_ps(a, b, imm8)                                              \
-    __extension__({                                                            \
-        float32x4_t tmp1 =                                                     \
-            vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3),               \
-                           vreinterpretq_f32_m128(a), 0);                      \
-        float32x4_t tmp2 =                                                     \
-            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
-                           ((imm8 >> 4) & 0x3));                               \
-        const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,        \
-                                  ((imm8) & (1 << 1)) ? UINT32_MAX : 0,        \
-                                  ((imm8) & (1 << 2)) ? UINT32_MAX : 0,        \
-                                  ((imm8) & (1 << 3)) ? UINT32_MAX : 0};       \
-        uint32x4_t mask = vld1q_u32(data);                                     \
-        float32x4_t all_zeros = vdupq_n_f32(0);                                \
-                                                                               \
-        vreinterpretq_m128_f32(                                                \
-            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2)));         \
-    })
-
-// Compare packed signed 32-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
-FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
-FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
-FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
-FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
-}
-
-// Compare packed signed 32-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
-FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
-FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
-FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
-FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
-}
-
-// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
-// in a, store the minimum and index in dst, and zero the remaining bits in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
-FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
-{
-    __m128i dst;
-    uint16_t min, idx = 0;
-#if defined(__aarch64__)
-    // Find the minimum value
-    min = vminvq_u16(vreinterpretq_u16_m128i(a));
-
-    // Get the index of the minimum value
-    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
-    uint16x8_t minv = vdupq_n_u16(min);
-    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
-    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
-#else
-    // Find the minimum value
-    __m64 tmp;
-    tmp = vreinterpret_m64_u16(
-        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
-                 vget_high_u16(vreinterpretq_u16_m128i(a))));
-    tmp = vreinterpret_m64_u16(
-        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
-    tmp = vreinterpret_m64_u16(
-        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
-    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
-    // Get the index of the minimum value
-    int i;
-    for (i = 0; i < 8; i++) {
-        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
-            idx = (uint16_t) i;
-            break;
-        }
-        a = _mm_srli_si128(a, 2);
-    }
-#endif
-    // Generate result
-    dst = _mm_setzero_si128();
-    dst = vreinterpretq_m128i_u16(
-        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
-    dst = vreinterpretq_m128i_u16(
-        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
-    return dst;
-}
-
-// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
-// 8-bit integers in a compared to those in b, and store the 16-bit results in
-// dst. Eight SADs are performed using one quadruplet from b and eight
-// quadruplets from a. One quadruplet is selected from b starting at on the
-// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
-// integers selected from a starting at the offset specified in imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
-FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
-{
-    uint8x16_t _a, _b;
-
-    switch (imm & 0x4) {
-    case 0:
-        // do nothing
-        _a = vreinterpretq_u8_m128i(a);
-        break;
-    case 4:
-        _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
-                                            vreinterpretq_u32_m128i(a), 1));
-        break;
-    default:
-#if defined(__GNUC__) || defined(__clang__)
-        __builtin_unreachable();
-#endif
-        break;
-    }
-
-    switch (imm & 0x3) {
-    case 0:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
-        break;
-    case 1:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
-        break;
-    case 2:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
-        break;
-    case 3:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
-        break;
-    default:
-#if defined(__GNUC__) || defined(__clang__)
-        __builtin_unreachable();
-#endif
-        break;
-    }
-
-    int16x8_t c04, c15, c26, c37;
-    uint8x8_t low_b = vget_low_u8(_b);
-    c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
-    uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
-    c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
-    uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
-    c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
-    uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
-    c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
-#if defined(__aarch64__)
-    // |0|4|2|6|
-    c04 = vpaddq_s16(c04, c26);
-    // |1|5|3|7|
-    c15 = vpaddq_s16(c15, c37);
-
-    int32x4_t trn1_c =
-        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
-    int32x4_t trn2_c =
-        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
-    return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
-                                              vreinterpretq_s16_s32(trn2_c)));
-#else
-    int16x4_t c01, c23, c45, c67;
-    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
-    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
-    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
-    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
-
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
-#endif
-}
-
-// Multiply the low signed 32-bit integers from each packed 64-bit element in
-// a and b, and store the signed 64-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
-FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
-{
-    // vmull_s32 upcasts instead of masking, so we downcast.
-    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
-    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
-}
-
-// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
-// integers, and store the low 32 bits of the intermediate integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
-FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-// using unsigned saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
-FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
-                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
-}
-
-// Round the packed double-precision (64-bit) floating-point elements in a using
-// the rounding parameter, and store the results as packed double-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
-FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
-{
-#if defined(__aarch64__)
-    switch (rounding) {
-    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
-    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
-        return _mm_floor_pd(a);
-    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
-        return _mm_ceil_pd(a);
-    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
-    default:  //_MM_FROUND_CUR_DIRECTION
-        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
-    }
-#else
-    double *v_double = (double *) &a;
-
-    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
-        (rounding == _MM_FROUND_CUR_DIRECTION &&
-         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
-        double res[2], tmp;
-        for (int i = 0; i < 2; i++) {
-            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
-            double roundDown = floor(tmp);  // Round down value
-            double roundUp = ceil(tmp);     // Round up value
-            double diffDown = tmp - roundDown;
-            double diffUp = roundUp - tmp;
-            if (diffDown < diffUp) {
-                /* If it's closer to the round down value, then use it */
-                res[i] = roundDown;
-            } else if (diffDown > diffUp) {
-                /* If it's closer to the round up value, then use it */
-                res[i] = roundUp;
-            } else {
-                /* If it's equidistant between round up and round down value,
-                 * pick the one which is an even number */
-                double half = roundDown / 2;
-                if (half != floor(half)) {
-                    /* If the round down value is odd, return the round up value
-                     */
-                    res[i] = roundUp;
-                } else {
-                    /* If the round up value is odd, return the round down value
-                     */
-                    res[i] = roundDown;
-                }
-            }
-            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
-        }
-        return _mm_set_pd(res[1], res[0]);
-    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
-        return _mm_floor_pd(a);
-    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
-        return _mm_ceil_pd(a);
-    }
-    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
-                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a using
-// the rounding parameter, and store the results as packed single-precision
-// floating-point elements in dst.
-// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
-FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    switch (rounding) {
-    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
-    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
-        return _mm_floor_ps(a);
-    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
-        return _mm_ceil_ps(a);
-    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
-    default:  //_MM_FROUND_CUR_DIRECTION
-        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
-    }
-#else
-    float *v_float = (float *) &a;
-
-    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
-        (rounding == _MM_FROUND_CUR_DIRECTION &&
-         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
-        uint32x4_t signmask = vdupq_n_u32(0x80000000);
-        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
-                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
-        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
-            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
-        int32x4_t r_trunc = vcvtq_s32_f32(
-            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
-        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
-            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
-        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
-                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
-        float32x4_t delta = vsubq_f32(
-            vreinterpretq_f32_m128(a),
-            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
-        uint32x4_t is_delta_half =
-            vceqq_f32(delta, half); /* delta == +/- 0.5 */
-        return vreinterpretq_m128_f32(
-            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
-    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
-        return _mm_floor_ps(a);
-    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
-        return _mm_ceil_ps(a);
-    }
-    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
-                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
-                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
-                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
-#endif
-}
-
-// Round the lower double-precision (64-bit) floating-point element in b using
-// the rounding parameter, store the result as a double-precision floating-point
-// element in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
-FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
-{
-    return _mm_move_sd(a, _mm_round_pd(b, rounding));
-}
-
-// Round the lower single-precision (32-bit) floating-point element in b using
-// the rounding parameter, store the result as a single-precision floating-point
-// element in the lower element of dst, and copy the upper 3 packed elements
-// from a to the upper elements of dst. Rounding is done according to the
-// rounding[3:0] parameter, which can be one of:
-//     (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
-//     suppress exceptions
-//     (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and
-//     suppress exceptions
-//     (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress
-//     exceptions
-//     (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress
-//     exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
-//     _MM_SET_ROUNDING_MODE
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
-FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
-{
-    return _mm_move_ss(a, _mm_round_ps(b, rounding));
-}
-
-// Load 128-bits of integer data from memory into dst using a non-temporal
-// memory hint. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
-FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    return __builtin_nontemporal_load(p);
-#else
-    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
-#endif
-}
-
-// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
-// all 1's, and return 1 if the result is zero, otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
-FORCE_INLINE int _mm_test_all_ones(__m128i a)
-{
-    return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
-           ~(uint64_t) 0;
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and
-// mask, and return 1 if the result is zero, otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
-FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
-{
-    int64x2_t a_and_mask =
-        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
-    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and
-// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
-// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
-// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
-// otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
-FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
-{
-    uint64x2_t zf =
-        vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
-    uint64x2_t cf =
-        vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
-    uint64x2_t result = vandq_u64(zf, cf);
-    return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return the CF value.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
-FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
-{
-    int64x2_t s64 =
-        vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
-// otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
-#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return the ZF value.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
-FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
-{
-    int64x2_t s64 =
-        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
-}
-
-/* SSE4.2 */
-
-const static uint16_t _sse2neon_cmpestr_mask16b[8] ALIGN_STRUCT(16) = {
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-};
-const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = {
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-};
-
-/* specify the source data format */
-#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
-#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
-#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
-#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
-
-/* specify the comparison operation */
-#define _SIDD_CMP_EQUAL_ANY 0x00     /* compare equal any: strchr */
-#define _SIDD_CMP_RANGES 0x04        /* compare ranges */
-#define _SIDD_CMP_EQUAL_EACH 0x08    /* compare equal each: strcmp */
-#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
-
-/* specify the polarity */
-#define _SIDD_POSITIVE_POLARITY 0x00
-#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
-#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
-#define _SIDD_MASKED_NEGATIVE_POLARITY \
-    0x30 /* negate results only before end of string */
-
-/* specify the output selection in _mm_cmpXstri */
-#define _SIDD_LEAST_SIGNIFICANT 0x00
-#define _SIDD_MOST_SIGNIFICANT 0x40
-
-/* specify the output selection in _mm_cmpXstrm */
-#define _SIDD_BIT_MASK 0x00
-#define _SIDD_UNIT_MASK 0x40
-
-/* Pattern Matching for C macros.
- * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
- */
-
-/* catenate */
-#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
-#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
-
-#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
-/* run the 2nd parameter */
-#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
-/* run the 1st parameter */
-#define SSE2NEON_IIF_1(t, ...) t
-
-#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
-#define SSE2NEON_COMPL_0 1
-#define SSE2NEON_COMPL_1 0
-
-#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
-#define SSE2NEON_DEC_1 0
-#define SSE2NEON_DEC_2 1
-#define SSE2NEON_DEC_3 2
-#define SSE2NEON_DEC_4 3
-#define SSE2NEON_DEC_5 4
-#define SSE2NEON_DEC_6 5
-#define SSE2NEON_DEC_7 6
-#define SSE2NEON_DEC_8 7
-#define SSE2NEON_DEC_9 8
-#define SSE2NEON_DEC_10 9
-#define SSE2NEON_DEC_11 10
-#define SSE2NEON_DEC_12 11
-#define SSE2NEON_DEC_13 12
-#define SSE2NEON_DEC_14 13
-#define SSE2NEON_DEC_15 14
-#define SSE2NEON_DEC_16 15
-
-/* detection */
-#define SSE2NEON_CHECK_N(x, n, ...) n
-#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
-#define SSE2NEON_PROBE(x) x, 1,
-
-#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
-#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
-
-#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
-#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
-
-#define SSE2NEON_EAT(...)
-#define SSE2NEON_EXPAND(...) __VA_ARGS__
-#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
-
-/* recursion */
-/* deferred expression */
-#define SSE2NEON_EMPTY()
-#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
-#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
-#define SSE2NEON_EXPAND(...) __VA_ARGS__
-
-#define SSE2NEON_EVAL(...) \
-    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
-#define SSE2NEON_EVAL1(...) \
-    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
-#define SSE2NEON_EVAL2(...) \
-    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
-#define SSE2NEON_EVAL3(...) __VA_ARGS__
-
-#define SSE2NEON_REPEAT(count, macro, ...)                         \
-    SSE2NEON_WHEN(count)                                           \
-    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()(                \
-        SSE2NEON_DEC(count), macro,                                \
-        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
-                                              __VA_ARGS__))
-#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
-
-#define SSE2NEON_SIZE_OF_byte 8
-#define SSE2NEON_NUMBER_OF_LANES_byte 16
-#define SSE2NEON_SIZE_OF_word 16
-#define SSE2NEON_NUMBER_OF_LANES_word 8
-
-#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type)                         \
-    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type(                          \
-        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
-        vreinterpretq_##type##_m128i(a)));
-
-#define SSE2NEON_FILL_LANE(i, type) \
-    vec_b[i] =                      \
-        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
-
-#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size,        \
-                       number_of_lanes, byte_or_word)                         \
-    do {                                                                      \
-        SSE2NEON_CAT(                                                         \
-            data_type_prefix,                                                 \
-            SSE2NEON_CAT(size,                                                \
-                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
-        vec_b[number_of_lanes];                                               \
-        __m128i mask = SSE2NEON_IIF(byte_or_word)(                            \
-            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)),                       \
-            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff)));                    \
-        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE,    \
-                                      SSE2NEON_CAT(type_prefix, size)))       \
-        for (int i = 0; i < number_of_lanes; i++) {                           \
-            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u,                      \
-                                  size)(SSE2NEON_CAT(vbslq_u, size)(          \
-                SSE2NEON_CAT(vreinterpretq_u,                                 \
-                             SSE2NEON_CAT(size, _m128i))(mask),               \
-                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))(        \
-                    vec_b[i],                                                 \
-                    SSE2NEON_CAT(                                             \
-                        vreinterpretq_,                                       \
-                        SSE2NEON_CAT(type_prefix,                             \
-                                     SSE2NEON_CAT(size, _m128i(a))))),        \
-                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))(        \
-                    vec_b[i],                                                 \
-                    SSE2NEON_CAT(                                             \
-                        vreinterpretq_,                                       \
-                        SSE2NEON_CAT(type_prefix,                             \
-                                     SSE2NEON_CAT(size, _m128i(a)))))));      \
-        }                                                                     \
-    } while (0)
-
-#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes)                         \
-    do {                                                                     \
-        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes,                       \
-                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
-                                      SSE2NEON_CAT(u, size)))                \
-    } while (0)
-
-#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
-    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
-                                                int lb)                       \
-    {                                                                         \
-        __m128i mtx[16];                                                      \
-        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
-                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
-        return SSE2NEON_CAT(                                                  \
-            _sse2neon_aggregate_equal_any_,                                   \
-            SSE2NEON_CAT(                                                     \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
-                                             type))))(la, lb, mtx);           \
-    }
-
-#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
-    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
-                                                 int lb)                       \
-    {                                                                          \
-        __m128i mtx[16];                                                       \
-        PCMPSTR_RANGES(                                                        \
-            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
-            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
-        return SSE2NEON_CAT(                                                   \
-            _sse2neon_aggregate_ranges_,                                       \
-            SSE2NEON_CAT(                                                      \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
-                                             type))))(la, lb, mtx);            \
-    }
-
-#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                  \
-    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,         \
-                                                    __m128i b, int lb)         \
-    {                                                                          \
-        __m128i mtx[16];                                                       \
-        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),           \
-                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));             \
-        return SSE2NEON_CAT(                                                   \
-            _sse2neon_aggregate_equal_ordered_,                                \
-            SSE2NEON_CAT(                                                      \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
-                SSE2NEON_CAT(x,                                                \
-                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
-            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);       \
-    }
-
-static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
-    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u8(
-            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u8(
-            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
-        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
-        res |= (tmp << j);
-    }
-    return res;
-}
-
-static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint16x8_t vec =
-        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u16(
-            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u16(
-            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
-        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
-        res |= (tmp << j);
-    }
-    return res;
-}
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
-    prefix##IMPL(byte) \
-    prefix##IMPL(word)
-/* clang-format on */
-
-SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
-
-static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint16x8_t vec =
-        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u16(
-            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u16(
-            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
-        __m128i tmp = vreinterpretq_m128i_u32(
-            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
-        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
-                                       vreinterpretq_u32_m128i(tmp));
-#if defined(__aarch64__)
-        int t = vaddvq_u32(vec_res) ? 1 : 0;
-#else
-        uint64x2_t sumh = vpaddlq_u32(vec_res);
-        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
-#endif
-        res |= (t << j);
-    }
-    return res;
-}
-
-static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
-    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u8(
-            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u8(
-            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
-        __m128i tmp = vreinterpretq_m128i_u16(
-            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
-        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
-                                       vreinterpretq_u16_m128i(tmp));
-        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
-        res |= (t << j);
-    }
-    return res;
-}
-
-#define SSE2NEON_CMP_RANGES_IS_BYTE 1
-#define SSE2NEON_CMP_RANGES_IS_WORD 0
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_CMP_RANGES(prefix)             \
-    prefix##IMPL(byte, uint, u, prefix##IS_BYTE)         \
-    prefix##IMPL(byte, int, s, prefix##IS_BYTE)          \
-    prefix##IMPL(word, uint, u, prefix##IS_WORD)         \
-    prefix##IMPL(word, int, s, prefix##IS_WORD)
-/* clang-format on */
-
-SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
-
-#undef SSE2NEON_CMP_RANGES_IS_BYTE
-#undef SSE2NEON_CMP_RANGES_IS_WORD
-
-static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
-{
-    uint8x16_t mtx =
-        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x10000 - (1 << la);
-    int tb = 0x10000 - (1 << lb);
-    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
-    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
-    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
-    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
-    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
-    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
-    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
-    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
-
-    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
-    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
-    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
-    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
-    res_lo = vand_u8(res_lo, vec_mask);
-    res_hi = vand_u8(res_hi, vec_mask);
-
-    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
-    return res;
-}
-
-static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
-{
-    uint16x8_t mtx =
-        vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x100 - (1 << la);
-    int tb = 0x100 - (1 << lb);
-    uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
-    uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
-    uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
-    uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
-    mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
-    mtx = vbslq_u16(vec1, tmp, mtx);
-    mtx = vandq_u16(mtx, vec_mask);
-    return _sse2neon_vaddvq_u16(mtx);
-}
-
-#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
-#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
-
-#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
-    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
-        int bound, int la, int lb, __m128i mtx[16])                            \
-    {                                                                          \
-        int res = 0;                                                           \
-        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
-        uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
-            vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
-            vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
-        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
-            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
-                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
-            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
-        uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
-        uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
-        for (int j = 0; j < lb; j++) {                                         \
-            mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size(                \
-                vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j])));   \
-        }                                                                      \
-        for (int j = lb; j < bound; j++) {                                     \
-            mtx[j] = vreinterpretq_m128i_u##size(                              \
-                vbslq_u##size(vec1, vec_minusone, vec_zero));                  \
-        }                                                                      \
-        unsigned SSE2NEON_IIF(data_type)(char, short) *ptr =                   \
-            (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx;             \
-        for (int i = 0; i < bound; i++) {                                      \
-            int val = 1;                                                       \
-            for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
-                val &= ptr[k * bound + j];                                     \
-            res += val << i;                                                   \
-        }                                                                      \
-        return res;                                                            \
-    }
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
-    prefix##IMPL(8, 16, prefix##IS_UBYTE)               \
-    prefix##IMPL(16, 8, prefix##IS_UWORD)
-/* clang-format on */
-
-SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
-
-#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
-#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
-    prefix##IMPL(byte)                              \
-    prefix##IMPL(word)
-/* clang-format on */
-
-SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
-
-#define SSE2NEON_CMPESTR_LIST                          \
-    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
-    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any)         \
-    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
-    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any)         \
-    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges)              \
-    _(CMP_UWORD_RANGES, cmp_uword_ranges)              \
-    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges)              \
-    _(CMP_SWORD_RANGES, cmp_sword_ranges)              \
-    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
-    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each)       \
-    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
-    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each)       \
-    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
-    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
-    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
-    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
-
-enum {
-#define _(name, func_suffix) name,
-    SSE2NEON_CMPESTR_LIST
-#undef _
-};
-typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
-static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
-#define _(name, func_suffix) _sse2neon_##func_suffix,
-    SSE2NEON_CMPESTR_LIST
-#undef _
-};
-
-FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
-{
-    switch (imm8 & 0x30) {
-    case _SIDD_NEGATIVE_POLARITY:
-        res ^= 0xffffffff;
-        break;
-    case _SIDD_MASKED_NEGATIVE_POLARITY:
-        res ^= (1 << lb) - 1;
-        break;
-    default:
-        break;
-    }
-
-    return res & ((bound == 8) ? 0xFF : 0xFFFF);
-}
-
-FORCE_INLINE int _sse2neon_clz(unsigned int x)
-{
-#if _MSC_VER
-    DWORD cnt = 0;
-    if (_BitScanForward(&cnt, x))
-        return cnt;
-    return 32;
-#else
-    return x != 0 ? __builtin_clz(x) : 32;
-#endif
-}
-
-FORCE_INLINE int _sse2neon_ctz(unsigned int x)
-{
-#if _MSC_VER
-    DWORD cnt = 0;
-    if (_BitScanReverse(&cnt, x))
-        return 31 - cnt;
-    return 32;
-#else
-    return x != 0 ? __builtin_ctz(x) : 32;
-#endif
-}
-
-FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
-{
-#if _MSC_VER
-    unsigned long cnt;
-#ifdef defined(SSE2NEON_HAS_BITSCAN64)
-    (defined(_M_AMD64) || defined(__x86_64__))
-        if((_BitScanForward64(&cnt, x))
-            return (int)(cnt);
-#else
-    if (_BitScanForward(&cnt, (unsigned long) (x)))
-        return (int) cnt;
-    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
-        return (int) (cnt + 32);
-#endif
-    return 64;
-#else
-    return x != 0 ? __builtin_ctzll(x) : 64;
-#endif
-}
-
-#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
-
-#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
-    const int var = (imm & 0x01) ? 8 : 16
-
-#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
-    int tmp1 = la ^ (la >> 31);                  \
-    la = tmp1 - (la >> 31);                      \
-    int tmp2 = lb ^ (lb >> 31);                  \
-    lb = tmp2 - (lb >> 31);                      \
-    la = SSE2NEON_MIN(la, bound);                \
-    lb = SSE2NEON_MIN(lb, bound)
-
-// Compare all pairs of character in string a and b,
-// then aggregate the result.
-// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
-// length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
-// string a and b.
-#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
-    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
-    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
-    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
-
-#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
-    return (r2 == 0) ? bound                                     \
-                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
-                                      : _sse2neon_ctz(r2))
-
-#define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
-    __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
-    if (imm8 & 0x40) {                                                         \
-        if (bound == 8) {                                                      \
-            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
-                                       vld1q_u16(_sse2neon_cmpestr_mask16b));  \
-            dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
-                tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
-        } else {                                                               \
-            uint8x16_t vec_r2 =                                                \
-                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
-            uint8x16_t tmp =                                                   \
-                vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
-            dst = vreinterpretq_m128i_u8(                                      \
-                vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst)));   \
-        }                                                                      \
-    } else {                                                                   \
-        if (bound == 16) {                                                     \
-            dst = vreinterpretq_m128i_u16(                                     \
-                vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
-        } else {                                                               \
-            dst = vreinterpretq_m128i_u8(                                      \
-                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
-        }                                                                      \
-    }                                                                          \
-    return dst
-
-// Compare packed strings in a and b with lengths la and lb using the control
-// in imm8, and returns 1 if b did not contain a null character and the
-// resulting mask was zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
-FORCE_INLINE int _mm_cmpestra(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    int lb_cpy = lb;
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    return !r2 & (lb_cpy > bound);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
-FORCE_INLINE int _mm_cmpestrc(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    return r2 != 0;
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control
-// in imm8, and store the generated index in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
-FORCE_INLINE int _mm_cmpestri(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control
-// in imm8, and store the generated mask in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
-FORCE_INLINE __m128i
-_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns bit 0 of the resulting bit mask.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
-FORCE_INLINE int _mm_cmpestro(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    return r2 & 1;
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns 1 if any character in a was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
-FORCE_INLINE int _mm_cmpestrs(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    return la <= (bound - 1);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns 1 if any character in b was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
-FORCE_INLINE int _mm_cmpestrz(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    return lb <= (bound - 1);
-}
-
-#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
-    do {                                                                 \
-        if (imm8 & 0x01) {                                               \
-            uint16x8_t equal_mask_##str =                                \
-                vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
-            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
-            uint64_t matches_##str =                                     \
-                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
-            len = _sse2neon_ctzll(matches_##str) >> 3;                   \
-        } else {                                                         \
-            uint16x8_t equal_mask_##str = vreinterpretq_u16_u8(          \
-                vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0)));   \
-            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
-            uint64_t matches_##str =                                     \
-                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
-            len = _sse2neon_ctzll(matches_##str) >> 2;                   \
-        }                                                                \
-    } while (0)
-
-#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
-    int la, lb;                                  \
-    do {                                         \
-        SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);   \
-        SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);   \
-    } while (0)
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if b did not contain a null character and the resulting
-// mask was zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
-FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    return !r2 & (lb >= bound);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
-FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    return r2 != 0;
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and store the generated index in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
-FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and store the generated mask in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
-FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns bit 0 of the resulting bit mask.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
-FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    return r2 & 1;
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if any character in a was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
-FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    int la;
-    SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
-    return la <= (bound - 1);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if any character in b was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
-FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    int lb;
-    SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
-    return lb <= (bound - 1);
-}
-
-// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
-// in b for greater than.
-FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_u64(
-        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
-    return vreinterpretq_m128i_s64(vshrq_n_s64(
-        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
-        63));
-#endif
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 16-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
-FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
-    crc = __crc32ch(crc, v);
-#else
-    crc = _mm_crc32_u8(crc, v & 0xff);
-    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
-#endif
-    return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 32-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
-FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
-    crc = __crc32cw(crc, v);
-#else
-    crc = _mm_crc32_u16(crc, v & 0xffff);
-    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
-#endif
-    return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 64-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
-FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#else
-    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
-    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
-#endif
-    return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 8-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
-FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
-    crc = __crc32cb(crc, v);
-#else
-    crc ^= v;
-    for (int bit = 0; bit < 8; bit++) {
-        if (crc & 1)
-            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
-        else
-            crc = (crc >> 1);
-    }
-#endif
-    return crc;
-}
-
-/* AES */
-
-#if !defined(__ARM_FEATURE_CRYPTO)
-/* clang-format off */
-#define SSE2NEON_AES_SBOX(w)                                           \
-    {                                                                  \
-        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
-        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
-        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
-        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
-        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
-        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
-        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
-        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
-        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
-        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
-        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
-        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
-        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
-        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
-        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
-        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
-        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
-        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
-        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
-        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
-        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
-        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
-        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
-        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
-        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
-        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
-        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
-        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
-        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
-        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
-        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
-        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
-        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
-        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
-        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
-        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
-        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
-    }
-#define SSE2NEON_AES_RSBOX(w)                                          \
-    {                                                                  \
-        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
-        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
-        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
-        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
-        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
-        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
-        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
-        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
-        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
-        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
-        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
-        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
-        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
-        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
-        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
-        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
-        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
-        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
-        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
-        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
-        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
-        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
-        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
-        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
-        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
-        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
-        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
-        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
-        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
-        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
-        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
-        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
-        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
-        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
-        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
-        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
-        w(0x55), w(0x21), w(0x0c), w(0x7d)                             \
-    }
-/* clang-format on */
-
-/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
-#define SSE2NEON_AES_H0(x) (x)
-static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
-static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
-#undef SSE2NEON_AES_H0
-
-/* x_time function and matrix multiply function */
-#if !defined(__aarch64__)
-#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
-#define SSE2NEON_MULTIPLY(x, y)                                  \
-    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
-     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
-     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
-     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
-#endif
-
-// In the absence of crypto extensions, implement aesenc using regular NEON
-// intrinsics instead. See:
-// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
-// for more information.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t shift_rows[] = {
-        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
-        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
-    };
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    /* shift rows */
-    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
-    /* sub bytes */
-    // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
-    // look up each of the table. After each lookup, we load the next table
-    // which locates at the next 64-bytes. In the meantime, the index in the
-    // table would be smaller than it was, so the index parameters of
-    // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-    // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
-
-    /* mix columns */
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-
-    /* add round key */
-    return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
-#else /* ARMv7-A implementation for a table-based AES */
-#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
-    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
-     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
-// muliplying 'x' by 2 in GF(2^8)
-#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
-// muliplying 'x' by 3 in GF(2^8)
-#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
-#define SSE2NEON_AES_U0(p) \
-    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
-#define SSE2NEON_AES_U1(p) \
-    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
-#define SSE2NEON_AES_U2(p) \
-    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
-#define SSE2NEON_AES_U3(p) \
-    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
-
-    // this generates a table containing every possible permutation of
-    // shift_rows() and sub_bytes() with mix_columns().
-    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
-    };
-#undef SSE2NEON_AES_B2W
-#undef SSE2NEON_AES_F2
-#undef SSE2NEON_AES_F3
-#undef SSE2NEON_AES_U0
-#undef SSE2NEON_AES_U1
-#undef SSE2NEON_AES_U2
-#undef SSE2NEON_AES_U3
-
-    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
-    uint32_t x1 =
-        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
-    uint32_t x2 =
-        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
-    uint32_t x3 =
-        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]
-
-    // finish the modulo addition step in mix_columns()
-    __m128i out = _mm_set_epi32(
-        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
-         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
-        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
-         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
-        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
-         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
-        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
-         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
-
-    return _mm_xor_si128(out, RoundKey);
-#endif
-}
-
-// Perform one round of an AES decryption flow on data (state) in a using the
-// round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
-FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t inv_shift_rows[] = {
-        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-    };
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    // inverse shift rows
-    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
-
-    // inverse sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-
-    // inverse mix columns
-    // muliplying 'v' by 4 in GF(2^8)
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
-    v ^= w;
-    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
-                                 0x1b);  // muliplying 'v' by 2 in GF(2^8)
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-
-    // add round key
-    return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
-#else /* ARMv7-A NEON implementation */
-    /* FIXME: optimized for NEON */
-    uint8_t i, e, f, g, h, v[4][4];
-    uint8_t *_a = (uint8_t *) &a;
-    for (i = 0; i < 16; ++i) {
-        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
-    }
-
-    // inverse mix columns
-    for (i = 0; i < 4; ++i) {
-        e = v[i][0];
-        f = v[i][1];
-        g = v[i][2];
-        h = v[i][3];
-
-        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
-    }
-
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
-#endif
-}
-
-// Perform the last round of an AES encryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
-FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t shift_rows[] = {
-        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
-        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    // shift rows
-    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
-    // sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
-
-    // add round key
-    return vreinterpretq_m128i_u8(v) ^ RoundKey;
-
-#else /* ARMv7-A implementation */
-    uint8_t v[16] = {
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
-    };
-
-    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
-#endif
-}
-
-// Perform the last round of an AES decryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t inv_shift_rows[] = {
-        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    // inverse shift rows
-    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
-
-    // inverse sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-
-    // add round key
-    return vreinterpretq_m128i_u8(v) ^ RoundKey;
-
-#else /* ARMv7-A NEON implementation */
-    /* FIXME: optimized for NEON */
-    uint8_t v[4][4];
-    uint8_t *_a = (uint8_t *) &a;
-    for (int i = 0; i < 16; ++i) {
-        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
-    }
-
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
-#endif
-}
-
-// Perform the InvMixColumns transformation on a and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-{
-#if defined(__aarch64__)
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-    uint8x16_t v = vreinterpretq_u8_m128i(a);
-    uint8x16_t w;
-
-    // multiplying 'v' by 4 in GF(2^8)
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
-    v ^= w;
-    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-
-    // multiplying 'v' by 2 in GF(2^8)
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-    return vreinterpretq_m128i_u8(w);
-
-#else /* ARMv7-A NEON implementation */
-    uint8_t i, e, f, g, h, v[4][4];
-    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
-    for (i = 0; i < 4; ++i) {
-        e = v[i][0];
-        f = v[i][1];
-        g = v[i][2];
-        h = v[i][3];
-
-        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
-    }
-
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
-#endif
-}
-
-// Assist in expanding the AES cipher key by computing steps towards generating
-// a round key for encryption cipher using data from a and an 8-bit round
-// constant specified in imm8, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
-//
-// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
-// This instruction generates a round key for AES encryption. See
-// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
-// for details.
-FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
-{
-#if defined(__aarch64__)
-    uint8x16_t _a = vreinterpretq_u8_m128i(a);
-    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
-
-    uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
-    uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
-    uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
-
-    return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
-
-#else /* ARMv7-A NEON implementation */
-    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
-    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
-    for (int i = 0; i < 4; ++i) {
-        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
-        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
-    }
-    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
-                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
-#endif
-}
-#undef SSE2NEON_AES_SBOX
-#undef SSE2NEON_AES_RSBOX
-
-#if defined(__aarch64__)
-#undef SSE2NEON_XT
-#undef SSE2NEON_MULTIPLY
-#endif
-
-#else /* __ARM_FEATURE_CRYPTO */
-// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
-// AESMC and then manually applying the real key as an xor operation. This
-// unfortunately means an additional xor op; the compiler should be able to
-// optimize this away for repeated calls however. See
-// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
-// for more details.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
-        vreinterpretq_u8_m128i(b));
-}
-
-// Perform one round of an AES decryption flow on data (state) in a using the
-// round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
-FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
-{
-    return vreinterpretq_m128i_u8(veorq_u8(
-        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
-        vreinterpretq_u8_m128i(RoundKey)));
-}
-
-// Perform the last round of an AES encryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
-FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-{
-    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
-                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
-                         RoundKey);
-}
-
-// Perform the last round of an AES decryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-{
-    return vreinterpretq_m128i_u8(
-        vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^
-        vreinterpretq_u8_m128i(RoundKey));
-}
-
-// Perform the InvMixColumns transformation on a and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-{
-    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
-}
-
-// Assist in expanding the AES cipher key by computing steps towards generating
-// a round key for encryption cipher using data from a and an 8-bit round
-// constant specified in imm8, and store the result in dst."
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
-FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
-{
-    // AESE does ShiftRows and SubBytes on A
-    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
-
-    uint8x16_t dest = {
-        // Undo ShiftRows step from AESE and extract X1 and X3
-        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
-        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
-        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
-        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
-    };
-    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
-    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
-}
-#endif
-
-/* Others */
-
-// Perform a carry-less multiplication of two 64-bit integers, selected from a
-// and b according to imm8, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
-FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
-{
-    uint64x2_t a = vreinterpretq_u64_m128i(_a);
-    uint64x2_t b = vreinterpretq_u64_m128i(_b);
-    switch (imm & 0x11) {
-    case 0x00:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
-    case 0x01:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
-    case 0x10:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
-    case 0x11:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
-    default:
-        abort();
-    }
-}
-
-FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
-}
-
-// Count the number of bits set to 1 in unsigned 32-bit integer a, and
-// return that count in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
-FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcount)
-    return __builtin_popcount(a);
-#else
-    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
-#endif
-#else
-    uint32_t count = 0;
-    uint8x8_t input_val, count8x8_val;
-    uint16x4_t count16x4_val;
-    uint32x2_t count32x2_val;
-
-    input_val = vld1_u8((uint8_t *) &a);
-    count8x8_val = vcnt_u8(input_val);
-    count16x4_val = vpaddl_u8(count8x8_val);
-    count32x2_val = vpaddl_u16(count16x4_val);
-
-    vst1_u32(&count, count32x2_val);
-    return count;
-#endif
-}
-
-// Count the number of bits set to 1 in unsigned 64-bit integer a, and
-// return that count in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
-FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcountll)
-    return __builtin_popcountll(a);
-#else
-    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
-#endif
-#else
-    uint64_t count = 0;
-    uint8x8_t input_val, count8x8_val;
-    uint16x4_t count16x4_val;
-    uint32x2_t count32x2_val;
-    uint64x1_t count64x1_val;
-
-    input_val = vld1_u8((uint8_t *) &a);
-    count8x8_val = vcnt_u8(input_val);
-    count16x4_val = vpaddl_u8(count8x8_val);
-    count32x2_val = vpaddl_u16(count16x4_val);
-    count64x1_val = vpaddl_u32(count32x2_val);
-    vst1_u64(&count, count64x1_val);
-    return count;
-#endif
-}
-
-FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
-{
-    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
-    // regardless of the value of the FZ bit.
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
-}
-
-// Return the current 64-bit value of the processor's time-stamp counter.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
-FORCE_INLINE uint64_t _rdtsc(void)
-{
-#if defined(__aarch64__)
-    uint64_t val;
-
-    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
-     * system counter is at least 56 bits wide; from Armv8.6, the counter
-     * must be 64 bits wide.  So the system counter could be less than 64
-     * bits wide and it is attributed with the flag 'cap_user_time_short'
-     * is true.
-     */
-    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
-
-    return val;
-#else
-    uint32_t pmccntr, pmuseren, pmcntenset;
-    // Read the user mode Performance Monitoring Unit (PMU)
-    // User Enable Register (PMUSERENR) access permissions.
-    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
-    if (pmuseren & 1) {  // Allows reading PMUSERENR for user mode code.
-        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
-        if (pmcntenset & 0x80000000UL) {  // Is it counting?
-            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
-            // The counter is set up to count every 64th cycle
-            return (uint64_t) (pmccntr) << 6;
-        }
-    }
-
-    // Fallback to syscall as we can't enable PMUSERENR in user mode.
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
-    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
-#endif
-}
-
-#if defined(__GNUC__) || defined(__clang__)
-#pragma pop_macro("ALIGN_STRUCT")
-#pragma pop_macro("FORCE_INLINE")
-#endif
-
-#if defined(__GNUC__) && !defined(__clang__)
-#pragma GCC pop_options
-#endif
-
-#endif
\ No newline at end of file
diff --git a/share/cmake/modules/Findsse2neon.cmake b/share/cmake/modules/Findsse2neon.cmake
new file mode 100644
index 0000000000..7dea6740cb
--- /dev/null
+++ b/share/cmake/modules/Findsse2neon.cmake
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Locate sse2neon (header-only version)
+#
+# Variables defined by this module:
+#   sse2neon_FOUND          - Indicate whether the library was found or not
+#   sse2neon_INCLUDE_DIR    - Location of the header files
+#
+# Global targets defined by this module:
+#   sse2neon
+###############################################################################
+### Try to find package ###
+
+if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
+    # Find include directory
+    find_path(sse2neon_INCLUDE_DIR
+        NAMES
+            sse2neon.h
+        HINTS
+            ${sse2neon_ROOT}
+        PATH_SUFFIXES
+            include
+            sse2neon/include
+    )
+
+    # Override REQUIRED if package can be installed
+    if(OCIO_INSTALL_EXT_PACKAGES STREQUAL MISSING)
+        set(sse2neon_FIND_REQUIRED FALSE)
+    endif()
+
+    include(FindPackageHandleStandardArgs)
+    find_package_handle_standard_args(sse2neon
+        REQUIRED_VARS 
+            sse2neon_INCLUDE_DIR 
+    )
+    set(sse2neon_FOUND ${sse2neon_FOUND})
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(sse2neon_FOUND AND NOT TARGET sse2neon)
+    # INTERFACE type since we know that this is a header-only library.
+    add_library(sse2neon INTERFACE IMPORTED GLOBAL)
+    set(_sse2neon_TARGET_CREATE TRUE)
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(_sse2neon_TARGET_CREATE)
+    set_target_properties(sse2neon PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES ${sse2neon_INCLUDE_DIR}
+    )
+
+    mark_as_advanced(sse2neon_INCLUDE_DIR)
+endif()
\ No newline at end of file
diff --git a/share/cmake/modules/install/Installsse2neon.cmake b/share/cmake/modules/install/Installsse2neon.cmake
new file mode 100644
index 0000000000..16b47798cd
--- /dev/null
+++ b/share/cmake/modules/install/Installsse2neon.cmake
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Install sse2neon (header-only version)
+# https://github.com/DLTcollab/sse2neon
+#
+#
+# Global targets defined by this module:
+#   sse2neon
+###############################################################################
+
+# Download sse2neon using FetchContent and make it available at configure time.
+
+include(FetchContent)
+
+set(FETCHCONTENT_BASE_DIR "${CMAKE_BINARY_DIR}/ext/build/sse2neon")
+FetchContent_Declare(sse2neon
+  GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git
+  GIT_TAG        v1.6.0
+)
+
+# FetchContent_MakeAvailable is not available until CMake 3.14+.
+# Using FetchContent_GetProperties and FetchContent_Populate instead.
+FetchContent_GetProperties(sse2neon)
+
+if(NOT sse2neon_POPULATED)
+  FetchContent_Populate(sse2neon)
+
+  set(_EXT_DIST_INCLUDE "${CMAKE_BINARY_DIR}/ext/dist/${CMAKE_INSTALL_INCLUDEDIR}")
+  file(COPY "${sse2neon_SOURCE_DIR}/sse2neon.h" DESTINATION "${_EXT_DIST_INCLUDE}/sse2neon")
+
+  add_library(sse2neon INTERFACE IMPORTED GLOBAL)
+  set_target_properties(sse2neon PROPERTIES
+      INTERFACE_INCLUDE_DIRECTORIES "${_EXT_DIST_INCLUDE}/sse2neon"
+  )
+endif()
\ No newline at end of file
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index bcfd356d53..a68d5c3676 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -295,7 +295,7 @@ target_link_libraries(OpenColorIO
 if(OCIO_USE_SSE AND HAVE_NEON)
     target_link_libraries(OpenColorIO
         PRIVATE
-            "$<BUILD_INTERFACE:sse2neon>"
+            sse2neon
     )
 endif()
 

From bef4b6bd62ffb3f1af0cf6f1b7f23ed190c11e55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 10 Feb 2023 14:45:24 -0500
Subject: [PATCH 20/81] Fix comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index abe881c627..3951a3f450 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -192,7 +192,7 @@ include(CheckSupportARMNeon)
 
 
 ###############################################################################
-# Add sse2neon to the build since CompilerFlags needs to know if SSE2 is supported.
+# Add sse2neon to the build if ARM NEON intrinsics are supported.
 
 if(HAVE_NEON)
     # Install sse2nenon. Please note that sse2neon is downloaded during the configure step as it is

From 55e2976540bc412c122999862ac4772832dd72ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Tue, 14 Feb 2023 10:47:42 -0500
Subject: [PATCH 21/81] Re-wording comments Now check for OCIO_USE_SSE and
 HAVE_NEON before integrating sse2neon to the build.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt        |  2 +-
 src/OpenColorIO/SSE.h | 25 +++++++++++--------------
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3951a3f450..81ad855fc9 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -194,7 +194,7 @@ include(CheckSupportARMNeon)
 ###############################################################################
 # Add sse2neon to the build if ARM NEON intrinsics are supported.
 
-if(HAVE_NEON)
+if(HAVE_NEON AND OCIO_USE_SSE)
     # Install sse2nenon. Please note that sse2neon is downloaded during the configure step as it is
     # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
 
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 567c5a1c64..f63e08cdef 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -24,26 +24,23 @@ namespace OCIO_NAMESPACE
 
 // Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since
 // it is redefining two of the functions from sse2neon.
+
 #ifdef USE_SSE2NEON
-    // Overwrite the translation of _mm_max_ps and _mm_min_ps.
-    // Using vmaxnmq_f32 and vminnmq_f32 instead.
-
-    // Compare packed single-precision (32-bit) floating-point elements in a and b,
-    // and store packed maximum values in dst. dst does not follow the IEEE Standard
-    // for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
-    // signed-zero values.
-    // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
+    // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to 
+    // NaN handling.
+
+    // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were 
+    // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the 
+    // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the 
+    // filtering behavior. The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as 
+    // the Intel _mm_max_ps / _mm_min_ps since they always returns the non-NaN argument 
+    // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in 
+    // the first argument continues to be filtered out.
     static inline __m128 _mm_max_ps(__m128 a, __m128 b)
     {
         return vreinterpretq_m128_f32(
             vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
     }
-
-    // Compare packed single-precision (32-bit) floating-point elements in a and b,
-    // and store packed minimum values in dst. dst does not follow the IEEE Standard
-    // for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
-    // signed-zero values.
-    // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
     static inline __m128 _mm_min_ps(__m128 a, __m128 b)
     {
         return vreinterpretq_m128_f32(

From a807ccd309d656d6d9a4d485029ae5a270e03005 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Wed, 15 Feb 2023 13:44:25 -0500
Subject: [PATCH 22/81] Removing specifics compiler options for NEON since they
 are not needed on Apple and might interfere with optimization if not careful.
 Updated CheckSupportSSE2.cmake and adding some checks in SSE.h to support
 universal build. Added comments and new define (USING_INTEL_SSE,
 USING_ARM_NEON and USING_CPP) in LogOpCPU_tests which should help with
 understand what is going on. Checking for SSE2 and ARM Neon only when
 OCIO_USE_SSE is ON.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                              |   6 +-
 share/cmake/utils/CheckSupportARMNeon.cmake |  30 ++--
 share/cmake/utils/CheckSupportSSE2.cmake    |  73 +++++++---
 share/cmake/utils/CompilerFlags.cmake       |  10 +-
 src/OpenColorIO/CMakeLists.txt              |  16 ++-
 src/OpenColorIO/SSE.h                       |  61 ++++----
 tests/cpu/CMakeLists.txt                    |  16 ++-
 tests/cpu/ops/log/LogOpCPU_tests.cpp        | 151 +++++++++++++-------
 8 files changed, 238 insertions(+), 125 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 81ad855fc9..82ff26bcd1 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -186,9 +186,11 @@ include(CheckSupportGL)
 
 
 ###############################################################################
-# Check for ARM neon intrinsics (armv8)
+# Check for ARM neon (if the user asked for it)
 
-include(CheckSupportARMNeon)
+if(OCIO_USE_SSE)
+    include(CheckSupportARMNeon)
+endif()
 
 
 ###############################################################################
diff --git a/share/cmake/utils/CheckSupportARMNeon.cmake b/share/cmake/utils/CheckSupportARMNeon.cmake
index 1123e75f52..acab5e5946 100644
--- a/share/cmake/utils/CheckSupportARMNeon.cmake
+++ b/share/cmake/utils/CheckSupportARMNeon.cmake
@@ -1,21 +1,27 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 
+# Checks for ARM NEON availability
+
 include(CheckCXXSourceCompiles)
 
-set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
-set(CMAKE_REQUIRED_FLAGS "-march=armv8-a+fp+simd+crypto+crc")
+set(_cmake_osx_architectures_old "${CMAKE_OSX_ARCHITECTURES}")
+
+if(APPLE)
+    set(CMAKE_OSX_ARCHITECTURES "arm64")
+endif()
+
+set(source_code "
+#include <arm_neon.h>
+int main()
+{
+    float32x4_t v = vdupq_n_f32(0);
+    return 0;
+}")
 
-check_cxx_source_compiles ("
-    #include <arm_neon.h>
-    int main()
-    {
-        float32x4_t v = vdupq_n_f32(0);
-        return 0;
-    }"
-    HAVE_NEON)
+check_cxx_source_compiles ("${source_code}" HAVE_NEON)
 
-set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
-unset(_cmake_required_flags_old)
+set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_old}")
 
+unset(_cmake_osx_architectures_old)
 mark_as_advanced(HAVE_NEON)
diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index 5abd707cdb..311dbdbc87 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -6,6 +6,8 @@ include(CheckCXXSourceCompiles)
 set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
 set(_cmake_required_libraries_old "${CMAKE_REQUIRED_LIBRARIES}")
 
+set(_cmake_osx_architectures_old "${CMAKE_OSX_ARCHITECTURES}")
+
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger
     # unrelated warnings causing a detection failure. So, the code disables all warnings to focus
@@ -17,30 +19,63 @@ if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     endif()
 endif()
 
-set(_SSE2_HEADER "#include <emmintrin.h>")
-if (HAVE_NEON)
-    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -march=armv8-a+fp+simd+crypto+crc")
-    set(CMAKE_REQUIRED_LIBRARIES sse2neon)
-    set(_SSE2_HEADER "#include <sse2neon.h>")
-endif()
 
-set(_SSE2_TEST_SOURCE_CODE "
-${_SSE2_HEADER}
-int main ()
-{
-    __m128d a, b;
-    double vals[2] = {0};
-    a = _mm_loadu_pd (vals);
-    b = _mm_add_pd (a,a);
-    _mm_storeu_pd (vals,b);
-    return (0);
-}")
+macro(check_arm_neon_availability _check_sse2_header_ _check_output_var_name_)
+    set(_SSE2_TEST_SOURCE_CODE "
+    ${_check_sse2_header_}
+    int main ()
+    {
+        __m128d a, b;
+        double vals[2] = {0};
+        a = _mm_loadu_pd (vals);
+        b = _mm_add_pd (a,a);
+        _mm_storeu_pd (vals,b);
+        return (0);
+    }")
+
+    check_cxx_source_compiles ("${_SSE2_TEST_SOURCE_CODE}" ${_check_output_var_name_})
+    mark_as_advanced(${_check_output_var_name_})
+endmacro()
+
+if(NOT HAVE_NEON)
+    check_arm_neon_availability("#include <emmintrin.h>" HAVE_SSE2)
+elseif(APPLE)
+    # Test for both supported architectures
+    # x86_64 and arm64
+    set(ARCHITECTURES_LIST "arm64;x86_64")
+    
+    message(STATUS "Checking SSE2 support using SSE2NEON library for arm64 and x86_64 architectures")
+    foreach (current_arch IN LISTS ARCHITECTURES_LIST)
+
+        set (CMAKE_OSX_ARCHITECTURES "${current_arch}")
+        
+        if(current_arch STREQUAL arm64)
+            if (HAVE_NEON)
+                set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+                set(_sse2_header_ "#include <sse2neon.h>")
 
-check_cxx_source_compiles ("${_SSE2_TEST_SOURCE_CODE}" HAVE_SSE2)
+                set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON")
+            endif()
+        elseif(current_arch STREQUAL x86_64)
+            set(_sse2_header_ "#include <emmintrin.h>")
+            set(_output_var_name_ "HAVE_SSE2")
+        endif()
+
+        check_arm_neon_availability("${_sse2_header_}" ${_output_var_name_})
+
+    endforeach()
+elseif(NOT APPLE)
+    # Other arm platform
+    set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+    check_arm_neon_availability("#include <sse2neon.h>" HAVE_SSE2)
+endif()
 
 set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
 set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_old}")
+set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_old}")
+
 unset(_cmake_required_flags_old)
 unset(_cmake_required_libraries_old)
+unset(_cmake_osx_architectures_old)
+
 
-mark_as_advanced(HAVE_SSE2)
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index dc84eddf70..5420b495d4 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -10,14 +10,16 @@ set(PLATFORM_LINK_OPTIONS "")
 
 ###############################################################################
 # Define if SSE2 can be used.
-# Check for SSE2 first since some compile flags need to be set on Apple ARM.
 
-include(CheckSupportSSE2)
+# Check for SSE2 only if the user asked for it.
+if(OCIO_USE_SSE)
+    include(CheckSupportSSE2)
+endif()
 
-if(NOT HAVE_SSE2)
+if(NOT HAVE_SSE2 AND NOT HAVE_SSE2_WITH_SSE2NEON)
     message(STATUS "Disabling SSE optimizations, as the target doesn't support them")
     set(OCIO_USE_SSE OFF)
-endif(NOT HAVE_SSE2)
+endif()
 
 ###############################################################################
 # Compile flags
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index a68d5c3676..5fe522b0d5 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -292,7 +292,7 @@ target_link_libraries(OpenColorIO
         MINIZIP::minizip-ng
 )
 
-if(OCIO_USE_SSE AND HAVE_NEON)
+if(OCIO_USE_SSE AND HAVE_SSE2_WITH_SSE2NEON)
     target_link_libraries(OpenColorIO
         PRIVATE
             sse2neon
@@ -330,15 +330,17 @@ if(BUILD_SHARED_LIBS OR (OCIO_BUILD_PYTHON AND UNIX))
 endif()
 
 if(OCIO_USE_SSE)
-    target_compile_definitions(OpenColorIO
-        PRIVATE
-            USE_SSE
-    )
+    if(HAVE_SSE2)
+        target_compile_definitions(OpenColorIO
+            PRIVATE
+                USE_SSE
+        )
+    endif()
 
-    if(HAVE_NEON)
+    if(HAVE_SSE2_WITH_SSE2NEON)
         target_compile_definitions(OpenColorIO
             PRIVATE
-                USE_SSE2NEON
+                USE_SSE2_WITH_SSE2NEON
         )
     endif()
 endif()
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index f63e08cdef..3c40a858e1 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -6,14 +6,23 @@
 #define INCLUDED_OCIO_SSE_H
 
 
+#if !defined(USE_SSE)
+    #define USING_CPP 1
+#endif
+
 #ifdef USE_SSE
 
-#ifndef USE_SSE2NEON
+// If it is not arm64, same behavior as before.
+#if !defined(__aarch64__)
     #include <emmintrin.h>
-#else
-    #include <sse2neon.h>
+    #define USING_INTEL_SSE2 1
+#elif defined(__aarch64__)
+    // ARM architecture A64 (ARM64)
+    #if defined(USE_SSE2_WITH_SSE2NEON)
+        #include <sse2neon.h>
+        #define USING_INTEL_SSE2_WITH_SSE2NEON 1
+    #endif
 #endif
-#include <stdio.h>
 
 #include <OpenColorIO/OpenColorIO.h>
 
@@ -25,27 +34,29 @@ namespace OCIO_NAMESPACE
 // Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since
 // it is redefining two of the functions from sse2neon.
 
-#ifdef USE_SSE2NEON
-    // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to 
-    // NaN handling.
-
-    // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were 
-    // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the 
-    // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the 
-    // filtering behavior. The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as 
-    // the Intel _mm_max_ps / _mm_min_ps since they always returns the non-NaN argument 
-    // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in 
-    // the first argument continues to be filtered out.
-    static inline __m128 _mm_max_ps(__m128 a, __m128 b)
-    {
-        return vreinterpretq_m128_f32(
-            vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-    }
-    static inline __m128 _mm_min_ps(__m128 a, __m128 b)
-    {
-        return vreinterpretq_m128_f32(
-            vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-    }
+#if defined(__aarch64__)
+    #if defined(USE_SSE2_WITH_SSE2NEON)
+        // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to 
+        // NaN handling.
+
+        // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were 
+        // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the 
+        // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the 
+        // filtering behavior. The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as 
+        // the Intel _mm_max_ps / _mm_min_ps since they always returns the non-NaN argument 
+        // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in 
+        // the first argument continues to be filtered out.
+        static inline __m128 _mm_max_ps(__m128 a, __m128 b)
+        {
+            return vreinterpretq_m128_f32(
+                vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+        }
+        static inline __m128 _mm_min_ps(__m128 a, __m128 b)
+        {
+            return vreinterpretq_m128_f32(
+                vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+        }
+    #endif
 #endif
 
 // Macros for alignment declarations
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index 3d6cbc7506..53f50d1031 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -26,7 +26,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
             xxHash
     )
 
-    if(OCIO_USE_SSE AND HAVE_NEON)
+    if(OCIO_USE_SSE AND HAVE_SSE2_WITH_SSE2NEON)
         target_link_libraries(${TEST_BINARY}
             PRIVATE
                 sse2neon
@@ -52,15 +52,17 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
     endif(PRIVATE_INCLUDES)
 
     if(OCIO_USE_SSE)
-        target_compile_definitions(${TEST_BINARY}
-            PRIVATE
-                USE_SSE
-        )
+        if (HAVE_SSE2)
+            target_compile_definitions(${TEST_BINARY}
+                PRIVATE
+                    USE_SSE
+            )
+        endif()
 
-        if(HAVE_NEON)
+        if(HAVE_SSE2_WITH_SSE2NEON)
             target_compile_definitions(${TEST_BINARY}
                 PRIVATE
-                    USE_SSE2NEON
+                    USE_SSE2_WITH_SSE2NEON
             )
         endif()
     endif(OCIO_USE_SSE)
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 7ac48ed1f6..c73541afb6 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -9,7 +9,6 @@
 
 namespace OCIO = OCIO_NAMESPACE;
 
-
 constexpr float qnan = std::numeric_limits<float>::quiet_NaN();
 constexpr float inf = std::numeric_limits<float>::infinity();
 
@@ -23,6 +22,7 @@ void TestLog(float logBase)
                                   0.f,        0.f,  0.f,     inf,
                                  -inf,       -inf, -inf,     0.f,
                                   0.f,        0.f,  0.f,    -inf };
+                                  
     float rgba[32] = {};
 
     OCIO::ConstLogOpDataRcPtr logOp = std::make_shared<OCIO::LogOpData>(
@@ -35,7 +35,7 @@ void TestLog(float logBase)
 
     // LogOpCPU implementation uses optimized logarithm approximation
     // cannot use strict comparison.
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error = 5e-5f;
 #else
     const float error = 1e-5f;
@@ -52,17 +52,26 @@ void TestLog(float logBase)
             expected = logf(std::max(minValue, (float)expected)) / logf(logBase);
         }
 
+        // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f,     0.f,
+        //                                                0.2f,       0.f,   .99f, 128.f,
+        //                                                ... }
         OCIO_CHECK_CLOSE(result, expected, error);
     }
 
     const float resMin = logf(minValue) / logf(logBase);
+
+    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
     OCIO_CHECK_CLOSE(rgba[8], resMin, error);
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
+
+    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
     OCIO_CHECK_CLOSE(rgba[12], resMin, error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
+
     // SSE implementation of sseLog2 & sseExp2 do not behave like CPU.
     // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below.
-#ifdef USE_SSE
+    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     if (logBase == 10.0f)
     {
         OCIO_CHECK_CLOSE(rgba[16], 38.53184509f, error);
@@ -75,10 +84,16 @@ void TestLog(float logBase)
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
+
+    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
     OCIO_CHECK_CLOSE(rgba[20], resMin, error);
     OCIO_CHECK_EQUAL(rgba[23], inf);
+
+    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
     OCIO_CHECK_CLOSE(rgba[24], resMin, error);
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
+
+    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
     OCIO_CHECK_CLOSE(rgba[28], resMin, error);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -127,34 +142,45 @@ void TestAntiLog(float logBase)
 
         // LogOpCPU implementation uses optimized logarithm approximation
         // cannot use strict comparison.
+        // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f,     0.f,
+        //                                                0.2f,       0.f,   .99f, 128.f,
+        //                                                ... }
         OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f));
     }
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_EQUAL(rgba[8], inf);
-    #endif
-#else
+
+    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_EQUAL(rgba[8], inf);
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
 #endif
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
+
+    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
     OCIO_CHECK_CLOSE(rgba[12], 1.0f, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
-    #endif
-#else
+
+    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
+
+    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
     OCIO_CHECK_CLOSE(rgba[20], 1.0f, rtol);
     OCIO_CHECK_EQUAL(rgba[23], inf);
-#ifdef USE_SSE
+
+    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[24], inf);
 #else
     OCIO_CHECK_EQUAL(rgba[24], 0.0f);
 #endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
+
+    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
     OCIO_CHECK_CLOSE(rgba[28], 1.0f, rtol);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -267,43 +293,48 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 
         // LogOpCPU implementation uses optimized logarithm approximation
         // cannot use strict comparison.
+        // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f,     0.f,
+        //                                                0.2f,       0.f,   .99f, 128.f,
+        //                                                ... }
         OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f));
     }
 
     const float res0 = ComputeLog2LinEval(0.0f, redP);
 
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_EQUAL(rgba[8], inf);
-    #endif
-#else
+    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_EQUAL(rgba[8], inf);
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
 #endif
 
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
+    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
     OCIO_CHECK_CLOSE(rgba[12], res0, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
-    #endif
-#else
+    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
+    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
     OCIO_CHECK_CLOSE(rgba[20], res0, rtol);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-#ifdef USE_SSE
+    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[24], inf);
 #else
     OCIO_CHECK_CLOSE(rgba[24], ComputeLog2LinEval(-inf, redP), rtol);
 #endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
+    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
     OCIO_CHECK_CLOSE(rgba[28], res0, rtol);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -407,31 +438,40 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
 
         // LogOpCPU implementation uses optimized logarithm approximation
         // cannot use strict comparison
+        // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f,     0.f,
+        //                                                0.2f,       0.f,   .99f, 128.f,
+        //                                                ... }
         OCIO_CHECK_CLOSE(result, expected, error);
     }
 
     const float res0 = ComputeLin2LogEval(0.0f, redP);
     const float resMin = ComputeLin2LogEval(-100.0f, redP);
 
+    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
     OCIO_CHECK_CLOSE(rgba[8], resMin, error);
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
+    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
     OCIO_CHECK_CLOSE(rgba[12], res0, error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-#ifdef USE_SSE
+    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error);
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
+    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
     OCIO_CHECK_CLOSE(rgba[20], res0, error);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
+    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
     OCIO_CHECK_CLOSE(rgba[24], resMin, error);
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
+    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
     OCIO_CHECK_CLOSE(rgba[28], res0, error);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -462,25 +502,28 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, numPixels);
 
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
 #endif // USE_SSE
 
+    // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }.
     OCIO_CHECK_CLOSE(rgba[0], -0.168771237955f, error);
     OCIO_CHECK_CLOSE(rgba[1], -0.048771237955f, error);
     OCIO_CHECK_CLOSE(rgba[2], -0.036771237955f, error);
+
+    // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error);
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error);
 #endif // USE_SSE
-
     OCIO_CHECK_CLOSE(rgba[6], 0.68141615509f, error);
 
-#ifdef USE_SSE
+   // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[8], -inf);
     OCIO_CHECK_CLOSE(rgba[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));
@@ -500,19 +543,23 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRendererNoLS = OCIO::GetLogRenderer(lognols, true);
     pRendererNoLS->apply(rgbaImage, rgba_nols, numPixels);
 
+    // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[0], -0.325512374199f, error);
     OCIO_CHECK_CLOSE(rgba_nols[1], -0.127141806077f, error);
     OCIO_CHECK_CLOSE(rgba_nols[2], -0.107304749265f, error);
+    
+    // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error);
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error);
 #endif // USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[6], 0.68141615509f, error);
-    OCIO_CHECK_EQUAL(rgba_nols[8], -inf);
 
-#ifdef USE_SSE
+    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
+    OCIO_CHECK_EQUAL(rgba_nols[8], -inf);
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba_nols[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba_nols[10]));
 #else
@@ -530,19 +577,25 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRendererNoBreak = OCIO::GetLogRenderer(lognobreak, true);
     pRendererNoBreak->apply(rgbaImage, rgba_nobreak, numPixels);
 
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error2 = 1e-5f;
 #else
     const float error2 = 1e-7f;
 #endif // USE_SSE
+
+    // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[0], -24.6f, error2);
     OCIO_CHECK_CLOSE(rgba_nobreak[1], -0.264385618977f, error2);
     OCIO_CHECK_CLOSE(rgba_nobreak[2], -0.20700938942f, error2);
+
+    // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[4], 0.028548034423f, error2);
     OCIO_CHECK_CLOSE(rgba_nobreak[5], 0.170878935551f, error2);
     OCIO_CHECK_CLOSE(rgba_nobreak[6], 0.68141615509, error2);
+
+    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2);
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2);
 #else
     OCIO_CHECK_EQUAL(rgba_nobreak[9], inf);
@@ -553,14 +606,8 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
 OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 {
     // Inverse of previous test.
-    const float rgbaImage[12] = { -0.168771237955f,
-                                  -0.048771237955f,
-                                  -0.036771237955f,
-                                   0.f,
-                                   0.047228762045f,
-                                   0.170878935551f,
-                                   0.68141615509f,
-                                   0.f,
+    const float rgbaImage[12] = { -0.168771237955f, -0.048771237955f, -0.036771237955f, 0.f,
+                                   0.047228762045f, 0.170878935551f, 0.68141615509f, 0.f,
                                   -inf,   inf,   qnan,  0.0f };
 
     float rgba[12] = {};
@@ -574,23 +621,29 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, 3);
 
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
 #endif // USE_SSE
+
+    // Evaluating output for input rgbaImage[0-2] = 
+    // { -0.168771237955f, -0.048771237955f, -0.036771237955f, ... }.
     OCIO_CHECK_CLOSE(rgba[0], -0.1f, error);
     OCIO_CHECK_CLOSE(rgba[1],  0.0f, error);
     OCIO_CHECK_CLOSE(rgba[2],  0.01f, error);
+
+    // Evaluating output for input rgbaImage[4-6] = 
+    // { 0.047228762045f, 0.170878935551f, 0.68141615509f, ... }.
     OCIO_CHECK_CLOSE(rgba[4],  0.08f, error);
     OCIO_CHECK_CLOSE(rgba[5],  0.16f, error);
     OCIO_CHECK_CLOSE(rgba[6],  1.16f, 10.0f * error);
+
+    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_EQUAL(rgba[8], -inf);
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
-    #endif
-#else
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[9], inf);
 #endif
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));

From c93e3b4d5677701df732d0cb7c9cae0c48b0df7f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 16 Feb 2023 09:18:49 -0500
Subject: [PATCH 23/81] Tweaking comments, some cmake variables naming and
 other minors changes.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                              |  4 +-
 share/cmake/utils/CheckSupportARMNeon.cmake |  6 +--
 share/cmake/utils/CheckSupportSSE2.cmake    | 40 +++++++---------
 share/cmake/utils/CompilerFlags.cmake       |  1 -
 src/OpenColorIO/SSE.h                       |  6 +--
 tests/cpu/ops/log/LogOpCPU_tests.cpp        | 52 ++++++++++-----------
 6 files changed, 49 insertions(+), 60 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 82ff26bcd1..2bf2da40d1 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -186,7 +186,7 @@ include(CheckSupportGL)
 
 
 ###############################################################################
-# Check for ARM neon (if the user asked for it)
+# Check for ARM neon
 
 if(OCIO_USE_SSE)
     include(CheckSupportARMNeon)
@@ -197,7 +197,7 @@ endif()
 # Add sse2neon to the build if ARM NEON intrinsics are supported.
 
 if(HAVE_NEON AND OCIO_USE_SSE)
-    # Install sse2nenon. Please note that sse2neon is downloaded during the configure step as it is
+    # Install sse2neon. Please note that sse2neon is downloaded during the configure step as it is
     # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
 
     # Could use ocio_handle_dependency once this is rebased from main (once the CMake min and 
diff --git a/share/cmake/utils/CheckSupportARMNeon.cmake b/share/cmake/utils/CheckSupportARMNeon.cmake
index acab5e5946..5d17854757 100644
--- a/share/cmake/utils/CheckSupportARMNeon.cmake
+++ b/share/cmake/utils/CheckSupportARMNeon.cmake
@@ -5,7 +5,7 @@
 
 include(CheckCXXSourceCompiles)
 
-set(_cmake_osx_architectures_old "${CMAKE_OSX_ARCHITECTURES}")
+set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
 
 if(APPLE)
     set(CMAKE_OSX_ARCHITECTURES "arm64")
@@ -21,7 +21,7 @@ int main()
 
 check_cxx_source_compiles ("${source_code}" HAVE_NEON)
 
-set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_old}")
+set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}")
 
-unset(_cmake_osx_architectures_old)
+unset(_cmake_osx_architectures_orig)
 mark_as_advanced(HAVE_NEON)
diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index 311dbdbc87..26bc0b396d 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -3,10 +3,9 @@
 
 include(CheckCXXSourceCompiles)
 
-set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
-set(_cmake_required_libraries_old "${CMAKE_REQUIRED_LIBRARIES}")
-
-set(_cmake_osx_architectures_old "${CMAKE_OSX_ARCHITECTURES}")
+set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}")
+set(_cmake_required_libraries_orig "${CMAKE_REQUIRED_LIBRARIES}")
+set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
 
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger
@@ -20,7 +19,7 @@ if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
 endif()
 
 
-macro(check_arm_neon_availability _check_sse2_header_ _check_output_var_name_)
+macro(check_sse2_availability _check_sse2_header_ _check_output_var_name_)
     set(_SSE2_TEST_SOURCE_CODE "
     ${_check_sse2_header_}
     int main ()
@@ -38,8 +37,8 @@ macro(check_arm_neon_availability _check_sse2_header_ _check_output_var_name_)
 endmacro()
 
 if(NOT HAVE_NEON)
-    check_arm_neon_availability("#include <emmintrin.h>" HAVE_SSE2)
-elseif(APPLE)
+    check_sse2_availability("#include <emmintrin.h>" HAVE_SSE2)
+elseif(APPLE AND HAVE_NEON)
     # Test for both supported architectures
     # x86_64 and arm64
     set(ARCHITECTURES_LIST "arm64;x86_64")
@@ -50,32 +49,25 @@ elseif(APPLE)
         set (CMAKE_OSX_ARCHITECTURES "${current_arch}")
         
         if(current_arch STREQUAL arm64)
-            if (HAVE_NEON)
-                set(CMAKE_REQUIRED_LIBRARIES sse2neon)
-                set(_sse2_header_ "#include <sse2neon.h>")
-
-                set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON")
-            endif()
+            set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+            set(_sse2_header_ "#include <sse2neon.h>")
+            set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON")
         elseif(current_arch STREQUAL x86_64)
             set(_sse2_header_ "#include <emmintrin.h>")
             set(_output_var_name_ "HAVE_SSE2")
         endif()
 
-        check_arm_neon_availability("${_sse2_header_}" ${_output_var_name_})
+        check_sse2_availability("${_sse2_header_}" ${_output_var_name_})
 
     endforeach()
-elseif(NOT APPLE)
-    # Other arm platform
-    set(CMAKE_REQUIRED_LIBRARIES sse2neon)
-    check_arm_neon_availability("#include <sse2neon.h>" HAVE_SSE2)
 endif()
 
-set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
-set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_old}")
-set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_old}")
+set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}")
+set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_orig}")
+set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}")
 
-unset(_cmake_required_flags_old)
-unset(_cmake_required_libraries_old)
-unset(_cmake_osx_architectures_old)
+unset(_cmake_required_flags_orig)
+unset(_cmake_required_libraries_orig)
+unset(_cmake_osx_architectures_orig)
 
 
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index 5420b495d4..a9a95d3753 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -11,7 +11,6 @@ set(PLATFORM_LINK_OPTIONS "")
 ###############################################################################
 # Define if SSE2 can be used.
 
-# Check for SSE2 only if the user asked for it.
 if(OCIO_USE_SSE)
     include(CheckSupportSSE2)
 endif()
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 3c40a858e1..12b43e8972 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -6,11 +6,9 @@
 #define INCLUDED_OCIO_SSE_H
 
 
-#if !defined(USE_SSE)
+#ifndef USE_SSE
     #define USING_CPP 1
-#endif
-
-#ifdef USE_SSE
+#else
 
 // If it is not arm64, same behavior as before.
 #if !defined(__aarch64__)
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index c73541afb6..083cd6a2b9 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -60,17 +60,17 @@ void TestLog(float logBase)
 
     const float resMin = logf(minValue) / logf(logBase);
 
-    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+    // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
     OCIO_CHECK_CLOSE(rgba[8], resMin, error);
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
-    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
+    // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
     OCIO_CHECK_CLOSE(rgba[12], resMin, error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // SSE implementation of sseLog2 & sseExp2 do not behave like CPU.
     // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below.
-    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+    // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
 #if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     if (logBase == 10.0f)
     {
@@ -85,15 +85,15 @@ void TestLog(float logBase)
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
-    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
+    // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
     OCIO_CHECK_CLOSE(rgba[20], resMin, error);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+    // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
     OCIO_CHECK_CLOSE(rgba[24], resMin, error);
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
-    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
+    // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
     OCIO_CHECK_CLOSE(rgba[28], resMin, error);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -148,7 +148,7 @@ void TestAntiLog(float logBase)
         OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f));
     }
 
-    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+    // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
 #ifdef USING_INTEL_SSE2
     OCIO_CHECK_EQUAL(rgba[8], inf);
 #elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
@@ -156,11 +156,11 @@ void TestAntiLog(float logBase)
 #endif
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
-    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
+    // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
     OCIO_CHECK_CLOSE(rgba[12], 1.0f, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+    // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
 #ifdef USING_INTEL_SSE2
     OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
 #elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
@@ -168,11 +168,11 @@ void TestAntiLog(float logBase)
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
-    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
+    // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
     OCIO_CHECK_CLOSE(rgba[20], 1.0f, rtol);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+    // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
 #if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[24], inf);
 #else
@@ -180,7 +180,7 @@ void TestAntiLog(float logBase)
 #endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
-    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
+    // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
     OCIO_CHECK_CLOSE(rgba[28], 1.0f, rtol);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -301,7 +301,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 
     const float res0 = ComputeLog2LinEval(0.0f, redP);
 
-    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+    // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
 #ifdef USING_INTEL_SSE2
     OCIO_CHECK_EQUAL(rgba[8], inf);
 #elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
@@ -310,11 +310,11 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
-    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
+    // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
     OCIO_CHECK_CLOSE(rgba[12], res0, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+    // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
 #ifdef USING_INTEL_SSE2
     OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
 #elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
@@ -322,11 +322,11 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
-    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
+    // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
     OCIO_CHECK_CLOSE(rgba[20], res0, rtol);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+    // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
 #if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[24], inf);
 #else
@@ -334,7 +334,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 #endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
-    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
+    // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
     OCIO_CHECK_CLOSE(rgba[28], res0, rtol);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -447,15 +447,15 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
     const float res0 = ComputeLin2LogEval(0.0f, redP);
     const float resMin = ComputeLin2LogEval(-100.0f, redP);
 
-    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+    // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
     OCIO_CHECK_CLOSE(rgba[8], resMin, error);
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
-    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
+    // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
     OCIO_CHECK_CLOSE(rgba[12], res0, error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+    // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
 #if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error);
 #else
@@ -463,20 +463,20 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
-    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
+    // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
     OCIO_CHECK_CLOSE(rgba[20], res0, error);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+    // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
     OCIO_CHECK_CLOSE(rgba[24], resMin, error);
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
-    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
+    // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
     OCIO_CHECK_CLOSE(rgba[28], res0, error);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
 
-OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
+OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 {
     constexpr int numPixels = 3;
     constexpr int numValues = 4 * numPixels;
@@ -603,7 +603,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO_CHECK_CLOSE(rgba_nobreak[10], -24.6f, error2);
 }
 
-OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
+OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
 {
     // Inverse of previous test.
     const float rgbaImage[12] = { -0.168771237955f, -0.048771237955f, -0.036771237955f, 0.f,

From 3030e8bc876b9d564d0b8858c56fe94a46e3dc66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 16 Feb 2023 11:34:52 -0500
Subject: [PATCH 24/81] Fixing issue with anti-log (SSE implementation)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 src/OpenColorIO/SSE.h                | 80 ++++++++++++++++++----------
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 29 ----------
 2 files changed, 52 insertions(+), 57 deletions(-)

diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 12b43e8972..57ea87a08f 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -109,10 +109,10 @@ inline __m128 isNegativeSpecial(const __m128 x)
   return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(x), SIGN_SHIFT));
 }
 
-// Select function in SSE version 2
+// Bit-wise select function in SSE version 2
 //
-// Return the parameter arg_false when the parameter mask is 0x0,
-// or the parameter arg_true when the mask is 0xffffffff.
+// Return the parameter arg_false bit where the parameter mask is 0x0,
+// return the parameter arg_true bit where the mask is 0x1.
 //
 // Algorithm Explanation:
 //
@@ -142,7 +142,11 @@ inline __m128 isNegativeSpecial(const __m128 x)
 //
 inline __m128 sseSelect(const __m128& mask, const __m128& arg_true, const __m128& arg_false)
 {
-    return _mm_xor_ps( arg_false, _mm_and_ps( mask, _mm_xor_ps( arg_true, arg_false ) ) );
+    return _mm_xor_ps(                                  // bit-wise XOR of arg_false, (...)
+        arg_false, 
+        _mm_and_ps(                                     // bit-wise AND of mask, (...)
+            mask, 
+            _mm_xor_ps( arg_true, arg_false ) ) );      // bit-wise XOR of arg_true, arg_false
 }
 
 // Coefficients of Chebyshev (minimax) degree 5 polynomial
@@ -162,6 +166,10 @@ static const __m128 PNEXP2 = _mm_set1_ps((float)2.414427569091865207710e-1);
 static const __m128 PNEXP1 = _mm_set1_ps((float)6.930038344665415134202e-1);
 static const __m128 PNEXP0 = _mm_set1_ps((float)1.000002593370603213644);
 
+// Note: The above polynomials have been chosen to acheive a precision of
+// approximately 15 bits of mantissa.
+
+
 // log2 function in SSE version 2
 //
 // The function log2() is evaluated by performing argument
@@ -169,12 +177,14 @@ static const __m128 PNEXP0 = _mm_set1_ps((float)1.000002593370603213644);
 // over a restricted range.
 inline __m128 sseLog2(__m128 x)
 {
-    // y = log2( x ) = log2( 2^exposant * mantissa ) 
-    //               = exposant + log2( mantissa )
+    // y = log2( x ) = log2( 2^exponant * mantissa ) 
+    //               = exponant + log2( mantissa )
 
     __m128 mantissa
-        = _mm_or_ps(
-            _mm_andnot_ps(_mm_castsi128_ps(EMASK), x), EONE);
+        = _mm_or_ps(                                    // OR with EONE
+            _mm_andnot_ps(                              // NOT(EMASK) AND x
+                _mm_castsi128_ps(EMASK), x),            // reinterpret cast int to float
+            EONE);
 
     __m128 log2
         = _mm_add_ps(
@@ -198,14 +208,15 @@ inline __m128 sseLog2(__m128 x)
             PNLOG0);
 
     __m128i exponent
-        = _mm_sub_epi32(
-            _mm_srli_epi32(
-                _mm_and_si128(_mm_castps_si128(x),
+        = _mm_sub_epi32(                                // subtract EBIAS
+            _mm_srli_epi32(                             // right-shift by EXP_SHIFT
+                _mm_and_si128(_mm_castps_si128(x),      // bit-wise AND with EMASK
                     EMASK),
                 EXP_SHIFT),
             EBIAS);
 
-    log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(exponent));
+    log2 = _mm_add_ps(log2, 
+                      _mm_cvtepi32_ps(exponent));       // convert exponent to float
 
     return log2;
 }
@@ -224,24 +235,30 @@ inline __m128 sseExp2(__m128 x)
     // Compute the largest integer not greater than x, i.e., floor(x)
     // Note: cvttps_epi32 simply cast the float value to int. That means cvttps_epi32(-2.7) = -2
     // rather than -3, hence for negative numbers we need to add -1. This ensures that "fraction"
-    // is always in the range [0, 1).
+    // is always in the range [0, 1).  Note that _mm_castps_si128(0xFFFFFFFF) is -1.
+    // If x is outside the INT_MIN to INT_MAX range, _mm_cvttps_epi32 will return 0x80000000
+    // (i.e. INT_MIN, just the sign bit set), which Intel calls the "integer indefinite" value. 
+    // When 1 is subtracted from INT_MIN, it gives INT_MAX.  So floor_x is wrong for values
+    // outside [INT_MIN, INT_MAX] but it's ignored thanks to the checks at the bottom.
+    // It's also wrong for x=NaN, but again it's ok since the polynomial returns NaN and
+    // hence the output is NaN, regardless of floor_x.
     __m128i floor_x
-        = _mm_add_epi32(
-            _mm_cvttps_epi32(x),
-            _mm_castps_si128(
-                _mm_cmpnle_ps(EZERO, x)));
+        = _mm_add_epi32(                                // add a pair of integer arguments
+            _mm_cvttps_epi32(x),                        // convert float to int via truncation
+            _mm_castps_si128(                           // reinterpret cast float to int
+                _mm_cmpnle_ps(EZERO, x)));              // NOT( EZERO <= x ) ? 0xFFFFFFFF : 0
 
     // Compute exp2(floor_x) by moving floor_x to the exponent bits of the floating-point number.
     __m128 zf
-        = _mm_castsi128_ps(
-            _mm_slli_epi32(
-                _mm_add_epi32(floor_x, EBIAS),
+        = _mm_castsi128_ps(                             // reinterpret cast int to float
+            _mm_slli_epi32(                             // left shift by EXP_SHIFT
+                _mm_add_epi32(floor_x, EBIAS),          // add a pair of integer arguments
                 EXP_SHIFT));
 
-    __m128 iexp = _mm_cvtepi32_ps(floor_x);
-    __m128 fraction = _mm_sub_ps(x, iexp);
+    __m128 iexp = _mm_cvtepi32_ps(floor_x);             // convert floor_x to float
+    __m128 fraction = _mm_sub_ps(x, iexp);              // x - iexp
 
-    // Compute exp2(fraction) using a polynomial approximation
+    // Compute exp2(fraction) using a polynomial approximation.
     __m128 mexp
         = _mm_add_ps(
             _mm_mul_ps(
@@ -259,19 +276,26 @@ inline __m128 sseExp2(__m128 x)
                 fraction),
             PNEXP0);
 
-    __m128 exp2 = _mm_mul_ps(zf, mexp);
+    __m128 exp2 = _mm_mul_ps(zf, mexp);                 // zf * mexp
 
     // Handle underflow:
     // If the (unbiased) exponent of zf is less than -126, the result is smaller than
     // the smallest representable floating-point number and an underflow computation is
     // potentially happening. When this happens, force the result to zero.
-    exp2 = _mm_andnot_ps(_mm_cmplt_ps(iexp, ENEG126), exp2);
+    // Note that as described above, floor_x is inaccurate, so the test here uses x.
+    exp2 = _mm_andnot_ps(                               // NOT(...) AND exp2
+        _mm_cmplt_ps(x, ENEG126),                       // iexp < ENEG126 ? 0xFFFFFFFF : 0
+        exp2);
 
     // Handle overflow:
     // If the (unbiased) exponent of zf is greater than 127, the result is larger than
     // the largest representable floating-point number and an overflow computation is
     // potentially happening. When this happens, force the result to positive infinity.
-    exp2 = sseSelect(_mm_cmpgt_ps(iexp, EPOS127), EPOSINF, exp2);
+    // Note that as described above, floor_x is inaccurate, so the test here uses x.
+    exp2 = sseSelect(                                   // (...) is a mask to select EPOSINF, exp2
+        _mm_cmpgt_ps(x, EPOS127),                       // iexp > EPOS127 ? 0xFFFFFFFF : 0
+        EPOSINF, 
+        exp2);
 
     return exp2;
 }
@@ -630,7 +654,7 @@ inline void sseSinCos(const float x, float& sin_x, float& cos_x)
 } // namespace OCIO_NAMESPACE
 
 
-#endif
+#endif  // USE_SSE
 
 
-#endif
+#endif  // INCLUDED_OCIO_SSE_H
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 083cd6a2b9..35ec055856 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -149,11 +149,7 @@ void TestAntiLog(float logBase)
     }
 
     // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_EQUAL(rgba[8], inf);
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
-#endif
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
     // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
@@ -161,11 +157,7 @@ void TestAntiLog(float logBase)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[16], inf);
-#endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
     // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
@@ -173,11 +165,7 @@ void TestAntiLog(float logBase)
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
     // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
-    OCIO_CHECK_EQUAL(rgba[24], inf);
-#else
     OCIO_CHECK_EQUAL(rgba[24], 0.0f);
-#endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
     // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
@@ -302,12 +290,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     const float res0 = ComputeLog2LinEval(0.0f, redP);
 
     // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_EQUAL(rgba[8], inf);
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
-#endif
-
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
     // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
@@ -315,11 +298,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[16], inf);
-#endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
     // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
@@ -327,11 +306,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
     // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
-    OCIO_CHECK_EQUAL(rgba[24], inf);
-#else
     OCIO_CHECK_CLOSE(rgba[24], ComputeLog2LinEval(-inf, redP), rtol);
-#endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
     // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
@@ -641,11 +616,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_EQUAL(rgba[8], -inf);
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[9], inf);
-#endif
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));
 }
 

From f4993c6dda0cf29baa1e28709c99577a0cc12cd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 16 Feb 2023 11:57:32 -0500
Subject: [PATCH 25/81] Adding a new option OCIO_USE_SIMD which does the same
 thing as OCIO_USE_SSE (they are sync up). Will replace OCIO_USE_SSE
 eventually.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 .github/workflows/ci_workflow.yml                |  6 +++---
 CMakeLists.txt                                   | 10 ++++++----
 docs/quick_start/installation.rst                |  2 ++
 share/cmake/modules/install/InstallOpenEXR.cmake |  2 ++
 share/cmake/utils/CompilerFlags.cmake            |  4 ++--
 src/OpenColorIO/CMakeLists.txt                   |  4 ++--
 tests/cpu/CMakeLists.txt                         |  6 +++---
 tests/gpu/CMakeLists.txt                         |  4 ++--
 tests/osl/CMakeLists.txt                         |  4 ++--
 9 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml
index bf98506e1a..0fcf7e2c37 100644
--- a/.github/workflows/ci_workflow.yml
+++ b/.github/workflows/ci_workflow.yml
@@ -204,7 +204,7 @@ jobs:
                 -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \
                 -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
                 -DOCIO_BUILD_GPU_TESTS=OFF \
-                -DOCIO_USE_SSE=${{ matrix.use-sse }} \
+                -DOCIO_USE_SIMD=${{ matrix.use-sse }} \
                 -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \
                 -DOCIO_INSTALL_EXT_PACKAGES=ALL \
                 -DOCIO_WARNING_AS_ERROR=ON \
@@ -345,7 +345,7 @@ jobs:
                 -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \
                 -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
                 -DOCIO_BUILD_GPU_TESTS=OFF \
-                -DOCIO_USE_SSE=${{ matrix.use-sse }} \
+                -DOCIO_USE_SIMD=${{ matrix.use-sse }} \
                 -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \
                 -DOCIO_INSTALL_EXT_PACKAGES=ALL \
                 -DOCIO_WARNING_AS_ERROR=ON \
@@ -493,7 +493,7 @@ jobs:
                 -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \
                 -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
                 -DOCIO_BUILD_GPU_TESTS=OFF \
-                -DOCIO_USE_SSE=${{ matrix.use-sse }} \
+                -DOCIO_USE_SIMD=${{ matrix.use-sse }} \
                 -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \
                 -DOCIO_INSTALL_EXT_PACKAGES=ALL \
                 -DOCIO_WARNING_AS_ERROR=ON \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2bf2da40d1..7ea95a2463 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -173,11 +173,13 @@ endif()
 
 ###############################################################################
 # Optimization / internal linking preferences
-
+# SIMD refers to either SIMD or Advanced SIMD terminology.
 option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON)
+mark_as_advanced(OCIO_USE_SSE)
+option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations (Will replace OCIO_USE_SSE)" ${OCIO_USE_SSE})
+option(OCIO_USE_OIIO_CMAKE_CONFIG "Specify whether to look for OIIO using the generated CMake Config script instead of the custom FindOpenImageIO.cmake script" OFF)
 option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." OFF)
 
-
 ###############################################################################
 # GPU configuration
 message(STATUS "")
@@ -188,7 +190,7 @@ include(CheckSupportGL)
 ###############################################################################
 # Check for ARM neon
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
     include(CheckSupportARMNeon)
 endif()
 
@@ -196,7 +198,7 @@ endif()
 ###############################################################################
 # Add sse2neon to the build if ARM NEON intrinsics are supported.
 
-if(HAVE_NEON AND OCIO_USE_SSE)
+if(HAVE_NEON AND OCIO_USE_SIMD)
     # Install sse2neon. Please note that sse2neon is downloaded during the configure step as it is
     # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
 
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index 788ae7e16b..37dbe4c5df 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -278,6 +278,8 @@ Here are the most common OCIO-specific CMake options (the default values are sho
 - ``-DOCIO_BUILD_PYTHON=ON`` (Set to OFF to not build the Python binding)
 - ``-DOCIO_BUILD_OPENFX=OFF`` (Set to ON to build the OpenFX plug-ins)
 - ``-DOCIO_USE_SSE=ON`` (Set to OFF to turn off SSE CPU performance optimizations)
+  - ``OCIO_USE_SSE`` will be deprecated in favor of ``OCIO_USE_SIMD`` at some point in the future.
+- ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations)
 - ``-DOCIO_BUILD_TESTS=ON`` (Set to OFF to not build the unit tests)
 - ``-DOCIO_BUILD_GPU_TESTS=ON`` (Set to OFF to not build the GPU unit tests)
 - ``-DOCIO_USE_HEADLESS=OFF`` (Set to ON to do headless GPU reendering)
diff --git a/share/cmake/modules/install/InstallOpenEXR.cmake b/share/cmake/modules/install/InstallOpenEXR.cmake
index 419fb4aa59..ffbd3a1f55 100644
--- a/share/cmake/modules/install/InstallOpenEXR.cmake
+++ b/share/cmake/modules/install/InstallOpenEXR.cmake
@@ -201,6 +201,7 @@ if(_OpenEXR_TARGET_CREATE)
         IMPORTED_LOCATION ${IlmThread_LIBRARY}
         INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR}"
         INTERFACE_LINK_LIBRARIES "OpenEXR::IlmThreadConfig;OpenEXR::IlmThreadConfig;OpenEXR::Iex;Threads::Threads"
+        STATIC_LIBRARY_OPTIONS "-no_warning_for_no_symbols"
     )
     set_target_properties(OpenEXR::IlmThreadConfig PROPERTIES
         INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR};${OpenEXR_INCLUDE_DIR}/OpenEXR"
@@ -217,6 +218,7 @@ if(_OpenEXR_TARGET_CREATE)
         IMPORTED_LOCATION ${OpenEXRCore_LIBRARY}
         INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR}"
         INTERFACE_LINK_LIBRARIES "OpenEXR::IlmThreadConfig;ZLIB::ZLIB;\$<LINK_ONLY:Imath::ImathConfig>"
+        STATIC_LIBRARY_OPTIONS "-no_warning_for_no_symbols"
     )
     set_target_properties(OpenEXR::OpenEXRUtil PROPERTIES
         IMPORTED_LOCATION ${OpenEXRUtil_LIBRARY}
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index a9a95d3753..d9aa05d43f 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -11,13 +11,13 @@ set(PLATFORM_LINK_OPTIONS "")
 ###############################################################################
 # Define if SSE2 can be used.
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
     include(CheckSupportSSE2)
 endif()
 
 if(NOT HAVE_SSE2 AND NOT HAVE_SSE2_WITH_SSE2NEON)
     message(STATUS "Disabling SSE optimizations, as the target doesn't support them")
-    set(OCIO_USE_SSE OFF)
+    set(OCIO_USE_SIMD OFF)
 endif()
 
 ###############################################################################
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 5fe522b0d5..791ade3f91 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -292,7 +292,7 @@ target_link_libraries(OpenColorIO
         MINIZIP::minizip-ng
 )
 
-if(OCIO_USE_SSE AND HAVE_SSE2_WITH_SSE2NEON)
+if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON)
     target_link_libraries(OpenColorIO
         PRIVATE
             sse2neon
@@ -329,7 +329,7 @@ if(BUILD_SHARED_LIBS OR (OCIO_BUILD_PYTHON AND UNIX))
     set_property(TARGET OpenColorIO PROPERTY POSITION_INDEPENDENT_CODE ON)
 endif()
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
     if(HAVE_SSE2)
         target_compile_definitions(OpenColorIO
             PRIVATE
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index 53f50d1031..245caeb9cb 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -26,7 +26,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
             xxHash
     )
 
-    if(OCIO_USE_SSE AND HAVE_SSE2_WITH_SSE2NEON)
+    if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON)
         target_link_libraries(${TEST_BINARY}
             PRIVATE
                 sse2neon
@@ -51,7 +51,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
         )
     endif(PRIVATE_INCLUDES)
 
-    if(OCIO_USE_SSE)
+    if(OCIO_USE_SIMD)
         if (HAVE_SSE2)
             target_compile_definitions(${TEST_BINARY}
                 PRIVATE
@@ -65,7 +65,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
                     USE_SSE2_WITH_SSE2NEON
             )
         endif()
-    endif(OCIO_USE_SSE)
+    endif(OCIO_USE_SIMD)
 
     if(WIN32)
         # A windows application linking to eXpat static libraries must
diff --git a/tests/gpu/CMakeLists.txt b/tests/gpu/CMakeLists.txt
index 2245fbf1d5..4a19e6b5d6 100644
--- a/tests/gpu/CMakeLists.txt
+++ b/tests/gpu/CMakeLists.txt
@@ -26,12 +26,12 @@ set(SOURCES
 
 add_executable(test_gpu_exec ${SOURCES})
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
 	target_compile_definitions(test_gpu_exec
 		PRIVATE
 			USE_SSE
 	)
-endif(OCIO_USE_SSE)
+endif(OCIO_USE_SIMD)
 
 set_target_properties(test_gpu_exec PROPERTIES 
 	COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
diff --git a/tests/osl/CMakeLists.txt b/tests/osl/CMakeLists.txt
index 3316720d47..3f327ff295 100644
--- a/tests/osl/CMakeLists.txt
+++ b/tests/osl/CMakeLists.txt
@@ -18,12 +18,12 @@ set(SOURCES
 
 add_executable(test_osl_exec ${SOURCES})
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
     target_compile_definitions(test_osl_exec
         PRIVATE
             USE_SSE
     )
-endif(OCIO_USE_SSE)
+endif(OCIO_USE_SIMD)
 
 set_target_properties(test_osl_exec PROPERTIES 
     COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"

From e1ead90011bff112ea3966c8438a7e06dfb54d86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 17 Feb 2023 08:57:13 -0500
Subject: [PATCH 26/81] Fix last issue discovered by unit test in sseExp2
 Reverted defined changes in LogOpCPU_tests Updated ocio.bat with
 OCIO_USE_SIMD Minors comments changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                       |  1 -
 docs/quick_start/installation.rst    |  5 ++---
 share/dev/windows/ocio.bat           |  2 +-
 src/OpenColorIO/SSE.h                | 14 ++++++--------
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 22 +++++++++++-----------
 5 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ea95a2463..df071b999d 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -173,7 +173,6 @@ endif()
 
 ###############################################################################
 # Optimization / internal linking preferences
-# SIMD refers to either SIMD or Advanced SIMD terminology.
 option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON)
 mark_as_advanced(OCIO_USE_SSE)
 option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations (Will replace OCIO_USE_SSE)" ${OCIO_USE_SSE})
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index 37dbe4c5df..64b973bc0a 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -277,9 +277,8 @@ Here are the most common OCIO-specific CMake options (the default values are sho
 - ``-DOCIO_USE_OIIO_FOR_APPS=OFF`` (Set ON to build tools with OpenImageIO rather than OpenEXR)
 - ``-DOCIO_BUILD_PYTHON=ON`` (Set to OFF to not build the Python binding)
 - ``-DOCIO_BUILD_OPENFX=OFF`` (Set to ON to build the OpenFX plug-ins)
-- ``-DOCIO_USE_SSE=ON`` (Set to OFF to turn off SSE CPU performance optimizations)
-  - ``OCIO_USE_SSE`` will be deprecated in favor of ``OCIO_USE_SIMD`` at some point in the future.
-- ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations)
+- ``-DOCIO_USE_SSE=ON`` (Deprecated -- please use OCIO_USE_SIMD)
+- ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations, such as SSE and NEON)
 - ``-DOCIO_BUILD_TESTS=ON`` (Set to OFF to not build the unit tests)
 - ``-DOCIO_BUILD_GPU_TESTS=ON`` (Set to OFF to not build the GPU unit tests)
 - ``-DOCIO_USE_HEADLESS=OFF`` (Set to ON to do headless GPU reendering)
diff --git a/share/dev/windows/ocio.bat b/share/dev/windows/ocio.bat
index 7f24bc279b..a4762a97d9 100644
--- a/share/dev/windows/ocio.bat
+++ b/share/dev/windows/ocio.bat
@@ -206,7 +206,7 @@ if !DO_CONFIGURE!==1 (
         -DOCIO_BUILD_TESTS=ON^
         -DOCIO_BUILD_GPU_TESTS=ON^
         -DOCIO_BUILD_DOCS=OFF^
-        -DOCIO_USE_SSE=ON^
+        -DOCIO_USE_SIMD=ON^
         -DOCIO_WARNING_AS_ERROR=ON^
         -DOCIO_BUILD_JAVA=OFF^
         "!OCIO_PATH!"
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 57ea87a08f..95d7b88d49 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -6,19 +6,17 @@
 #define INCLUDED_OCIO_SSE_H
 
 
-#ifndef USE_SSE
-    #define USING_CPP 1
-#else
+#if defined(USE_SSE) || defined(USE_SSE2_WITH_SSE2NEON)
 
 // If it is not arm64, same behavior as before.
 #if !defined(__aarch64__)
-    #include <emmintrin.h>
-    #define USING_INTEL_SSE2 1
+    #if defined(USE_SSE)
+        #include <emmintrin.h>
+    #endif
 #elif defined(__aarch64__)
     // ARM architecture A64 (ARM64)
     #if defined(USE_SSE2_WITH_SSE2NEON)
         #include <sse2neon.h>
-        #define USING_INTEL_SSE2_WITH_SSE2NEON 1
     #endif
 #endif
 
@@ -82,7 +80,7 @@ static const __m128i EBIAS = _mm_set1_epi32(EXP_BIAS);
 static const __m128 EONE    = _mm_set1_ps(1.0f);
 static const __m128 EZERO   = _mm_set1_ps(0.0f);
 static const __m128 ENEG126 = _mm_set1_ps(-126.0f);
-static const __m128 EPOS127 = _mm_set1_ps(127.0f);
+static const __m128 EPOS128 = _mm_set1_ps(128.0f);
 
 static const __m128 EPOSINF = _mm_set1_ps(std::numeric_limits<float>::infinity());
 
@@ -293,7 +291,7 @@ inline __m128 sseExp2(__m128 x)
     // potentially happening. When this happens, force the result to positive infinity.
     // Note that as described above, floor_x is inaccurate, so the test here uses x.
     exp2 = sseSelect(                                   // (...) is a mask to select EPOSINF, exp2
-        _mm_cmpgt_ps(x, EPOS127),                       // iexp > EPOS127 ? 0xFFFFFFFF : 0
+        _mm_cmpge_ps(x, EPOS128),                       // iexp > EPOS128 ? 0xFFFFFFFF : 0
         EPOSINF, 
         exp2);
 
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 35ec055856..04430b75f2 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -35,7 +35,7 @@ void TestLog(float logBase)
 
     // LogOpCPU implementation uses optimized logarithm approximation
     // cannot use strict comparison.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error = 5e-5f;
 #else
     const float error = 1e-5f;
@@ -71,7 +71,7 @@ void TestLog(float logBase)
     // SSE implementation of sseLog2 & sseExp2 do not behave like CPU.
     // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below.
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     if (logBase == 10.0f)
     {
         OCIO_CHECK_CLOSE(rgba[16], 38.53184509f, error);
@@ -431,7 +431,7 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error);
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
@@ -477,7 +477,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, numPixels);
 
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
@@ -490,7 +490,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error);
@@ -498,7 +498,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO_CHECK_CLOSE(rgba[6], 0.68141615509f, error);
 
    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_EQUAL(rgba[8], -inf);
     OCIO_CHECK_CLOSE(rgba[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));
@@ -525,7 +525,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error);
@@ -534,7 +534,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_EQUAL(rgba_nols[8], -inf);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba_nols[10]));
 #else
@@ -552,7 +552,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRendererNoBreak = OCIO::GetLogRenderer(lognobreak, true);
     pRendererNoBreak->apply(rgbaImage, rgba_nobreak, numPixels);
 
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error2 = 1e-5f;
 #else
     const float error2 = 1e-7f;
@@ -570,7 +570,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2);
 #else
     OCIO_CHECK_EQUAL(rgba_nobreak[9], inf);
@@ -596,7 +596,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, 3);
 
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;

From bfb21b88cb1171c7f4cea750f7c8d028edee9760 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 17 Feb 2023 09:24:39 -0500
Subject: [PATCH 27/81] Fixing typo, using #ifdef since this is what we were
 using before.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 04430b75f2..06e98fe0da 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -35,7 +35,7 @@ void TestLog(float logBase)
 
     // LogOpCPU implementation uses optimized logarithm approximation
     // cannot use strict comparison.
-#if USE_SSE
+#ifdef USE_SSE
     const float error = 5e-5f;
 #else
     const float error = 1e-5f;
@@ -71,7 +71,7 @@ void TestLog(float logBase)
     // SSE implementation of sseLog2 & sseExp2 do not behave like CPU.
     // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below.
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#if USE_SSE
+#ifdef USE_SSE
     if (logBase == 10.0f)
     {
         OCIO_CHECK_CLOSE(rgba[16], 38.53184509f, error);
@@ -431,7 +431,7 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error);
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
@@ -477,7 +477,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, numPixels);
 
-#if USE_SSE
+#ifdef USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
@@ -490,7 +490,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error);
@@ -498,7 +498,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO_CHECK_CLOSE(rgba[6], 0.68141615509f, error);
 
    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_EQUAL(rgba[8], -inf);
     OCIO_CHECK_CLOSE(rgba[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));
@@ -525,7 +525,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error);
@@ -534,7 +534,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_EQUAL(rgba_nols[8], -inf);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba_nols[10]));
 #else
@@ -552,7 +552,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRendererNoBreak = OCIO::GetLogRenderer(lognobreak, true);
     pRendererNoBreak->apply(rgbaImage, rgba_nobreak, numPixels);
 
-#if USE_SSE
+#ifdef USE_SSE
     const float error2 = 1e-5f;
 #else
     const float error2 = 1e-7f;
@@ -570,7 +570,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2);
 #else
     OCIO_CHECK_EQUAL(rgba_nobreak[9], inf);
@@ -596,7 +596,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, 3);
 
-#if USE_SSE
+#ifdef USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;

From 563dcfa7183d73176b1fedf70a2c5737b5177655 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 23 Feb 2023 10:40:07 -0500
Subject: [PATCH 28/81] Adding a line for universal build option for Cmake
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 docs/quick_start/installation.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index 64b973bc0a..ab5dc74c0e 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -286,6 +286,10 @@ Here are the most common OCIO-specific CMake options (the default values are sho
 - ``-DOCIO_BUILD_DOCS=OFF`` (Set to ON to build the documentation)
 - ``-DOCIO_BUILD_FROZEN_DOCS=OFF`` (Set to ON to update the Python documentation)
 
+On the newest Apple chipset (M1+), a universal build can be done with the following option:
+
+- ``-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"``
+
 Several command-line tools (such as ``ocioconvert``) require reading or writing image files.
 If ``OCIO_USE_OIIO_FOR_APPS=OFF``, these will be built using OpenEXR rather than OpenImageIO
 and therefore you will be limited to using OpenEXR files with these tools rather than the

From de1b4d4608d1c3ad73ed63dc734e8049aa9786b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 3 Mar 2023 11:41:37 -0500
Subject: [PATCH 29/81] Documentations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                    | 4 +++-
 docs/quick_start/installation.rst | 3 +++
 src/OpenColorIO/SSE.h             | 4 ++--
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index df071b999d..aaae15a0e6 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -173,9 +173,11 @@ endif()
 
 ###############################################################################
 # Optimization / internal linking preferences
+# TODO Remove OCIO_USE_SSE once it is fully deprecated.
 option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON)
+# TODO Remove mark_as_advanced once OCIO_USE_SSE is fully deprecated.
 mark_as_advanced(OCIO_USE_SSE)
-option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations (Will replace OCIO_USE_SSE)" ${OCIO_USE_SSE})
+option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ${OCIO_USE_SSE})
 option(OCIO_USE_OIIO_CMAKE_CONFIG "Specify whether to look for OIIO using the generated CMake Config script instead of the custom FindOpenImageIO.cmake script" OFF)
 option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." OFF)
 
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index ab5dc74c0e..dfe55cdd8b 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -290,6 +290,9 @@ On the newest Apple chipset (M1+), a universal build can be done with the follow
 
 - ``-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"``
 
+Please note that OCIO dependencies must be built as universal libraries as well. If you can't build all the 
+dependencies as universal libraries, you can set ``OCIO_INSTAL_EXT_PACKAGES=ALL`` and OCIO will handle it.
+
 Several command-line tools (such as ``ocioconvert``) require reading or writing image files.
 If ``OCIO_USE_OIIO_FOR_APPS=OFF``, these will be built using OpenEXR rather than OpenImageIO
 and therefore you will be limited to using OpenEXR files with these tools rather than the
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 95d7b88d49..81bd15c0dc 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -8,7 +8,7 @@
 
 #if defined(USE_SSE) || defined(USE_SSE2_WITH_SSE2NEON)
 
-// If it is not arm64, same behavior as before.
+// Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM).
 #if !defined(__aarch64__)
     #if defined(USE_SSE)
         #include <emmintrin.h>
@@ -110,7 +110,7 @@ inline __m128 isNegativeSpecial(const __m128 x)
 // Bit-wise select function in SSE version 2
 //
 // Return the parameter arg_false bit where the parameter mask is 0x0,
-// return the parameter arg_true bit where the mask is 0x1.
+// return the parameter arg_true bit where the mask is 1.
 //
 // Algorithm Explanation:
 //

From 71ab18cf5d6c9f83500a337c82a8a5a6f388e52d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Mon, 6 Mar 2023 15:16:29 -0500
Subject: [PATCH 30/81] Universal build is the default for APPLE documentation
 tweak
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                    | 10 +++++++++-
 docs/quick_start/installation.rst | 15 +++++++++------
 src/OpenColorIO/SSE.h             |  2 +-
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index aaae15a0e6..f3b7d7d31a 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,6 +25,14 @@ if(APPLE AND NOT DEFINED CMAKE_OSX_DEPLOYMENT_TARGET)
 endif()
 
 
+###############################################################################
+# Universal library is the default for ARM architecture under MacOS.
+
+if(APPLE AND (NOT DEFINED CMAKE_OSX_ARCHITECTURES OR CMAKE_OSX_ARCHITECTURES STREQUAL ""))
+    set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "Default OS X architectures" FORCE)
+endif()
+
+
 ###############################################################################
 # Project definition.
 
@@ -174,7 +182,7 @@ endif()
 ###############################################################################
 # Optimization / internal linking preferences
 # TODO Remove OCIO_USE_SSE once it is fully deprecated.
-option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON)
+option(OCIO_USE_SSE "Specify whether to enable SSE (supplanted by OCIO_USE_SIMD)" ON)
 # TODO Remove mark_as_advanced once OCIO_USE_SSE is fully deprecated.
 mark_as_advanced(OCIO_USE_SSE)
 option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ${OCIO_USE_SSE})
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index dfe55cdd8b..fc58c5ec6f 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -286,12 +286,15 @@ Here are the most common OCIO-specific CMake options (the default values are sho
 - ``-DOCIO_BUILD_DOCS=OFF`` (Set to ON to build the documentation)
 - ``-DOCIO_BUILD_FROZEN_DOCS=OFF`` (Set to ON to update the Python documentation)
 
-On the newest Apple chipset (M1+), a universal build can be done with the following option:
-
-- ``-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"``
-
-Please note that OCIO dependencies must be built as universal libraries as well. If you can't build all the 
-dependencies as universal libraries, you can set ``OCIO_INSTAL_EXT_PACKAGES=ALL`` and OCIO will handle it.
+On the MacOS under the ARM architecture, the default is to make a universal build 
+(natively supporting both the Intel and ARM processors). The ``-DCMAKE_OSX_ARCHITECTURES`` option 
+may be set to just arm64 or x86_64 to override the default value, which is ``arm64;x86_64``.
+
+When doing a universal build, note that the OCIO dependencies must be built as universal libraries 
+too. If you are running in OCIO_INSTALL_EXT_PACKAGES=MISSING or NONE mode, your build will fail if 
+any of your installed libraries are not universal. The easiest way to address this is to set 
+OCIO_INSTALL_EXT_PACKAGES=ALL in order to let OCIO build everything. Alternatively, you may set 
+CMAKE_OSX_ARCHITECTURES to just the platform you are targeting.
 
 Several command-line tools (such as ``ocioconvert``) require reading or writing image files.
 If ``OCIO_USE_OIIO_FOR_APPS=OFF``, these will be built using OpenEXR rather than OpenImageIO
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 81bd15c0dc..1aa853997e 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -33,7 +33,7 @@ namespace OCIO_NAMESPACE
 #if defined(__aarch64__)
     #if defined(USE_SSE2_WITH_SSE2NEON)
         // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to 
-        // NaN handling.
+        // NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior.
 
         // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were 
         // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the 

From 98a7c43283d46bb5b65a182d5c6944370a8d28c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Tue, 7 Mar 2023 09:59:05 -0500
Subject: [PATCH 31/81] documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f3b7d7d31a..687c0e4883 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,7 +26,7 @@ endif()
 
 
 ###############################################################################
-# Universal library is the default for ARM architecture under MacOS.
+# By default, build the library, tests, tools, and Python binding as universal binaries for macOS.
 
 if(APPLE AND (NOT DEFINED CMAKE_OSX_ARCHITECTURES OR CMAKE_OSX_ARCHITECTURES STREQUAL ""))
     set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "Default OS X architectures" FORCE)

From 17dfee8838e33125a23df5b7c0366c569ab4e1ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Mon, 13 Mar 2023 10:49:01 -0400
Subject: [PATCH 32/81] Fixing issue for the static build test in CI workflow
 Removing Findsse2neon as it is not needed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                                | 26 +++++++--
 share/cmake/modules/Findsse2neon.cmake        | 58 -------------------
 .../modules/install/Installsse2neon.cmake     | 15 +++--
 share/cmake/utils/CheckSupportSSE2.cmake      |  8 +--
 src/OpenColorIO/CMakeLists.txt                |  5 +-
 tests/cpu/CMakeLists.txt                      |  5 +-
 6 files changed, 39 insertions(+), 78 deletions(-)
 delete mode 100644 share/cmake/modules/Findsse2neon.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 687c0e4883..e972eb8023 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,11 +211,29 @@ if(HAVE_NEON AND OCIO_USE_SIMD)
     # Install sse2neon. Please note that sse2neon is downloaded during the configure step as it is
     # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
 
-    # Could use ocio_handle_dependency once this is rebased from main (once the CMake min and 
-    # recommended version branch is merged).
-    find_package(sse2neon QUIET)
-    if (NOT sse2neon_FOUND)
+    # Sse2neon is not treated like an imported target. The logic to find sse2neon is here because 
+    # a find module is not suitable for sse2neon's use case.
+    find_path(sse2neon_INCLUDE_DIR
+        NAMES
+            sse2neon.h
+        HINTS
+            ${sse2neon_ROOT}
+        PATH_SUFFIXES
+            sse2neon
+            include
+            sse2neon/include
+    )
+
+    if (NOT sse2neon_INCLUDE_DIR)
         include(Installsse2neon)
+    else()
+        # Any changes to the following lines must be replicated in Installsse2neon.cmake as well.
+        # Create a target for sse2neon (non-imported)
+        add_library(sse2neon INTERFACE)
+        # Add the include directories to the target.
+        target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}")
+        # Ignore the warnings coming from sse2neon.h as they are false positives.
+        target_compile_options(sse2neon INTERFACE -Wno-unused-parameter)
     endif()
 endif()
 
diff --git a/share/cmake/modules/Findsse2neon.cmake b/share/cmake/modules/Findsse2neon.cmake
deleted file mode 100644
index 7dea6740cb..0000000000
--- a/share/cmake/modules/Findsse2neon.cmake
+++ /dev/null
@@ -1,58 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright Contributors to the OpenColorIO Project.
-#
-# Locate sse2neon (header-only version)
-#
-# Variables defined by this module:
-#   sse2neon_FOUND          - Indicate whether the library was found or not
-#   sse2neon_INCLUDE_DIR    - Location of the header files
-#
-# Global targets defined by this module:
-#   sse2neon
-###############################################################################
-### Try to find package ###
-
-if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
-    # Find include directory
-    find_path(sse2neon_INCLUDE_DIR
-        NAMES
-            sse2neon.h
-        HINTS
-            ${sse2neon_ROOT}
-        PATH_SUFFIXES
-            include
-            sse2neon/include
-    )
-
-    # Override REQUIRED if package can be installed
-    if(OCIO_INSTALL_EXT_PACKAGES STREQUAL MISSING)
-        set(sse2neon_FIND_REQUIRED FALSE)
-    endif()
-
-    include(FindPackageHandleStandardArgs)
-    find_package_handle_standard_args(sse2neon
-        REQUIRED_VARS 
-            sse2neon_INCLUDE_DIR 
-    )
-    set(sse2neon_FOUND ${sse2neon_FOUND})
-endif()
-
-###############################################################################
-### Configure target ###
-
-if(sse2neon_FOUND AND NOT TARGET sse2neon)
-    # INTERFACE type since we know that this is a header-only library.
-    add_library(sse2neon INTERFACE IMPORTED GLOBAL)
-    set(_sse2neon_TARGET_CREATE TRUE)
-endif()
-
-###############################################################################
-### Configure target ###
-
-if(_sse2neon_TARGET_CREATE)
-    set_target_properties(sse2neon PROPERTIES
-        INTERFACE_INCLUDE_DIRECTORIES ${sse2neon_INCLUDE_DIR}
-    )
-
-    mark_as_advanced(sse2neon_INCLUDE_DIR)
-endif()
\ No newline at end of file
diff --git a/share/cmake/modules/install/Installsse2neon.cmake b/share/cmake/modules/install/Installsse2neon.cmake
index 16b47798cd..5f0f810ca1 100644
--- a/share/cmake/modules/install/Installsse2neon.cmake
+++ b/share/cmake/modules/install/Installsse2neon.cmake
@@ -29,8 +29,15 @@ if(NOT sse2neon_POPULATED)
   set(_EXT_DIST_INCLUDE "${CMAKE_BINARY_DIR}/ext/dist/${CMAKE_INSTALL_INCLUDEDIR}")
   file(COPY "${sse2neon_SOURCE_DIR}/sse2neon.h" DESTINATION "${_EXT_DIST_INCLUDE}/sse2neon")
 
-  add_library(sse2neon INTERFACE IMPORTED GLOBAL)
-  set_target_properties(sse2neon PROPERTIES
-      INTERFACE_INCLUDE_DIRECTORIES "${_EXT_DIST_INCLUDE}/sse2neon"
-  )
+  # sse2neon_INCLUDE_DIR is used internally for CheckSupportSSE2.cmake and to create sse2neon
+  # target for OCIO.
+  set(sse2neon_INCLUDE_DIR "${sse2neon_SOURCE_DIR}")
+
+  # Any changes to the following lines must be replicated in ./CMakeLists.txt as well.
+  # Create a target for sse2neon (non-imported)
+  add_library(sse2neon INTERFACE)
+  # Add the include directories to the target.
+  target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}")
+  # Ignore the warnings coming from sse2neon.h as they are false positives.
+  target_compile_options(sse2neon INTERFACE -Wno-unused-parameter)
 endif()
\ No newline at end of file
diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index 26bc0b396d..07fecbd7a5 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -4,7 +4,7 @@
 include(CheckCXXSourceCompiles)
 
 set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}")
-set(_cmake_required_libraries_orig "${CMAKE_REQUIRED_LIBRARIES}")
+set(_cmake_required_includes_orig "${CMAKE_REQUIRED_INCLUDES}")
 set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
 
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
@@ -49,7 +49,7 @@ elseif(APPLE AND HAVE_NEON)
         set (CMAKE_OSX_ARCHITECTURES "${current_arch}")
         
         if(current_arch STREQUAL arm64)
-            set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+            set(CMAKE_REQUIRED_INCLUDES ${sse2neon_INCLUDE_DIR})
             set(_sse2_header_ "#include <sse2neon.h>")
             set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON")
         elseif(current_arch STREQUAL x86_64)
@@ -63,11 +63,11 @@ elseif(APPLE AND HAVE_NEON)
 endif()
 
 set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}")
-set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_orig}")
+set(CMAKE_REQUIRED_INCLUDES "${_cmake_required_includes_orig}")
 set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}")
 
 unset(_cmake_required_flags_orig)
-unset(_cmake_required_libraries_orig)
+unset(_cmake_required_includes_orig)
 unset(_cmake_osx_architectures_orig)
 
 
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 791ade3f91..608cbac63a 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -293,10 +293,7 @@ target_link_libraries(OpenColorIO
 )
 
 if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON)
-    target_link_libraries(OpenColorIO
-        PRIVATE
-            sse2neon
-    )
+    target_link_libraries(OpenColorIO PRIVATE $<BUILD_INTERFACE:sse2neon>)
 endif()
 
 if(APPLE)
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index 245caeb9cb..edbb21ad2c 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -27,10 +27,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
     )
 
     if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON)
-        target_link_libraries(${TEST_BINARY}
-            PRIVATE
-                sse2neon
-        )
+        target_link_libraries(${TEST_BINARY} PRIVATE sse2neon)
     endif()
 
     if(APPLE)

From db3db1ac556215fb93711054035eb8f7bf5fbf7b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 24 Mar 2023 10:15:24 -0400
Subject: [PATCH 33/81] Fixing an issue during rebase. Removing
 ocio_check_dependency_version.cmake as it is not needed anymore. Changed how
 we detect if the installed OpenEXR and OpenImageIO are compatible with OCIO.
 It should be more robust now.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                                |  1 -
 .../macros/CheckForOpenEXRCompatibility.cmake | 66 +++++++++++++++++++
 .../ocio_check_dependency_version.cmake       | 38 -----------
 share/cmake/modules/FindExtPackages.cmake     | 13 ++--
 share/cmake/utils/CompilerFlags.cmake         |  3 -
 5 files changed, 70 insertions(+), 51 deletions(-)
 create mode 100644 share/cmake/macros/CheckForOpenEXRCompatibility.cmake
 delete mode 100644 share/cmake/macros/ocio_check_dependency_version.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e972eb8023..2752af64fb 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -186,7 +186,6 @@ option(OCIO_USE_SSE "Specify whether to enable SSE (supplanted by OCIO_USE_SIMD)
 # TODO Remove mark_as_advanced once OCIO_USE_SSE is fully deprecated.
 mark_as_advanced(OCIO_USE_SSE)
 option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ${OCIO_USE_SSE})
-option(OCIO_USE_OIIO_CMAKE_CONFIG "Specify whether to look for OIIO using the generated CMake Config script instead of the custom FindOpenImageIO.cmake script" OFF)
 option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." OFF)
 
 ###############################################################################
diff --git a/share/cmake/macros/CheckForOpenEXRCompatibility.cmake b/share/cmake/macros/CheckForOpenEXRCompatibility.cmake
new file mode 100644
index 0000000000..bd79eb96e1
--- /dev/null
+++ b/share/cmake/macros/CheckForOpenEXRCompatibility.cmake
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Check for compatibility between OpenEXR and OpenImageIO since OCIO requires OpenEXR 3+.
+#
+
+message(STATUS "Checking if the OpenImageIO found is built with OpenEXR 3+...")
+
+find_path (OpenImageIO_INCLUDE_DIR
+    NAMES
+        OpenImageIO/imageio.h
+    HINTS
+        ${OpenImageIO_ROOT}
+        # Assuming that OpenImageIO was installed normally, go back a few folders down
+        # to get the equivalent of OpenImageIO_ROOT.
+        ${OpenImageIO_DIR}/../../..
+    PATH_SUFFIXES
+        OpenImageIO/include
+        include
+)
+
+if (NOT OpenImageIO_INCLUDE_DIR)
+    message(STATUS "${ColorWarning}Could not find OpenImageIO header to evaluate the OpenEXR version.")
+    message(STATUS "Please provide the OpenImageIO_DIR variable.")
+    message(STATUS "If your OpenImageIO's files are located in different root directory, \
+please provide the OpenImageIO_ROOT where the include files are located.${ColorReset}")
+endif()
+
+# Try to figure out version number
+set (OIIO_VERSION_HEADER "${OpenImageIO_INCLUDE_DIR}/OpenImageIO/oiioversion.h")
+if (EXISTS "${OIIO_VERSION_HEADER}")
+    file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_MAJOR .*$")
+    string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_MAJOR ${TMP})
+    file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_MINOR .*$")
+    string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_MINOR ${TMP})
+    file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_PATCH .*$")
+    string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_PATCH ${TMP})
+    file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_TWEAK .*$")
+    if (TMP)
+        string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_TWEAK ${TMP})
+    else ()
+        set (OpenImageIO_VERSION_TWEAK 0)
+    endif ()
+    set (OpenImageIO_VERSION "${OpenImageIO_VERSION_MAJOR}.${OpenImageIO_VERSION_MINOR}.${OpenImageIO_VERSION_PATCH}.${OpenImageIO_VERSION_TWEAK}")
+endif ()
+
+set (OIIO_IMATH_HEADER "${OpenImageIO_INCLUDE_DIR}/OpenImageIO/Imath.h")
+if (EXISTS "${OIIO_IMATH_HEADER}")
+    file(STRINGS "${OIIO_IMATH_HEADER}" TMP REGEX "^#define OIIO_USING_IMATH .*$")
+    string(REGEX MATCHALL "[0-9]" OIIO_IMATH_VERSION ${TMP})
+    if (OIIO_IMATH_VERSION LESS 3)
+        message(STATUS "Skipping OpenImageIO built against OpenEXR 2, please use version 3 or greater.")
+    else()
+        set(is_OpenEXR_VERSION_valid TRUE)
+    endif()
+endif()
+
+# clean up variables
+unset(OpenImageIO_INCLUDE_DIR)
+unset(OIIO_VERSION_HEADER)
+unset(OIIO_VERSION_MAJOR)
+unset(OIIO_VERSION_MINOR)
+unset(OIIO_VERSION_PATCH)
+unset(OIIO_VERSION_TWEAK)
+unset(OIIO_IMATH_HEADER)
+unset(OIIO_IMATH_VERSION)
\ No newline at end of file
diff --git a/share/cmake/macros/ocio_check_dependency_version.cmake b/share/cmake/macros/ocio_check_dependency_version.cmake
deleted file mode 100644
index 2c2b741192..0000000000
--- a/share/cmake/macros/ocio_check_dependency_version.cmake
+++ /dev/null
@@ -1,38 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright Contributors to the OpenColorIO Project.
-
-###################################################################################################
-# ocio_check_dependency_version try to find the specified dependency and validate the version.
-#
-# Note that a function is used here to scoped-in any variables set by find_package. We do not want
-# those variables to be propagated to the caller of the function.
-#
-# Argument:
-#   dep_name is the name of the dependency (package). Please note that dep_name is case sensitive.
-#
-###################################################################################################
-
-function (ocio_check_dependency_version dep_name output)
-    cmake_parse_arguments(
-        # prefix - Must be different than the one used in ocio_handle_dependency.cmake.
-        ocio_cdv
-        # options
-        ""
-        # one value keywords
-        "MIN_VERSION"
-        # multi value keywords
-        ""
-        # args
-        ${ARGN})
-        
-    if (dep_name)
-        find_package(${dep_name} ${ocio_cdv_UNPARSED_ARGUMENTS})
-        if (ocio_cdv_MIN_VERSION AND ${dep_name}_VERSION)
-            if (${${dep_name}_VERSION} VERSION_GREATER_EQUAL ocio_cdv_MIN_VERSION)
-                set(${output} TRUE)
-            else()
-                set(${output} FALSE)
-            endif()
-        endif()
-    endif()
-endfunction()
\ No newline at end of file
diff --git a/share/cmake/modules/FindExtPackages.cmake b/share/cmake/modules/FindExtPackages.cmake
index 870b039bfc..48bc82de07 100644
--- a/share/cmake/modules/FindExtPackages.cmake
+++ b/share/cmake/modules/FindExtPackages.cmake
@@ -197,13 +197,10 @@ if((OCIO_BUILD_APPS AND OCIO_USE_OIIO_FOR_APPS) OR OCIO_BUILD_TESTS)
     # Supported from OIIO 2.4+. Setting this for lower versions doesn't affect anything.
     set(OPENIMAGEIO_CONFIG_DO_NOT_FIND_IMATH 1)
 
-    include(ocio_check_dependency_version)
-    # Since OpenImageIO will try to find OpenEXR through its OpenImageIOConfig.cmake file,
-    # let's try to find OpenEXR first and if the version is too old, OCIO will not try to find
-    # OpenImageIO.
-    ocio_check_dependency_version(  OpenEXR "is_OpenEXR_VERSION_valid" 
-                                    MIN_VERSION ${OpenEXR_MININUM_VERSION} 
-                                    CONFIG)
+    set(is_OpenEXR_VERSION_valid FALSE)
+    # Check for compatibility between OpenEXR and OpenImageIO.
+    # Will set is_OpenEXR_VERSION_valid to TRUE if valid.
+    include(CheckForOpenEXRCompatibility)
 
     # Do not try to find OpenImageIO if the version of OpenEXR is too old.                                    
     if (is_OpenEXR_VERSION_valid)
@@ -227,8 +224,6 @@ if((OCIO_BUILD_APPS AND OCIO_USE_OIIO_FOR_APPS) OR OCIO_BUILD_TESTS)
                                  MIN_VERSION ${OIIO_VERSION}
                                  RECOMMENDED_VERSION ${OIIO_RECOMMENDED_VERSION}
                                  PROMOTE_TARGET OpenImageIO::OpenImageIO)
-    else()
-        message(WARNING "Skipping OpenImageIO because the OpenEXR found by OpenImageIO is too old (under ${OpenEXR_MININUM_VERSION})")
     endif()
 endif()
 
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index 44fe79feaa..d9aa05d43f 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -106,7 +106,6 @@ set_unless_defined(CMAKE_C_VISIBILITY_PRESET hidden)
 set_unless_defined(CMAKE_CXX_VISIBILITY_PRESET hidden)
 set_unless_defined(CMAKE_VISIBILITY_INLINES_HIDDEN YES)
 
-<<<<<<< HEAD
 
 ###############################################################################
 # Define if SSE2 can be used.
@@ -121,8 +120,6 @@ if(NOT HAVE_SSE2)
 endif(NOT HAVE_SSE2)
 
 
-=======
->>>>>>> 5585483c9f407d03923cd44f3d802a6111ec62fe
 ###############################################################################
 # Define RPATH.
 

From 363fb534d90cc4a389a34a82d72992b269f76024 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?=
 <105517825+cedrik-fuoco-adsk@users.noreply.github.com>
Date: Fri, 2 Dec 2022 14:23:01 -0500
Subject: [PATCH 34/81] Increasing version to 2.3.0 (#1717)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f5089e735a..d7ea2b176f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,7 @@ endif()
 # Project definition.
 
 project(OpenColorIO 
-    VERSION 2.2.0
+    VERSION 2.3.0
     DESCRIPTION "OpenColorIO (OCIO) is a complete color management solution"
     HOMEPAGE_URL https://github.com/AcademySoftwareFoundation/OpenColorIO
     LANGUAGES CXX C)

From 7e78a5baeab2dc317dc46fd657c215c97102eea6 Mon Sep 17 00:00:00 2001
From: bartstyczen <93516126+bartstyczen@users.noreply.github.com>
Date: Sat, 3 Dec 2022 01:56:03 +0100
Subject: [PATCH 35/81] replace texture2D function with texture for GLSL 1.3
 (#1723)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Bart Styczen <bart.styczen@cine.dev>

Signed-off-by: Bart Styczen <bart.styczen@cine.dev>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 src/OpenColorIO/GpuShaderUtils.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/OpenColorIO/GpuShaderUtils.cpp b/src/OpenColorIO/GpuShaderUtils.cpp
index 83689d8ff1..a3adefbdc0 100644
--- a/src/OpenColorIO/GpuShaderUtils.cpp
+++ b/src/OpenColorIO/GpuShaderUtils.cpp
@@ -143,11 +143,15 @@ std::string getTexSample(GpuLanguage lang,
     switch (lang)
     {
         case GPU_LANGUAGE_GLSL_1_2:
-        case GPU_LANGUAGE_GLSL_1_3:
         {
             kw << "texture" << N << "D(" << samplerName << ", " << coords << ")";
             break;
         }
+        case GPU_LANGUAGE_GLSL_1_3:
+        {
+            kw << "texture(" << samplerName << ", " << coords << ")";
+            break;
+        }
         case GPU_LANGUAGE_GLSL_ES_1_0:
         {
             if (N == 1) {

From e9b8b5291a8ceb90e7d5cb11f466cb080b4e23cd Mon Sep 17 00:00:00 2001
From: FantasqueX <voidren@outlook.com>
Date: Sat, 3 Dec 2022 11:34:58 +0800
Subject: [PATCH 36/81] CheckSupportSSE2: Fix sse flags unexpected propagation
 (#1721)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Set function will affects all variables in current directory. If sse
flags are added in CheckSupportSSE2.cmake, they will remain in
Findminizip-ng.cmake which will cause liblzma cannot be detected. This
patch keep CMAKE_REQUIRED_FLAGS being changed only in current file
scope. This patch has been tested on riscv64.

Signed-off-by: Letu Ren <fantasquex@gmail.com>

Signed-off-by: Letu Ren <fantasquex@gmail.com>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 share/cmake/utils/CheckSupportSSE2.cmake | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index 3d97bf8a0a..f30bbb763c 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -3,6 +3,8 @@
 
 include(CheckCXXSourceCompiles)
 
+set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
+
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger
     # unrelated warnings causing a detection failure. So, the code disables all warnings to focus
@@ -27,4 +29,7 @@ check_cxx_source_compiles ("
     }"
     HAVE_SSE2)
 
+set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
+unset(_cmake_required_flags_old)
+
 mark_as_advanced(HAVE_SSE2)

From 2ac4ab956a2de66c40d7f5105da9aa8b070009d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?=
 <105517825+cedrik-fuoco-adsk@users.noreply.github.com>
Date: Mon, 5 Dec 2022 17:33:12 -0500
Subject: [PATCH 37/81] Adsk contrib - Fix issue with minizip build (#1725)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* - Refactoring how OCIO search for minizip-ng. The first step is to search for an external minizip-ng. If not found, search for minizip-ng with MZ_COMPAT=ON (libminizip). If it is not found either, download and install minizip-ng with MZ_COMPAT=OFF.
- Removing the minizip-ng part for the includes for minizip-ng headers.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Update comments

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Improved find_package in Config mode (adding it back)
Added missing scripts to install minizip-ng and zlib for the analysis workflow

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Adding +x permissions for install_minizip_ng and zlib
Fixing path to find zlib in install_minizip-ng.sh

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Changing target name to match the one used by minizip-ng library (+ using the imported target instead of creating a new one when minizip-ng is found)

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 .github/workflows/analysis_workflow.yml      |   9 +
 share/ci/scripts/multi/install_minizip-ng.sh |  57 ++++
 share/ci/scripts/multi/install_zlib.sh       |  40 +++
 share/cmake/modules/Findminizip-ng.cmake     | 289 ++++++++++++-------
 share/cmake/modules/Findminizip.cmake        | 172 +++++++++++
 src/OpenColorIO/CMakeLists.txt               |   2 +-
 src/OpenColorIO/OCIOZArchive.cpp             |  20 +-
 src/apps/ocioarchive/CMakeLists.txt          |   2 +-
 src/apps/ocioarchive/main.cpp                |  10 +-
 tests/cpu/CMakeLists.txt                     |   2 +-
 10 files changed, 479 insertions(+), 124 deletions(-)
 create mode 100755 share/ci/scripts/multi/install_minizip-ng.sh
 create mode 100755 share/ci/scripts/multi/install_zlib.sh
 create mode 100644 share/cmake/modules/Findminizip.cmake

diff --git a/.github/workflows/analysis_workflow.yml b/.github/workflows/analysis_workflow.yml
index fb5d8313b7..3b7443da57 100644
--- a/.github/workflows/analysis_workflow.yml
+++ b/.github/workflows/analysis_workflow.yml
@@ -104,12 +104,15 @@ jobs:
         run: |
           share/ci/scripts/multi/install_pugixml.sh latest
       - name: Install fixed ext package versions
+        # Minizip-ng depends on ZLIB. ZLIB must be installed first.
         run: |
           share/ci/scripts/multi/install_expat.sh 2.4.1 $EXT_PATH
           share/ci/scripts/multi/install_lcms2.sh 2.2 $EXT_PATH
           share/ci/scripts/multi/install_yaml-cpp.sh 0.7.0 $EXT_PATH
           share/ci/scripts/multi/install_pystring.sh 1.1.3 $EXT_PATH
           share/ci/scripts/multi/install_pybind11.sh 2.9.2 $EXT_PATH
+          share/ci/scripts/multi/install_zlib.sh 1.2.12 $EXT_PATH
+          share/ci/scripts/multi/install_minizip-ng.sh 3.0.6 $EXT_PATH
       - name: Install latest ext package versions
         run: |
           share/ci/scripts/multi/install_imath.sh latest $EXT_PATH
@@ -206,12 +209,15 @@ jobs:
           share/ci/scripts/macos/install_boost.sh latest
           share/ci/scripts/multi/install_pugixml.sh latest $EXT_PATH
       - name: Install fixed ext package versions
+        # Minizip-ng depends on ZLIB. ZLIB must be installed first.
         run: |
           share/ci/scripts/multi/install_expat.sh 2.4.1 $EXT_PATH
           share/ci/scripts/multi/install_lcms2.sh 2.2 $EXT_PATH
           share/ci/scripts/multi/install_yaml-cpp.sh 0.7.0 $EXT_PATH
           share/ci/scripts/multi/install_pystring.sh 1.1.3 $EXT_PATH
           share/ci/scripts/multi/install_pybind11.sh 2.9.2 $EXT_PATH
+          share/ci/scripts/multi/install_zlib.sh 1.2.12 $EXT_PATH
+          share/ci/scripts/multi/install_minizip-ng.sh 3.0.6 $EXT_PATH
       - name: Install latest ext package versions
         run: |
           share/ci/scripts/multi/install_imath.sh latest $EXT_PATH
@@ -325,12 +331,15 @@ jobs:
           share/ci/scripts/multi/install_pugixml.sh latest $EXT_PATH
         shell: bash
       - name: Install fixed ext package versions
+        # Minizip-ng depends on ZLIB. ZLIB must be installed first.
         run: |
           share/ci/scripts/multi/install_lcms2.sh 2.2 $EXT_PATH
           share/ci/scripts/multi/install_yaml-cpp.sh 0.7.0 $EXT_PATH
           share/ci/scripts/multi/install_pystring.sh 1.1.3 $EXT_PATH
           share/ci/scripts/multi/install_pybind11.sh 2.9.2 $EXT_PATH
           share/ci/scripts/multi/install_expat.sh 2.4.1 $EXT_PATH
+          share/ci/scripts/multi/install_zlib.sh 1.2.12 $EXT_PATH
+          share/ci/scripts/multi/install_minizip-ng.sh 3.0.6 $EXT_PATH
         shell: bash
       # OSL not installed due to LLVM compilation time.
       - name: Install latest ext package versions
diff --git a/share/ci/scripts/multi/install_minizip-ng.sh b/share/ci/scripts/multi/install_minizip-ng.sh
new file mode 100755
index 0000000000..b8c5aab75a
--- /dev/null
+++ b/share/ci/scripts/multi/install_minizip-ng.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+set -ex
+
+MINIZIPNG_VERSION="$1"
+INSTALL_TARGET="$2"
+
+MINIZIPNG_MAJOR_MINOR=$(echo "${MINIZIPNG_VERSION}" | cut   -d. -f-2)
+MINIZIPNG_MAJOR=$(echo "${MINIZIPNG_VERSION}" | cut   -d. -f-1)
+MINIZIPNG_MINOR=$(echo "${MINIZIPNG_MAJOR_MINOR}" | cut   -d. -f2-)
+MINIZIPNG_PATCH=$(echo "${MINIZIPNG_VERSION}" | cut   -d. -f3-)
+MINIZIPNG_VERSION_U="${MINIZIPNG_MAJOR}.${MINIZIPNG_MINOR}.${MINIZIPNG_PATCH}"
+
+git clone https://github.com/zlib-ng/minizip-ng
+cd minizip-ng
+
+if [ "$MINIZIPNG_VERSION" == "latest" ]; then
+    LATEST_TAG=$(git describe --abbrev=0 --tags)
+    git checkout tags/${LATEST_TAG} -b ${LATEST_TAG}
+else
+    git checkout tags/${MINIZIPNG_VERSION_U} -b ${MINIZIPNG_VERSION_U}
+fi
+
+mkdir build
+cd build
+
+cmake -DCMAKE_BUILD_TYPE=Release \
+      ${INSTALL_TARGET:+"-DCMAKE_INSTALL_PREFIX="${INSTALL_TARGET}""} \
+      -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+      -DBUILD_SHARED_LIBS=OFF \
+      -DMZ_OPENSSL=OFF \
+      -DMZ_LIBBSD=OFF \
+      -DMZ_BUILD_TESTS=OFF \
+      -DMZ_COMPAT=OFF \
+      -DMZ_BZIP2=OFF \
+      -DMZ_LZMA=OFF \
+      -DMZ_LIBCOMP=OFF \
+      -DMZ_ZSTD=OFF \
+      -DMZ_PKCRYPT=OFF \
+      -DMZ_WZAES=OFF \
+      -DMZ_SIGNING=OFF \
+      -DMZ_ZLIB=ON \
+      -DMZ_ICONV=OFF \
+      -DMZ_FETCH_LIBS=OFF \
+      -DMZ_FORCE_FETCH_LIBS=OFF \
+      -DZLIB_LIBRARY=${INSTALL_TARGET}/${CMAKE_INSTALL_LIBDIR} \
+      -DZLIB_INCLUDE_DIR=${INSTALL_TARGET}/include \
+      ../.
+cmake --build . \
+      --target install \
+      --config Release \
+      --parallel 2
+
+cd ../..
+rm -rf minizip-ng
diff --git a/share/ci/scripts/multi/install_zlib.sh b/share/ci/scripts/multi/install_zlib.sh
new file mode 100755
index 0000000000..1bd5460641
--- /dev/null
+++ b/share/ci/scripts/multi/install_zlib.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+set -ex
+
+ZLIB_VERSION="$1"
+INSTALL_TARGET="$2"
+
+ZLIB_MAJOR_MINOR=$(echo "${ZLIB_VERSION}" | cut   -d. -f-2)
+ZLIB_MAJOR=$(echo "${ZLIB_VERSION}" | cut   -d. -f-1)
+ZLIB_MINOR=$(echo "${ZLIB_MAJOR_MINOR}" | cut   -d. -f2-)
+ZLIB_PATCH=$(echo "${ZLIB_VERSION}" | cut   -d. -f3-)
+ZLIB_VERSION_U="${ZLIB_MAJOR}.${ZLIB_MINOR}.${ZLIB_PATCH}"
+
+git clone https://github.com/madler/zlib
+cd zlib
+
+if [ "$ZLIB_VERSION" == "latest" ]; then
+    LATEST_TAG=$(git describe --abbrev=0 --tags)
+    git checkout tags/${LATEST_TAG} -b ${LATEST_TAG}
+else
+    git checkout tags/v${ZLIB_VERSION_U} -b v${ZLIB_VERSION_U}
+fi
+
+mkdir build
+cd build
+
+cmake -DCMAKE_BUILD_TYPE=Release \
+      ${INSTALL_TARGET:+"-DCMAKE_INSTALL_PREFIX="${INSTALL_TARGET}""} \
+      -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+      -DBUILD_SHARED_LIBS=OFF \
+      ../.
+cmake --build . \
+      --target install \
+      --config Release \
+      --parallel 2
+
+cd ../..
+rm -rf zlib
diff --git a/share/cmake/modules/Findminizip-ng.cmake b/share/cmake/modules/Findminizip-ng.cmake
index 528d1d0d3e..b0136abc5f 100644
--- a/share/cmake/modules/Findminizip-ng.cmake
+++ b/share/cmake/modules/Findminizip-ng.cmake
@@ -3,6 +3,12 @@
 #
 # Locate or install minizip-ng
 #
+# This module will try to do the following:
+# 1) Locate minizip-ng
+# 2) If minizip-ng cannot be found, it will try to find minizip (minizip-ng with MZ_COMPAT=ON)
+# 3) If minizip-ng with MZ_COMPAT=ON cannot be found, minizip-ng will be downloaded, built and
+#    installed.
+#
 # Variables defined by this module:
 #   minizip-ng_FOUND        - If FALSE, do not try to link to minizip-ng
 #   minizip-ng_LIBRARY      - minizip-ng library to link to
@@ -10,7 +16,18 @@
 #   minizip-ng_VERSION      - The version of the library
 #
 # Targets defined by this module:
-#   minizip-ng::minizip-ng - IMPORTED target, if found
+#   MINIZIP::minizip-ng - IMPORTED target, if found
+#
+# If minizip-ng is not installed in a standard path, you can use the minizip-ng_ROOT 
+# variable to tell CMake where to find it. If it is not found and 
+# OCIO_INSTALL_EXT_PACKAGES is set to MISSING or ALL, minizip-ng will be downloaded, 
+# built, and statically-linked into libOpenColorIO at build time.
+#
+# For external builds of minizip-ng, please note that the same build options should be used.
+# Using more options, such as enabling other compression methods, will provoke linking issue
+# since OCIO is not linking to those libraries.
+#
+# e.g. Setting MZ_BZIP2=ON will cause linking issue since OCIO will not be linked against BZIP2.
 #
 ###############################################################################
 ### Try to find package ###
@@ -18,13 +35,58 @@
 if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
     if(NOT DEFINED minizip-ng_ROOT)
         # Search for minizip-ng-config.cmake
-        find_package(minizip-ng ${minizip-ng_FIND_VERSION} CONFIG)
+        find_package(minizip-ng ${minizip-ng_FIND_VERSION} CONFIG QUIET)
     endif()
 
     if (minizip-ng_FOUND)
-        get_target_property(minizip-ng_LIBRARY MINIZIP::minizip-ng LOCATION)
         get_target_property(minizip-ng_INCLUDE_DIR MINIZIP::minizip-ng INTERFACE_INCLUDE_DIRECTORIES)
-    else ()
+        get_target_property(minizip-ng_LIBRARY MINIZIP::minizip-ng LOCATION)
+        
+        # Depending on the options used when minizip-ng was built, it could have multiple libraries
+        # listed in INTERFACE_LINK_LIBRARIES. OCIO only needs ZLIB.
+        # Only add custom zlib target ZLIB::ZLIB to INTERFACE_LINK_LIBRARIES.
+        set_target_properties(MINIZIP::minizip-ng PROPERTIES INTERFACE_LINK_LIBRARIES "ZLIB::ZLIB")
+
+        if (NOT minizip-ng_LIBRARY)
+            # Lib names to search for
+            set(_minizip-ng_LIB_NAMES minizip-ng)
+
+            if(BUILD_TYPE_DEBUG)
+                # Prefer Debug lib names (Windows only)
+                list(INSERT _minizip-ng_LIB_NAMES 0 minizip-ngd)
+            endif()
+
+            if(minizip-ng_STATIC_LIBRARY)
+                # Prefer static lib names
+                set(_minizip-ng_STATIC_LIB_NAMES 
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}minizip-ng${CMAKE_STATIC_LIBRARY_SUFFIX}")
+                if(WIN32 AND BUILD_TYPE_DEBUG)
+                    # Prefer static Debug lib names (Windows only)
+                    list(INSERT _minizip-ng_STATIC_LIB_NAMES 0
+                        "${CMAKE_STATIC_LIBRARY_PREFIX}minizip-ngd${CMAKE_STATIC_LIBRARY_SUFFIX}")
+                endif()
+            endif()
+
+            # Find library
+            find_library(minizip-ng_LIBRARY
+                NAMES
+                    ${_minizip-ng_STATIC_LIB_NAMES}
+                    ${_minizip-ng_LIB_NAMES}
+                HINTS
+                    ${minizip-ng_ROOT}
+                    ${PC_minizip-ng_LIBRARY_DIRS}
+                PATH_SUFFIXES
+                    lib64 lib 
+            )
+
+            # Set IMPORTED_LOCATION property for MINIZIP::minizip-ng target.
+            if (TARGET MINIZIP::minizip-ng)
+                set_target_properties(MINIZIP::minizip-ng PROPERTIES
+                    IMPORTED_LOCATION "${minizip-ng_LIBRARY}"
+                )
+            endif()
+        endif()
+    else()
         list(APPEND _minizip-ng_REQUIRED_VARS minizip-ng_INCLUDE_DIR)
 
         # Search for minizip-ng.pc
@@ -40,11 +102,13 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
                 ${PC_minizip-ng_INCLUDE_DIRS}
             PATH_SUFFIXES
                 include
+                include/minizip-ng
                 minizip-ng/include
         )
 
         # Lib names to search for
-        set(_minizip-ng_LIB_NAMES minizip-ng libminizip-ng)
+        set(_minizip-ng_LIB_NAMES minizip-ng)
+
         if(BUILD_TYPE_DEBUG)
             # Prefer Debug lib names (Windows only)
             list(INSERT _minizip-ng_LIB_NAMES 0 minizip-ngd)
@@ -92,127 +156,140 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
     )
 endif()
 
-###############################################################################
-### Create target
-
-if(NOT TARGET minizip-ng::minizip-ng)
-    add_library(minizip-ng::minizip-ng UNKNOWN IMPORTED GLOBAL)
-    set(_minizip-ng_TARGET_CREATE TRUE)
+if(NOT minizip-ng_FOUND)
+    # Looking for an external minizip-ng that might be built using MZ_COMPAT=ON.
+    # But do not download it if it cannot be found.
+    find_package(minizip ${minizip-ng_FIND_VERSION} REQUIRED)
 endif()
 
-###############################################################################
-### Install package from source ###
-if(NOT minizip-ng_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
-    include(ExternalProject)
-    include(GNUInstallDirs)
-
-    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
-    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
-
-    # Set find_package standard args
-    set(minizip-ng_FOUND TRUE)
-    set(minizip-ng_VERSION ${minizip-ng_FIND_VERSION})
-    set(minizip-ng_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}")
-
-    # Minizip-ng use a hardcoded lib prefix instead of CMAKE_STATIC_LIBRARY_PREFIX
-    set(_minizip-ng_LIB_PREFIX "lib")
-
-    set(minizip-ng_LIBRARY
-        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${_minizip-ng_LIB_PREFIX}minizip-ng${_minizip-ng_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
-
-    if(_minizip-ng_TARGET_CREATE)
-        set(MINIZIP-NG_CMAKE_ARGS
-            ${MINIZIP-NG_CMAKE_ARGS}
-            -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
-            -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
-            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
-            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
-            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
-            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
-            -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
-            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}/minizip-ng
-            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
-            -DBUILD_SHARED_LIBS=OFF
-            -DMZ_OPENSSL=OFF
-            -DMZ_LIBBSD=OFF
-            -DMZ_BUILD_TESTS=OFF
-            -DMZ_COMPAT=OFF
-            -DMZ_BZIP2=OFF
-            -DMZ_LZMA=OFF
-            -DMZ_LIBCOMP=OFF
-            -DMZ_ZSTD=OFF
-            -DMZ_PKCRYPT=OFF
-            -DMZ_WZAES=OFF
-            -DMZ_SIGNING=OFF
-            -DMZ_ZLIB=ON
-            -DMZ_ICONV=OFF
-            -DMZ_FETCH_LIBS=OFF
-            -DMZ_FORCE_FETCH_LIBS=OFF
-            -DZLIB_LIBRARY=${ZLIB_LIBRARIES}
-            -DZLIB_INCLUDE_DIR=${ZLIB_INCLUDE_DIRS}
-        )
+if(NOT minizip_FOUND AND NOT TARGET MINIZIP::minizip)
+    ###############################################################################
+    ### Create target
 
-        if(CMAKE_TOOLCHAIN_FILE)
-            set(minizip-ng_CMAKE_ARGS
-                ${minizip-ng_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
-        endif()
+    if(NOT TARGET MINIZIP::minizip-ng)
+        add_library(MINIZIP::minizip-ng UNKNOWN IMPORTED GLOBAL)
+        set(_minizip-ng_TARGET_CREATE TRUE)
+    endif()
+
+    ###############################################################################
+    ### Install package from source ###
+    if(NOT minizip-ng_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+        include(ExternalProject)
+        include(GNUInstallDirs)
+
+        set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
+        set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
 
-        if(APPLE)
-            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
+        # Set find_package standard args
+        set(minizip-ng_FOUND TRUE)
+        set(minizip-ng_VERSION ${minizip-ng_FIND_VERSION})
 
-            set(minizip-ng_CMAKE_ARGS
-                ${minizip-ng_CMAKE_ARGS}
-                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
-                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
+        set(minizip-ng_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}/minizip-ng")
+
+        # Minizip-ng use a hardcoded lib prefix instead of CMAKE_STATIC_LIBRARY_PREFIX
+        set(_minizip-ng_LIB_PREFIX "lib")
+
+        set(minizip-ng_LIBRARY
+            "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${_minizip-ng_LIB_PREFIX}minizip-ng${_minizip-ng_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+
+        if(_minizip-ng_TARGET_CREATE)
+            set(MINIZIP-NG_CMAKE_ARGS
+                ${MINIZIP-NG_CMAKE_ARGS}
+                -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
+                -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
+                -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+                -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
+                -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
+                -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
+                -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
+                -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
+                -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
+                # Since the other modules create a subfolder for the includes by default and since
+                # minizip-ng does not, a suffix is added to CMAKE_INSTALL_INCLUDEDIR in order to
+                # install the headers under a subdirectory named "minizip-ng".
+                # Note that this does not affect external builds for minizip-ng.
+                -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}/minizip-ng
+                -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
+                -DBUILD_SHARED_LIBS=OFF
+                -DMZ_OPENSSL=OFF
+                -DMZ_LIBBSD=OFF
+                -DMZ_BUILD_TESTS=OFF
+                -DMZ_COMPAT=OFF
+                -DMZ_BZIP2=OFF
+                -DMZ_LZMA=OFF
+                -DMZ_LIBCOMP=OFF
+                -DMZ_ZSTD=OFF
+                -DMZ_PKCRYPT=OFF
+                -DMZ_WZAES=OFF
+                -DMZ_SIGNING=OFF
+                -DMZ_ZLIB=ON
+                -DMZ_ICONV=OFF
+                -DMZ_FETCH_LIBS=OFF
+                -DMZ_FORCE_FETCH_LIBS=OFF
+                -DZLIB_LIBRARY=${ZLIB_LIBRARIES}
+                -DZLIB_INCLUDE_DIR=${ZLIB_INCLUDE_DIRS}
             )
-        endif()
 
-        if (ANDROID)
-            set(minizip-ng_CMAKE_ARGS
-                ${minizip-ng_CMAKE_ARGS}
-                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
-                -DANDROID_ABI=${ANDROID_ABI}
-                -DANDROID_STL=${ANDROID_STL})
+            if(CMAKE_TOOLCHAIN_FILE)
+                set(minizip-ng_CMAKE_ARGS
+                    ${minizip-ng_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
+            endif()
+
+            if(APPLE)
+                string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
+
+                set(minizip-ng_CMAKE_ARGS
+                    ${minizip-ng_CMAKE_ARGS}
+                    -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
+                    -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
+                )
+            endif()
+
+            if (ANDROID)
+                set(minizip-ng_CMAKE_ARGS
+                    ${minizip-ng_CMAKE_ARGS}
+                    -DANDROID_PLATFORM=${ANDROID_PLATFORM}
+                    -DANDROID_ABI=${ANDROID_ABI}
+                    -DANDROID_STL=${ANDROID_STL})
+            endif()
         endif()
-    endif()
 
-    # Hack to let imported target be built from ExternalProject_Add
-    file(MAKE_DIRECTORY ${minizip-ng_INCLUDE_DIR})
-
-    ExternalProject_Add(minizip-ng_install
-        GIT_REPOSITORY "https://github.com/zlib-ng/minizip-ng.git"
-        GIT_TAG "${minizip-ng_VERSION}"
-        GIT_CONFIG advice.detachedHead=false
-        GIT_SHALLOW TRUE
-        PREFIX "${_EXT_BUILD_ROOT}/libminizip-ng"
-        BUILD_BYPRODUCTS ${minizip-ng_LIBRARY}
-        CMAKE_ARGS ${MINIZIP-NG_CMAKE_ARGS}
-        EXCLUDE_FROM_ALL TRUE
-        BUILD_COMMAND ""
-        INSTALL_COMMAND
-            ${CMAKE_COMMAND} --build .
-                             --config ${CMAKE_BUILD_TYPE}
-                             --target install
-                             --parallel
-    )
+        # Hack to let imported target be built from ExternalProject_Add
+        file(MAKE_DIRECTORY ${minizip-ng_INCLUDE_DIR})
 
-    add_dependencies(minizip-ng::minizip-ng minizip-ng_install)
-    message(STATUS "Installing minizip-ng: ${minizip-ng_LIBRARY} (version \"${minizip-ng_VERSION}\")")
+        ExternalProject_Add(minizip-ng_install
+            GIT_REPOSITORY "https://github.com/zlib-ng/minizip-ng.git"
+            GIT_TAG "${minizip-ng_VERSION}"
+            GIT_CONFIG advice.detachedHead=false
+            GIT_SHALLOW TRUE
+            PREFIX "${_EXT_BUILD_ROOT}/libminizip-ng"
+            BUILD_BYPRODUCTS ${minizip-ng_LIBRARY}
+            CMAKE_ARGS ${MINIZIP-NG_CMAKE_ARGS}
+            EXCLUDE_FROM_ALL TRUE
+            BUILD_COMMAND ""
+            INSTALL_COMMAND
+                ${CMAKE_COMMAND} --build .
+                                --config ${CMAKE_BUILD_TYPE}
+                                --target install
+                                --parallel
+        )
+
+        add_dependencies(MINIZIP::minizip-ng minizip-ng_install)
+        message(STATUS "Installing minizip-ng: ${minizip-ng_LIBRARY} (version \"${minizip-ng_VERSION}\")")
+    endif()
 endif()
 
 ###############################################################################
 ### Configure target ###
 
 if(_minizip-ng_TARGET_CREATE)
-    set_target_properties(minizip-ng::minizip-ng PROPERTIES
+    set_target_properties(MINIZIP::minizip-ng PROPERTIES
         IMPORTED_LOCATION "${minizip-ng_LIBRARY}"
         INTERFACE_INCLUDE_DIRECTORIES "${minizip-ng_INCLUDE_DIR}"
     )
 
     mark_as_advanced(minizip-ng_INCLUDE_DIR minizip-ng_LIBRARY minizip-ng_VERSION)
 
-    target_link_libraries(minizip-ng::minizip-ng INTERFACE ZLIB::ZLIB)
+    target_link_libraries(MINIZIP::minizip-ng INTERFACE ZLIB::ZLIB)
 endif()
\ No newline at end of file
diff --git a/share/cmake/modules/Findminizip.cmake b/share/cmake/modules/Findminizip.cmake
new file mode 100644
index 0000000000..1a75747177
--- /dev/null
+++ b/share/cmake/modules/Findminizip.cmake
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Locate minizip-ng with MZ_COMPAT=ON.
+# 
+# This module DOES NOT install minizip-ng with MZ_COMPAT=ON if it is not found.
+#
+# Note: This option changes the name for the library file to "libminizip", but it is still
+#       actually minizip-ng. OCIO uses the API from minizip-ng.
+#
+# Variables defined by this module:
+#   minizip_FOUND        - If FALSE, do not try to link to minizip
+#   minizip_LIBRARY      - minizip library to link to
+#   minizip_INCLUDE_DIR  - Where to find mz.h and other headers
+#   minizip_VERSION      - The version of the library
+#
+#   This module set the variables below because this is still minizip-ng. The librarie become
+#   "minizip" because of the cmake option MZ_COMPAT=ON.
+#
+#   minizip-ng_FOUND        - If FALSE, do not try to link to minizip-ng
+#   minizip-ng_LIBRARY      - minizip-ng library to link to
+#   minizip-ng_INCLUDE_DIR  - Where to find mz.h and other headers
+#   minizip-ng_VERSION      - The version of the library
+#
+# Targets defined by this module:
+#   MINIZIP::minizip-ng - IMPORTED target, if found
+#
+###############################################################################
+### Try to find package ###
+
+if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
+
+    if(NOT DEFINED minizip_ROOT)
+        # Search for minizip-config.cmake
+        find_package(minizip ${minizip_FIND_VERSION} CONFIG QUIET)
+    endif()
+
+    if (minizip_FOUND)
+        get_target_property(minizip_INCLUDE_DIR MINIZIP::minizip INTERFACE_INCLUDE_DIRECTORIES)
+        get_target_property(minizip_LIBRARY MINIZIP::minizip LOCATION)
+
+        if (NOT minizip_LIBRARY)
+            # Lib names to search for
+            set(_minizip_LIB_NAMES minizip)
+
+            if(BUILD_TYPE_DEBUG)
+                # Prefer Debug lib names (Windows only)
+                list(INSERT _minizip_LIB_NAMES 0 minizipd)
+            endif()
+
+            if(minizip_STATIC_LIBRARY)
+                # Prefer static lib names
+                set(_minizip_STATIC_LIB_NAMES 
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}minizip${CMAKE_STATIC_LIBRARY_SUFFIX}")
+                if(WIN32 AND BUILD_TYPE_DEBUG)
+                    # Prefer static Debug lib names (Windows only)
+                    list(INSERT _minizip_STATIC_LIB_NAMES 0
+                        "${CMAKE_STATIC_LIBRARY_PREFIX}minizipd${CMAKE_STATIC_LIBRARY_SUFFIX}")
+                endif()
+            endif()
+
+            # Find library
+            find_library(minizip_LIBRARY
+                NAMES
+                    ${_minizip_STATIC_LIB_NAMES}
+                    ${_minizip_LIB_NAMES}
+                HINTS
+                    ${minizip_ROOT}
+                    ${PC_minizip_LIBRARY_DIRS}
+                PATH_SUFFIXES
+                    lib64 lib 
+            )
+        endif()
+    else()
+        list(APPEND _minizip_REQUIRED_VARS minizip_INCLUDE_DIR)
+
+        # Search for minizip.pc
+        find_package(PkgConfig QUIET)
+        pkg_check_modules(PC_minizip QUIET "minizip>=${minizip_FIND_VERSION}")
+
+        # Find include directory
+        find_path(minizip_INCLUDE_DIR
+            NAMES
+                mz.h
+            HINTS
+                ${minizip_ROOT}
+                ${PC_minizip_INCLUDE_DIRS}
+            PATH_SUFFIXES
+                include
+                minizip/include
+        )
+
+        # Lib names to search for
+        set(_minizip_LIB_NAMES minizip)
+
+        if(BUILD_TYPE_DEBUG)
+            # Prefer Debug lib names (Windows only)
+            list(INSERT _minizip_LIB_NAMES 0 minizipd)
+        endif()
+
+        if(minizip_STATIC_LIBRARY)
+            # Prefer static lib names
+            set(_minizip_STATIC_LIB_NAMES 
+                "${CMAKE_STATIC_LIBRARY_PREFIX}minizip${CMAKE_STATIC_LIBRARY_SUFFIX}")
+            if(WIN32 AND BUILD_TYPE_DEBUG)
+                # Prefer static Debug lib names (Windows only)
+                list(INSERT _minizip_STATIC_LIB_NAMES 0
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}minizipd${CMAKE_STATIC_LIBRARY_SUFFIX}")
+            endif()
+        endif()
+
+        # Find library
+        find_library(minizip_LIBRARY
+            NAMES
+                ${_minizip_STATIC_LIB_NAMES}
+                ${_minizip_LIB_NAMES}
+            HINTS
+                ${minizip_ROOT}
+                ${PC_minizip_LIBRARY_DIRS}
+            PATH_SUFFIXES
+                lib64
+                lib 
+        )
+
+        # Get version from header or pkg-config
+        set(minizip_VERSION "${minizip_FIND_VERSION}")
+    endif()
+
+    # Override REQUIRED if package can be installed
+    if(OCIO_INSTALL_EXT_PACKAGES STREQUAL MISSING)
+        set(minizip_FIND_REQUIRED FALSE)
+    endif()
+
+    include(FindPackageHandleStandardArgs)
+    find_package_handle_standard_args(minizip
+        REQUIRED_VARS 
+            minizip_LIBRARY
+            minizip_INCLUDE_DIR
+        VERSION_VAR
+            minizip_VERSION
+    )
+endif()
+
+###############################################################################
+### Create target
+
+if(minizip_FOUND AND NOT TARGET MINIZIP::minizip-ng)
+    add_library(MINIZIP::minizip-ng UNKNOWN IMPORTED GLOBAL)
+    set(_minizip_TARGET_CREATE TRUE)
+endif()
+
+###############################################################################
+
+###############################################################################
+### Configure target ###
+
+if(minizip_FOUND AND _minizip_TARGET_CREATE)
+    set_target_properties(MINIZIP::minizip-ng PROPERTIES
+        IMPORTED_LOCATION "${minizip_LIBRARY}"
+        INTERFACE_INCLUDE_DIRECTORIES "${minizip_INCLUDE_DIR}"
+    )
+
+    # This is still minizip-ng even though the library is called minizip because of MZ_COMPAT=ON.
+    set(minizip-ng_LIBRARY ${minizip_LIBRARY})
+    set(minizip-ng_INCLUDE_DIR ${minizip_INCLUDE_DIR})
+    set(minizip-ng_FOUND ${minizip_FOUND})
+    set(minizip-ng_VERSION ${minizip_VERSION})
+
+    mark_as_advanced(minizip_INCLUDE_DIR minizip_LIBRARY minizip_VERSION)
+
+    target_link_libraries(MINIZIP::minizip-ng INTERFACE ZLIB::ZLIB)
+endif()
\ No newline at end of file
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index ab9c048126..2bc379e118 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -277,7 +277,7 @@ target_link_libraries(OpenColorIO
         "$<BUILD_INTERFACE:utils::strings>"
         "$<BUILD_INTERFACE:xxHash>"
         yaml-cpp
-        minizip-ng::minizip-ng
+        MINIZIP::minizip-ng
 )
 
 if(APPLE)
diff --git a/src/OpenColorIO/OCIOZArchive.cpp b/src/OpenColorIO/OCIOZArchive.cpp
index 52e292ef77..85fc7bb76d 100644
--- a/src/OpenColorIO/OCIOZArchive.cpp
+++ b/src/OpenColorIO/OCIOZArchive.cpp
@@ -17,16 +17,16 @@
 
 #include "OCIOZArchive.h"
 
-#include "minizip-ng/mz.h"
-#include "minizip-ng/mz_os.h"
-#include "minizip-ng/mz_strm.h"
-#include "minizip-ng/mz_strm_buf.h"
-#include "minizip-ng/mz_strm_mem.h"
-#include "minizip-ng/mz_strm_os.h"
-#include "minizip-ng/mz_strm_split.h"
-#include "minizip-ng/mz_strm_zlib.h"
-#include "minizip-ng/mz_zip.h"
-#include "minizip-ng/mz_zip_rw.h"
+#include "mz.h"
+#include "mz_os.h"
+#include "mz_strm.h"
+#include "mz_strm_buf.h"
+#include "mz_strm_mem.h"
+#include "mz_strm_os.h"
+#include "mz_strm_split.h"
+#include "mz_strm_zlib.h"
+#include "mz_zip.h"
+#include "mz_zip_rw.h"
 
 namespace OCIO_NAMESPACE
 {
diff --git a/src/apps/ocioarchive/CMakeLists.txt b/src/apps/ocioarchive/CMakeLists.txt
index d71d719aeb..6b868d1979 100644
--- a/src/apps/ocioarchive/CMakeLists.txt
+++ b/src/apps/ocioarchive/CMakeLists.txt
@@ -19,7 +19,7 @@ target_link_libraries(ocioarchive
     PRIVATE
         apputils
         OpenColorIO
-        minizip-ng::minizip-ng
+        MINIZIP::minizip-ng
 )
 
 install(TARGETS ocioarchive
diff --git a/src/apps/ocioarchive/main.cpp b/src/apps/ocioarchive/main.cpp
index bf222fc8aa..190cadee54 100644
--- a/src/apps/ocioarchive/main.cpp
+++ b/src/apps/ocioarchive/main.cpp
@@ -13,11 +13,11 @@ namespace OCIO = OCIO_NAMESPACE;
 #include "apputils/argparse.h"
 
 // Config archive functionality.
-#include "minizip-ng/mz.h"
-#include "minizip-ng/mz_os.h"
-#include "minizip-ng/mz_strm.h"
-#include "minizip-ng/mz_zip.h"
-#include "minizip-ng/mz_zip_rw.h"
+#include "mz.h"
+#include "mz_os.h"
+#include "mz_strm.h"
+#include "mz_zip.h"
+#include "mz_zip_rw.h"
 
 // Array of non OpenColorIO arguments.
 static std::vector<std::string> args;
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index ff3b19dc78..e78a86bede 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -23,7 +23,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
             utils::strings
             yaml-cpp
             testutils
-            minizip-ng::minizip-ng
+            MINIZIP::minizip-ng
             xxHash
     )
 

From 3fa84e3219770ba917ab088f8862de037b45cb66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?=
 <105517825+cedrik-fuoco-adsk@users.noreply.github.com>
Date: Mon, 5 Dec 2022 20:16:27 -0500
Subject: [PATCH 38/81] Adsk contrib - Processor cache does not detect changes
 in cccid (#1726)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* For Looks that has a FileTransform and for Colorspace with FileTransfrom, add the CCCID to the processor's cache key for that transform.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Removing the workaround in the related unit tests and fixing the issue by adding the environment variable to the context using setStringVar. The processor's cache key is using the context cache ID which has all the context variables taken into account.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Now using addStringVars and creating a new context instead of reusing the one used for the filename.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Adding cccid to the context when there are no context variable.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Adding a few unit tests to test that the processor is different when changing the FileTransform's CCCID.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Using setStringVar to set CCNUM context variable in unit test.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Adding a test in FileTransform to test CollectContextVariables directly.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Minor tweaks for the unit test

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 src/OpenColorIO/transforms/FileTransform.cpp | 10 ++++
 tests/cpu/Config_tests.cpp                   | 55 ++++++++++++++++++++
 tests/cpu/transforms/FileTransform_tests.cpp | 52 ++++++++++++++++++
 tests/python/OCIOZArchiveTest.py             |  6 ---
 4 files changed, 117 insertions(+), 6 deletions(-)

diff --git a/src/OpenColorIO/transforms/FileTransform.cpp b/src/OpenColorIO/transforms/FileTransform.cpp
index 172c242470..4fd4d5d475 100755
--- a/src/OpenColorIO/transforms/FileTransform.cpp
+++ b/src/OpenColorIO/transforms/FileTransform.cpp
@@ -282,6 +282,16 @@ bool CollectContextVariables(const Config &,
         usedContextVars->addStringVars(ctxFilepath);
     }
 
+    // Check if the CCCID is using a context variable and add it to the context if that's the case.
+    ContextRcPtr ctxCCCID = Context::Create();
+    const char * cccid = tr.getCCCId();
+    std::string resolvedCCCID = context.resolveStringVar(cccid, ctxCCCID);
+    if (0 != strcmp(resolvedCCCID.c_str(), cccid))
+    {
+        foundContextVars = true;
+        usedContextVars->addStringVars(ctxCCCID);
+    }
+
     return foundContextVars;
 }
 
diff --git a/tests/cpu/Config_tests.cpp b/tests/cpu/Config_tests.cpp
index 329bd63ef5..76b668fc8e 100644
--- a/tests/cpu/Config_tests.cpp
+++ b/tests/cpu/Config_tests.cpp
@@ -7833,6 +7833,61 @@ OCIO_ADD_TEST(Config, context_variables_typical_use_cases)
                           cfg->getProcessor(ctx2, "cs1", "cs2").get());
         }
     }
+
+
+    // Case 7 - Context variables in the FileTransform's CCCID.
+    {
+        static const std::string CONFIG = 
+            "ocio_profile_version: 2\n"
+            "\n"
+            "environment:\n"
+            "  CCPREFIX: cc\n"
+            "\n"
+            "search_path: " + OCIO::GetTestFilesDir() + "\n"
+            "\n"
+            "roles:\n"
+            "  default: cs1\n"
+            "\n"
+            "displays:\n"
+            "  disp1:\n"
+            "    - !<View> {name: view1, colorspace: cs2}\n"
+            "\n"
+            "colorspaces:\n"
+            "  - !<ColorSpace>\n"
+            "    name: cs1\n"
+            "\n"
+            "  - !<ColorSpace>\n"
+            "    name: cs2\n"
+            "    from_scene_reference: !<FileTransform> {src: cdl_test1.ccc, cccid: $CCPREFIX00$CCNUM}\n";
+
+        std::istringstream iss;
+        iss.str(CONFIG);
+
+        OCIO::ConfigRcPtr cfg;
+        OCIO_CHECK_NO_THROW(cfg = OCIO::Config::CreateFromStream(iss)->createEditableCopy());
+        OCIO_CHECK_NO_THROW(cfg->validate());
+
+        OCIO::ConstTransformRcPtr ctf = cfg->getColorSpace("cs2")->getTransform(
+            OCIO::COLORSPACE_DIR_FROM_REFERENCE
+        );
+        OCIO_REQUIRE_ASSERT(ctf);
+
+        OCIO::ContextRcPtr ctx = cfg->getCurrentContext()->createEditableCopy();
+        
+        ctx->setStringVar("CCNUM", "01");
+        OCIO::ConstProcessorRcPtr p1 = cfg->getProcessor(ctx, ctf, OCIO::TRANSFORM_DIR_FORWARD);
+        
+        ctx->setStringVar("CCNUM", "02");
+        OCIO::ConstProcessorRcPtr p2 = cfg->getProcessor(ctx, ctf, OCIO::TRANSFORM_DIR_FORWARD);
+
+        ctx->setStringVar("CCNUM", "03");
+        OCIO::ConstProcessorRcPtr p3 = cfg->getProcessor(ctx, ctf, OCIO::TRANSFORM_DIR_FORWARD);
+
+        // All three processors should be different.
+        OCIO_CHECK_NE(p1.get(), p2.get());
+        OCIO_CHECK_NE(p1.get(), p3.get());
+        OCIO_CHECK_NE(p2.get(), p3.get());
+    }
 }
 
 OCIO_ADD_TEST(Config, virtual_display)
diff --git a/tests/cpu/transforms/FileTransform_tests.cpp b/tests/cpu/transforms/FileTransform_tests.cpp
index 928adfa6fe..55315822cf 100644
--- a/tests/cpu/transforms/FileTransform_tests.cpp
+++ b/tests/cpu/transforms/FileTransform_tests.cpp
@@ -431,4 +431,56 @@ OCIO_ADD_TEST(FileTransform, context_variables)
 
     // A basic check to validate that context variables are correctly used. 
     OCIO_CHECK_NO_THROW(cfg->getProcessor(ctx, file, OCIO::TRANSFORM_DIR_FORWARD));
+
+
+    {
+        // Case 4 - The 'cccid' now contains a context variable
+        static const std::string CONFIG = 
+            "ocio_profile_version: 2\n"
+            "\n"
+            "environment:\n"
+            "  CCPREFIX: cc\n"
+            "  CCNUM: 02\n"
+            "\n"
+            "search_path: " + OCIO::GetTestFilesDir() + "\n"
+            "\n"
+            "roles:\n"
+            "  default: cs1\n"
+            "\n"
+            "displays:\n"
+            "  disp1:\n"
+            "    - !<View> {name: view1, colorspace: cs2}\n"
+            "\n"
+            "colorspaces:\n"
+            "  - !<ColorSpace>\n"
+            "    name: cs1\n"
+            "\n"
+            "  - !<ColorSpace>\n"
+            "    name: cs2\n"
+            "    from_scene_reference: !<FileTransform> {src: cdl_test1.ccc, cccid: $CCPREFIX00$CCNUM}\n";
+
+            std::istringstream iss;
+            iss.str(CONFIG);
+
+            OCIO::ConstConfigRcPtr cfg;
+            OCIO_CHECK_NO_THROW(cfg = OCIO::Config::CreateFromStream(iss));
+            OCIO_CHECK_NO_THROW(cfg->validate());
+
+            ctx = cfg->getCurrentContext()->createEditableCopy();
+            OCIO_CHECK_NO_THROW(ctx->setStringVar("CCNUM", "01"));
+
+            usedContextVars = OCIO::Context::Create(); // New & empty instance.
+            OCIO::ConstTransformRcPtr tr1 = cfg->getColorSpace("cs2")->getTransform(
+                OCIO::COLORSPACE_DIR_FROM_REFERENCE
+            );
+            OCIO::ConstFileTransformRcPtr fTr1 = OCIO::DynamicPtrCast<const OCIO::FileTransform>(tr1);
+            OCIO_CHECK_ASSERT(fTr1);
+            
+            OCIO_CHECK_ASSERT(CollectContextVariables(*cfg, *ctx, *fTr1, usedContextVars));
+            OCIO_CHECK_EQUAL(2, usedContextVars->getNumStringVars());
+            OCIO_CHECK_EQUAL(std::string("CCPREFIX"), usedContextVars->getStringVarNameByIndex(0));
+            OCIO_CHECK_EQUAL(std::string("cc"), usedContextVars->getStringVarByIndex(0));
+            OCIO_CHECK_EQUAL(std::string("CCNUM"), usedContextVars->getStringVarNameByIndex(1));
+            OCIO_CHECK_EQUAL(std::string("01"), usedContextVars->getStringVarByIndex(1));
+    }
 }
diff --git a/tests/python/OCIOZArchiveTest.py b/tests/python/OCIOZArchiveTest.py
index 1f364101cb..b649badc93 100644
--- a/tests/python/OCIOZArchiveTest.py
+++ b/tests/python/OCIOZArchiveTest.py
@@ -282,9 +282,6 @@ def test_cccid(self):
         cdl = processor.createGroupTransform()[0]
         self.assertEqual(cdl.getSlope()[0], 0.9)
 
-        # FIXME: There is a bug with the Processor cache, this clears it.
-        self.CONFIG.setStrictParsingEnabled(False)
-
         self.CONTEXT['CCCID'] = 'look-03'
         processor = self.CONFIG.getProcessor(self.CONTEXT, 
             look_transform, 
@@ -292,9 +289,6 @@ def test_cccid(self):
         cdl = processor.createGroupTransform()[0]
         self.assertEqual(cdl.getSlope()[0], 1.2)
 
-        # FIXME: There is a bug with the Processor cache, this clears it.
-        self.CONFIG.setStrictParsingEnabled(False)
-
         self.CONTEXT['CCCID'] = 'look-01'
         processor = self.CONFIG.getProcessor(self.CONTEXT, 
             look_transform, 

From 322b8262fa0ecddf218912ca37e8c6d981ef7d95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?=
 <105517825+cedrik-fuoco-adsk@users.noreply.github.com>
Date: Mon, 5 Dec 2022 23:10:29 -0500
Subject: [PATCH 39/81] Adsk contrib - Configure the OpenColorIO.pc file on
 Windows (#1720)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Configure the OpenColorIO.pc file on Windows and fix an issue where CMAKE_INSTALL_PREFIX wasn't included in the cmake build command in ocio.bat.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Removing pkconfig folder since that PC file is not used.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Small tweak in the configuration of OpenColorIO.cmake.in to handle absolute path with CMAKE_INSTALL_LIBDIR or CMAKE_INSTALL_INCLUDEDIR.
Keeping exec_prefix for CMAKE_INSTALL_INCLUDE_DIR since it was changed for a specific issue on Mac (see PR #1120).

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Using ${prefix} for includedir

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 share/dev/windows/ocio.bat                  |  3 ++-
 src/OpenColorIO/CMakeLists.txt              | 26 +++++++++++++++------
 src/OpenColorIO/pkgconfig/OpenColorIO.pc.in | 11 ---------
 3 files changed, 21 insertions(+), 19 deletions(-)
 delete mode 100644 src/OpenColorIO/pkgconfig/OpenColorIO.pc.in

diff --git a/share/dev/windows/ocio.bat b/share/dev/windows/ocio.bat
index 04e61b7096..7f24bc279b 100644
--- a/share/dev/windows/ocio.bat
+++ b/share/dev/windows/ocio.bat
@@ -193,6 +193,7 @@ IF NOT EXIST "!PYTHON_PATH!" (
 if !DO_CONFIGURE!==1 (
     echo Running CMake...
     cmake -B "!BUILD_PATH!"^
+        -DCMAKE_INSTALL_PREFIX=!INSTALL_PATH!^
         -DOCIO_INSTALL_EXT_PACKAGES=ALL^
         -DCMAKE_BUILD_TYPE=!CMAKE_BUILD_TYPE!^
         -DGLEW_ROOT="!GLEW_ROOT!"^
@@ -231,7 +232,7 @@ if Not "%CMAKE_CONFIGURE_STATUS%"=="Failed" (
 rem Run cmake --install only if cmake --build was successful.
 if Not "%CMAKE_BUILD_STATUS%"=="Failed" (
     rem Install OCIO
-    cmake --install !BUILD_PATH! --config !CMAKE_BUILD_TYPE! --prefix !INSTALL_PATH!
+    cmake --install !BUILD_PATH! --config !CMAKE_BUILD_TYPE!
     if not ErrorLevel 1 (
         set CMAKE_INSTALL_STATUS=Ok
     ) else (
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 2bc379e118..b667a6d1fa 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -174,18 +174,30 @@ set(SOURCES
     SystemMonitor.cpp
 )
 
-if(NOT WIN32)
+# Install the pkg-config file.
 
-    # Install the pkg-config file.
+set(prefix ${CMAKE_INSTALL_PREFIX})
+set(exec_prefix "\${prefix}")
 
-    set(prefix ${CMAKE_INSTALL_PREFIX})
-    set(exec_prefix "\${prefix}")
+# CMAKE_INSTALL_LIBDIR is not guaranteed to be relative.
+# Not using cmake_path function since it is only available from CMake ≥ 3.20.
+if(IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}")
+    set(libdir "${CMAKE_INSTALL_LIBDIR}")
+else()
     set(libdir "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
-    set(includedir "\${exec_prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
-    configure_file(res/OpenColorIO.pc.in ${CMAKE_CURRENT_BINARY_DIR}/OpenColorIO.pc @ONLY)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/OpenColorIO.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
 endif()
 
+# CMAKE_INSTALL_INCLUDEDIR is not guaranteed to be relative.
+# Not using cmake_path function since it is only available from CMake ≥ 3.20.
+if(IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}")
+    set(includedir "${CMAKE_INSTALL_INCLUDEDIR}")
+else()
+    set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+endif()
+
+configure_file(res/OpenColorIO.pc.in ${CMAKE_CURRENT_BINARY_DIR}/OpenColorIO.pc @ONLY)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/OpenColorIO.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+
 add_library(OpenColorIO ${SOURCES})
 
 # Require at least a C++11 compatible compiler for consumer projects.
diff --git a/src/OpenColorIO/pkgconfig/OpenColorIO.pc.in b/src/OpenColorIO/pkgconfig/OpenColorIO.pc.in
deleted file mode 100644
index ed1d5c4549..0000000000
--- a/src/OpenColorIO/pkgconfig/OpenColorIO.pc.in
+++ /dev/null
@@ -1,11 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright Contributors to the OpenColorIO Project.
-
-libdir=@CMAKE_INSTALL_FULL_LIBDIR@
-includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
-
-Name: OpenColorIO
-Description: A color management framework for visual effects and animation
-Version: @OCIO_VERSION_FULL_STR@
-Cflags: -I${includedir}
-Libs: -L${libdir} -lOpenColorIO

From c60f65e1aa3c4ef7f170a3bd8e71de3c63a23ff6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?=
 <105517825+cedrik-fuoco-adsk@users.noreply.github.com>
Date: Tue, 6 Dec 2022 00:32:32 -0500
Subject: [PATCH 40/81] Adsk contrib - Hiding minizip-ng symbols on Mac (#1729)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Hiding minizip-ng symbols on Mac

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Fixing a linking issue related to the code that tries to hide expat symbols and small refactor to add robustness.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 src/OpenColorIO/CMakeLists.txt | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index b667a6d1fa..1c4d774ddb 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -368,9 +368,36 @@ if(UNIX AND NOT APPLE)
     set_property (TARGET OpenColorIO
                   APPEND PROPERTY LINK_FLAGS "-Wl,--exclude-libs,ALL")
 elseif(APPLE)
-    # Hide the expat symbols.
-    set_property (TARGET OpenColorIO
-                  APPEND PROPERTY LINK_FLAGS "-Wl,-hidden-lexpat")
+    if (expat_LIBRARY)
+        get_filename_component(_expat_LIBDIR "${expat_LIBRARY}" DIRECTORY)
+        # Add the path to CMake list of search paths for libraries.
+        list(APPEND _OCIO_LINK_FLAGS_LIST_ "-Wl,-L${_expat_LIBDIR}")
+        # Hide the expat symbols.
+        list(APPEND _OCIO_LINK_FLAGS_LIST_ "-Wl,-hidden-lexpat")
+    endif()
+
+
+    # Check for minizip first since our Findminizip module sets minizip-ng_LIBRARY.
+    if (minizip_LIBRARY)
+        get_filename_component(_minizip-ng_LIBDIR "${minizip_LIBRARY}" DIRECTORY)
+        set(_minizip-ng_NAME "minizip")
+    elseif(minizip-ng_LIBRARY)
+        get_filename_component(_minizip-ng_LIBDIR "${minizip-ng_LIBRARY}" DIRECTORY)
+        set(_minizip-ng_NAME "minizip-ng")        
+    endif()
+
+    if (_minizip-ng_LIBDIR)
+        # Add the path to CMake list of search paths for libraries.
+        list(APPEND _OCIO_LINK_FLAGS_LIST_ "-Wl,-L${_minizip-ng_LIBDIR}")
+        # Hide the minizip-ng symbols.
+        list(APPEND _OCIO_LINK_FLAGS_LIST_ "-Wl,-hidden-l${_minizip-ng_NAME}") 
+    endif()   
+
+    if (_OCIO_LINK_FLAGS_LIST_)
+        list(JOIN _OCIO_LINK_FLAGS_LIST_ " " _OCIO_LINK_FLAGS_LIST_)
+        set_property (TARGET OpenColorIO
+                    APPEND PROPERTY LINK_FLAGS "${_OCIO_LINK_FLAGS_LIST_}")   
+    endif()              
 endif()
 
 if(MSVC AND BUILD_SHARED_LIBS)

From b6a1e21d4297f258ef346f4d4909f18abe3092ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?=
 <105517825+cedrik-fuoco-adsk@users.noreply.github.com>
Date: Thu, 22 Dec 2022 20:48:34 -0500
Subject: [PATCH 41/81] Adsk contrib - Fix issue with is colorspace linear
 (#1734)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Throw when the colorspace is undefined for isColorSpaceLinear method. (+ unit test)

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Typo

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 src/OpenColorIO/Config.cpp     |  7 +++++++
 tests/cpu/ColorSpace_tests.cpp | 13 +++++++++++++
 tests/python/ColorSpaceTest.py |  6 ++++++
 3 files changed, 26 insertions(+)

diff --git a/src/OpenColorIO/Config.cpp b/src/OpenColorIO/Config.cpp
index bdf9994756..665d522390 100644
--- a/src/OpenColorIO/Config.cpp
+++ b/src/OpenColorIO/Config.cpp
@@ -3127,6 +3127,13 @@ bool Config::isColorSpaceLinear(const char * colorSpace, ReferenceSpaceType refe
 {
     auto cs = getColorSpace(colorSpace);
 
+    if (cs == nullptr)
+    {
+        std::ostringstream os;
+        os << "Could not test colorspace linearity. Colorspace " << colorSpace << " does not exist.";
+        throw Exception(os.str().c_str()); 
+    }
+
     if (cs->isData())
     {
         return false;
diff --git a/tests/cpu/ColorSpace_tests.cpp b/tests/cpu/ColorSpace_tests.cpp
index fc3a6581a6..db959703c7 100644
--- a/tests/cpu/ColorSpace_tests.cpp
+++ b/tests/cpu/ColorSpace_tests.cpp
@@ -898,6 +898,19 @@ inactive_colorspaces: [display_linear-trans, scene_linear-trans]
         OCIO_CHECK_EQUAL_FROM(isLinearToDisplayReference, bDisplayExpected, line);
     };
 
+    // Test undefined color spaces.
+    {
+      OCIO_CHECK_THROW_WHAT(
+        config->isColorSpaceLinear("colorspace_abc", OCIO::REFERENCE_SPACE_SCENE), OCIO::Exception,
+        "Could not test colorspace linearity. Colorspace colorspace_abc does not exist"
+      );
+
+      OCIO_CHECK_THROW_WHAT(
+        config->isColorSpaceLinear("colorspace_abc", OCIO::REFERENCE_SPACE_DISPLAY), OCIO::Exception,
+        "Could not test colorspace linearity. Colorspace colorspace_abc does not exist"
+      );
+    }
+
     {
         testSceneReferred("display_data", false, __LINE__);
         testSceneReferred("display_linear-enc", false, __LINE__);
diff --git a/tests/python/ColorSpaceTest.py b/tests/python/ColorSpaceTest.py
index 792bcf5fec..465539886c 100644
--- a/tests/python/ColorSpaceTest.py
+++ b/tests/python/ColorSpaceTest.py
@@ -579,6 +579,12 @@ def test_display_referred(self, cfg, cs_name, expected_value):
             )
             self.assertEqual(is_linear_to_display_reference, expected_value)
 
+        # Test undefined color spaces.
+        with self.assertRaises(OCIO.Exception):
+            cfg.isColorSpaceLinear('colorspace_abc', OCIO.REFERENCE_SPACE_SCENE)
+        with self.assertRaises(OCIO.Exception):
+            cfg.isColorSpaceLinear('colorspace_abc', OCIO.REFERENCE_SPACE_DISPLAY)
+
         # Test the scene referred color spaces.
         test_scene_referred(self, cfg, "display_data", False)
         test_scene_referred(self, cfg, "display_linear-enc", False)

From c6c3902eab55de699773c8d44723daf947a657e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?=
 <105517825+cedrik-fuoco-adsk@users.noreply.github.com>
Date: Tue, 3 Jan 2023 23:40:07 -0500
Subject: [PATCH 42/81] Adsk Contrib - Improve naming of ICC-based displays on
 Windows (#1742)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Adding a method to get the Monitor's userFriendlyName if available

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Removing the slashes at the start of the display name

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Adding descriptions and comments.
Adding troubleshooting script that print the monitor display name and ICC profile path.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* no message

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Comments

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 .../troubleshooting/print_system_monitors.py  |  21 ++++
 src/OpenColorIO/SystemMonitor_windows.cpp     | 106 ++++++++++++++++--
 2 files changed, 120 insertions(+), 7 deletions(-)
 create mode 100644 share/troubleshooting/print_system_monitors.py

diff --git a/share/troubleshooting/print_system_monitors.py b/share/troubleshooting/print_system_monitors.py
new file mode 100644
index 0000000000..4c3932e85c
--- /dev/null
+++ b/share/troubleshooting/print_system_monitors.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+# 
+# This script will print the information OCIO has for each active monitor/display 
+# connected to the system. The information consists of a Monitor Name and a path 
+# to the monitor's ICC profile. 
+# 
+# OCIO attempts to build the unique Monitor Name based on information available 
+# from the operating system, but sometimes that information may not be that helpful. 
+# Ideally the Monitor Names should be descriptive enough to allow a user to determine 
+# which name corresponds to which physical display and yet brief enough that they are 
+# able to be used in menus that list the available displays. 
+# 
+# This script is an easy way to check the information OCIO detects on a given system 
+# and may be useful when submitting bug reports.
+#
+import PyOpenColorIO as OCIO
+
+for m in OCIO.SystemMonitors().getMonitors():
+    # Each element is a tuple containing the monitor display name and the ICC profile path.
+    print(m)
\ No newline at end of file
diff --git a/src/OpenColorIO/SystemMonitor_windows.cpp b/src/OpenColorIO/SystemMonitor_windows.cpp
index d981fbe7b4..4b8d71584b 100644
--- a/src/OpenColorIO/SystemMonitor_windows.cpp
+++ b/src/OpenColorIO/SystemMonitor_windows.cpp
@@ -13,7 +13,7 @@
 #include <Logging.h>
 
 #include "Platform.h"
-
+#include "utils/StringUtils.h"
 
 namespace OCIO_NAMESPACE
 {
@@ -21,11 +21,87 @@ namespace OCIO_NAMESPACE
 
 static constexpr char ErrorMsg[] { "Problem obtaining monitor profile information from operating system." };
 
+// List all active display paths using QueryDisplayConfig and GetDisplayConfigBufferSizes.
+// Get the data from each path using DisplayConfigGetDeviceInfo.
+void getAllMonitorsWithQueryDisplayConfig(std::vector<std::tstring> & monitorsName)
+{
+    // https://learn.microsoft.com/en-us/windows/win32/api/wingdi/ns-wingdi-displayconfig_path_info
+    std::vector<DISPLAYCONFIG_PATH_INFO> paths;
+    // https://learn.microsoft.com/en-us/windows/win32/api/wingdi/ns-wingdi-displayconfig_mode_info
+    std::vector<DISPLAYCONFIG_MODE_INFO> modes;
+
+    UINT32 flags = QDC_ONLY_ACTIVE_PATHS | QDC_VIRTUAL_MODE_AWARE;
+    LONG result = ERROR_SUCCESS;
 
+    do
+    {
+        // Determine how many path and mode structures to allocate.
+        UINT32 pathCount, modeCount;
+        // The GetDisplayConfigBufferSizes function retrieves the size of the buffers that are 
+        // required to call the QueryDisplayConfig function.
+        result = GetDisplayConfigBufferSizes(flags, &pathCount, &modeCount);
+        
+        // Allocate the path and mode arrays.
+        paths.resize(pathCount);
+        modes.resize(modeCount);
+
+        // The QueryDisplayConfig function retrieves information about all possible display paths 
+        // for all display devices, or views, in the current setting.
+        result = QueryDisplayConfig(flags, &pathCount, paths.data(), &modeCount, modes.data(), nullptr);
+
+        // The function may have returned fewer paths/modes than estimated.
+        paths.resize(pathCount);
+        modes.resize(modeCount);
+
+        // It's possible that between the call to GetDisplayConfigBufferSizes and QueryDisplayConfig
+        // that the display state changed, so loop on the case of ERROR_INSUFFICIENT_BUFFER.
+    } while (result == ERROR_INSUFFICIENT_BUFFER);
+
+    if (result == ERROR_SUCCESS)
+    {
+        // For each active path
+        for (auto& path : paths)
+        {
+            // The DISPLAYCONFIG_TARGET_DEVICE_NAME structure contains information about the target.
+            // Find the target (monitor) friendly name
+            DISPLAYCONFIG_TARGET_DEVICE_NAME targetName = {};
+            targetName.header.adapterId = path.targetInfo.adapterId;
+            targetName.header.id = path.targetInfo.id;
+            targetName.header.type = DISPLAYCONFIG_DEVICE_INFO_GET_TARGET_NAME;
+            targetName.header.size = sizeof(targetName);
+            result = DisplayConfigGetDeviceInfo(&targetName.header);
+
+            if (result == ERROR_SUCCESS)
+            {
+                monitorsName.push_back(
+                    (result == ERROR_SUCCESS && targetName.flags.friendlyNameFromEdid) ? 
+                    targetName.monitorFriendlyDeviceName : L""
+                );
+            }
+        }
+    }
+
+}
+
+/**
+ * Populate the internal structure with monitors name and ICC profiles name.
+ * 
+ * Expected monitor display name: 
+ * 
+ * DISPLAYn, <monitorFriendlyDeviceName | DeviceString>
+ * 
+ * where n is a positive integer starting at 1.
+ * where monitorFriendlyDeviceName comes from DISPLAYCONFIG_TARGET_DEVICE_NAME structure.
+ * where DeviceString comes from DISPLAY_DEVICE structure.
+ * 
+ */
 void SystemMonitorsImpl::getAllMonitors()
 {
     m_monitors.clear();
 
+    std::vector<std::tstring> friendlyMonitorNames;
+    getAllMonitorsWithQueryDisplayConfig(friendlyMonitorNames);
+
     // Initialize the structure.
     DISPLAY_DEVICE dispDevice;
     ZeroMemory(&dispDevice, sizeof(dispDevice));
@@ -33,6 +109,7 @@ void SystemMonitorsImpl::getAllMonitors()
 
     // Iterate over all the monitors.
     DWORD dispNum = 0;
+    // After the first call to EnumDisplayDevices, dispDevice.DeviceString is the adapter name.
     while (EnumDisplayDevices(nullptr, dispNum, &dispDevice, 0))
     {
         const std::tstring deviceName = dispDevice.DeviceName;
@@ -49,9 +126,10 @@ void SystemMonitorsImpl::getAllMonitors()
                 ZeroMemory(&dispDevice, sizeof(dispDevice));
                 dispDevice.cb = sizeof(dispDevice);
 
-                // After a second call, dispDev.DeviceString contains the  
-                // monitor name for that device.
-                EnumDisplayDevices(deviceName.c_str(), dispNum, &dispDevice, 0);   
+                // After second call, dispDevice.DeviceString is the monitor name for that device.
+                // Second parameters must be 0 to get the monitor name.
+                // See https://learn.microsoft.com/en-us/windows/win32/api/winuser/nf-winuser-enumdisplaydevicesw
+                EnumDisplayDevices(deviceName.c_str(), 0, &dispDevice, 0);   
 
                 TCHAR icmPath[MAX_PATH + 1];
                 DWORD pathLength = MAX_PATH;
@@ -60,8 +138,23 @@ void SystemMonitorsImpl::getAllMonitors()
 
                 // TODO: Several ICM profiles could be associated to a single device.
 
-                const std::tstring displayName
-                    = deviceName + TEXT(", ") + dispDevice.DeviceString;
+                bool idxExists = friendlyMonitorNames.size() >= dispNum+1;
+                bool friendlyNameExists = idxExists && !friendlyMonitorNames.at(dispNum).empty();
+
+                // Check if the distNum index exists in friendlyMonitorNames vector and check if
+                // there is a corresponding friendly name.
+                const std::tstring extra = friendlyNameExists ? 
+                        friendlyMonitorNames.at(dispNum) : std::tstring(dispDevice.DeviceString);
+
+                std::tstring strippedDeviceName = deviceName;
+                if(StringUtils::StartsWith(Platform::Utf16ToUtf8(deviceName), "\\\\.\\DISPLAY"))
+                {
+                    // Remove the slashes.
+                    std::string prefix = "\\\\.\\";
+                    strippedDeviceName = deviceName.substr(prefix.length());
+                }
+                        
+                const std::tstring displayName = strippedDeviceName + TEXT(", ") + extra;
 
                 // Get the associated ICM profile path.
                 if (GetICMProfile(hDC, &pathLength, icmPath))
@@ -98,5 +191,4 @@ void SystemMonitorsImpl::getAllMonitors()
     }
 }
 
-
 } // namespace OCIO_NAMESPACE

From 790856f03569b52179eab0fe02f9b7c1915c190a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Achard?= <remiachard@gmail.com>
Date: Wed, 4 Jan 2023 21:14:21 +0100
Subject: [PATCH 43/81] Fix minizip-ng CMake args passing (#1741)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rémi Achard <remiachard@gmail.com>

Signed-off-by: Rémi Achard <remiachard@gmail.com>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 share/cmake/modules/Findminizip-ng.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/share/cmake/modules/Findminizip-ng.cmake b/share/cmake/modules/Findminizip-ng.cmake
index b0136abc5f..0915ad8b18 100644
--- a/share/cmake/modules/Findminizip-ng.cmake
+++ b/share/cmake/modules/Findminizip-ng.cmake
@@ -193,8 +193,8 @@ if(NOT minizip_FOUND AND NOT TARGET MINIZIP::minizip)
             "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${_minizip-ng_LIB_PREFIX}minizip-ng${_minizip-ng_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
 
         if(_minizip-ng_TARGET_CREATE)
-            set(MINIZIP-NG_CMAKE_ARGS
-                ${MINIZIP-NG_CMAKE_ARGS}
+            set(minizip-ng_CMAKE_ARGS
+                ${minizip-ng_CMAKE_ARGS}
                 -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
                 -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
                 -DCMAKE_POSITION_INDEPENDENT_CODE=ON
@@ -265,7 +265,7 @@ if(NOT minizip_FOUND AND NOT TARGET MINIZIP::minizip)
             GIT_SHALLOW TRUE
             PREFIX "${_EXT_BUILD_ROOT}/libminizip-ng"
             BUILD_BYPRODUCTS ${minizip-ng_LIBRARY}
-            CMAKE_ARGS ${MINIZIP-NG_CMAKE_ARGS}
+            CMAKE_ARGS ${minizip-ng_CMAKE_ARGS}
             EXCLUDE_FROM_ALL TRUE
             BUILD_COMMAND ""
             INSTALL_COMMAND

From cd7ed8221920b1ad104d824847da042dfadd8ae9 Mon Sep 17 00:00:00 2001
From: Doug Walker <doug.walker@autodesk.com>
Date: Wed, 4 Jan 2023 16:00:36 -0500
Subject: [PATCH 44/81] Fix inverse Lut1D optimization bug (#1743)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Doug Walker <doug.walker@autodesk.com>

Signed-off-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 src/OpenColorIO/OpOptimizers.cpp          | 34 +++++++++-
 src/OpenColorIO/ops/lut1d/Lut1DOpData.cpp | 80 +++++++++++++++++++++++
 src/OpenColorIO/ops/lut1d/Lut1DOpData.h   |  2 +
 tests/cpu/OpOptimizers_tests.cpp          | 57 ++++++++++++++++
 4 files changed, 172 insertions(+), 1 deletion(-)

diff --git a/src/OpenColorIO/OpOptimizers.cpp b/src/OpenColorIO/OpOptimizers.cpp
index 8b7e2c2fc7..7788bfb164 100755
--- a/src/OpenColorIO/OpOptimizers.cpp
+++ b/src/OpenColorIO/OpOptimizers.cpp
@@ -12,6 +12,7 @@
 #include "Op.h"
 #include "ops/lut1d/Lut1DOp.h"
 #include "ops/lut3d/Lut3DOp.h"
+#include "ops/range/RangeOp.h"
 
 namespace OCIO_NAMESPACE
 {
@@ -241,7 +242,38 @@ int RemoveInverseOps(OpRcPtrVec & opVec, OptimizationFlags oFlags)
             // When a pair of inverse ops is removed, we want the optimized ops to give the
             // same result as the original.  For certain ops such as Lut1D or Log this may
             // mean inserting a Range to emulate the clamping done by the original ops.
-            auto replacedBy = op1->getIdentityReplacement();
+
+            OpRcPtr replacedBy;
+            if (type1 == OpData::Lut1DType)
+            {
+                // Lut1D gets special handling so that both halfs of the pair are available.
+                // Only the inverse LUT has the values needed to generate the replacement.
+
+                ConstLut1DOpDataRcPtr lut1 = OCIO_DYNAMIC_POINTER_CAST<const Lut1DOpData>(op1->data());
+                ConstLut1DOpDataRcPtr lut2 = OCIO_DYNAMIC_POINTER_CAST<const Lut1DOpData>(op2->data());
+
+                OpDataRcPtr opData = lut1->getPairIdentityReplacement(lut2);
+
+                OpRcPtrVec ops;
+                if (opData->getType() == OpData::MatrixType)
+                {
+                    // No-op that will be optimized.
+                    auto mat = OCIO_DYNAMIC_POINTER_CAST<MatrixOpData>(opData);
+                    CreateMatrixOp(ops, mat, TRANSFORM_DIR_FORWARD);
+                }
+                else if (opData->getType() == OpData::RangeType)
+                {
+                    // Clamping op.
+                    auto range = OCIO_DYNAMIC_POINTER_CAST<RangeOpData>(opData);
+                    CreateRangeOp(ops, range, TRANSFORM_DIR_FORWARD);
+                }
+                replacedBy = ops[0];
+            }
+            else
+            {
+                replacedBy = op1->getIdentityReplacement();
+            }
+
             replacedBy->finalize();
             if (replacedBy->isNoOp())
             {
diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpData.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpData.cpp
index 899981615a..a12a7a3e1f 100644
--- a/src/OpenColorIO/ops/lut1d/Lut1DOpData.cpp
+++ b/src/OpenColorIO/ops/lut1d/Lut1DOpData.cpp
@@ -279,6 +279,86 @@ OpDataRcPtr Lut1DOpData::getIdentityReplacement() const
     return res;
 }
 
+OpDataRcPtr Lut1DOpData::getPairIdentityReplacement(ConstLut1DOpDataRcPtr & lut2) const
+{
+    OpDataRcPtr res;
+    if (isInputHalfDomain())
+    {
+        // TODO: If a half-domain LUT has a flat spot, it would be more appropriate
+        // to use a Range, since some areas would be clamped in a round-trip.
+        // Currently leaving this a Matrix since it is a potential work-around
+        // for situations where you want a pair identity of LUTs to be totally
+        // removed, even if it omits some clamping at extreme values.
+        res = std::make_shared<MatrixOpData>();
+    }
+    else
+    {
+        // Note that the ops have been finalized by the time this is called,
+        // Therefore, for an inverse Lut1D, it means initializeFromForward() has
+        // been called and so any reversals have been converted to flat regions.
+        // Therefore, the first and last LUT entries are the extreme values and
+        // the ComponentProperties has been initialized, but only for the op
+        // whose direction is INVERSE.
+        const Lut1DOpData * invLut = (m_direction == TRANSFORM_DIR_INVERSE)
+            ? this: lut2.get();
+        const ComponentProperties & redProperties = invLut->getRedProperties();
+        const unsigned long length = invLut->getArray().getLength();
+
+        // If the start or end of the LUT contains a flat region, that will cause
+        // a round-trip to be limited.
+
+        double minValue = 0.;
+        double maxValue = 1.;
+        switch (m_direction)
+        {
+        case TRANSFORM_DIR_FORWARD: // Fwd Lut1D -> Inv Lut1D
+        {
+            // A round-trip in this order will impose at least a clamp to [0,1]
+            // based on what happens entering the first Fwd Lut1D.  However, the
+            // clamping may be to an even narrower range if there are flat regions.
+            //
+            // The flat region limitation is imposed based on the where it falls
+            // relative to the [0,1] input domain.
+
+            // TODO: A RangeOp has one min & max for all channels, whereas a Lut1D may 
+            // have three independent channels.  Potentially could look at all chans
+            // and take the extrema of each?  For now, just using the first channel.
+            const unsigned long minIndex = redProperties.startDomain;
+            const unsigned long maxIndex = redProperties.endDomain;
+
+            minValue = (double)minIndex / (length - 1);
+            maxValue = (double)maxIndex / (length - 1);
+            break;
+        }
+        case TRANSFORM_DIR_INVERSE: // Inv Lut1D -> Fwd Lut1D
+        {
+            // A round-trip in this order will impose a clamp, but it may be to
+            // bounds outside of [0,1] since the Fwd LUT may contain values outside
+            // [0,1] and so the Inv LUT will accept inputs on that extended range.
+            //
+            // The flat region limitation is imposed based on the output range.
+
+            const bool isIncreasing = redProperties.isIncreasing;
+            const unsigned long maxChannels = invLut->getArray().getMaxColorComponents();
+            const unsigned long lastValIndex = (length - 1) * maxChannels;
+            // Note that the array for the invLut has had initializeFromForward()
+            // done and so any reversals have been converted to flat regions and
+            // the extrema are at the beginning & end of the LUT.
+            const Array::Values & lutValues = invLut->getArray().getValues();
+
+            // TODO: Currently only basing this on the red channel.
+            minValue = isIncreasing ? lutValues[0] : lutValues[lastValIndex];
+            maxValue = isIncreasing ? lutValues[lastValIndex] : lutValues[0];
+            break;
+        }
+        }
+
+        res = std::make_shared<RangeOpData>(minValue, maxValue,
+                                            minValue, maxValue);
+    }
+    return res;
+}
+
 void Lut1DOpData::setInputHalfDomain(bool isHalfDomain) noexcept
 {
     m_halfFlags = (isHalfDomain) ?
diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpData.h b/src/OpenColorIO/ops/lut1d/Lut1DOpData.h
index a536b5a460..d06ca8da6d 100644
--- a/src/OpenColorIO/ops/lut1d/Lut1DOpData.h
+++ b/src/OpenColorIO/ops/lut1d/Lut1DOpData.h
@@ -179,6 +179,8 @@ class Lut1DOpData : public OpData
 
     OpDataRcPtr getIdentityReplacement() const override;
 
+    OpDataRcPtr getPairIdentityReplacement(ConstLut1DOpDataRcPtr & lut2) const;
+
     inline const ComponentProperties & getRedProperties() const
     {
         return m_componentProperties[0];
diff --git a/tests/cpu/OpOptimizers_tests.cpp b/tests/cpu/OpOptimizers_tests.cpp
index b3305f4def..6f606fd6e8 100644
--- a/tests/cpu/OpOptimizers_tests.cpp
+++ b/tests/cpu/OpOptimizers_tests.cpp
@@ -645,6 +645,63 @@ OCIO_ADD_TEST(OpOptimizers, lut1d_identity_replacement)
     }
 }
 
+OCIO_ADD_TEST(OpOptimizers, lut1d_identity_replacement_order)
+{
+    // See issue #1737, https://github.com/AcademySoftwareFoundation/OpenColorIO/issues/1737.
+
+    // This CTF contains a single LUT1D, inverse direction, normal (not half) domain.
+    // It contains values from -6 to +3.4.
+    const std::string fileName("lut1d_inverse_gpu.ctf");
+    OCIO::ContextRcPtr context = OCIO::Context::Create();
+
+    OCIO::OpRcPtrVec inv_ops;
+    OCIO_CHECK_NO_THROW(OCIO::BuildOpsTest(inv_ops, fileName, context,
+                                        // FWD direction simply means don't swap the direction, the
+                                        // file contains an inverse LUT1D and leave it that way.
+                                           OCIO::TRANSFORM_DIR_FORWARD));
+    OCIO::OpRcPtrVec fwd_ops;
+    OCIO_CHECK_NO_THROW(OCIO::BuildOpsTest(fwd_ops, fileName, context,
+                                           OCIO::TRANSFORM_DIR_INVERSE));
+
+    // Check forward LUT1D followed by inverse LUT1D.
+    {
+        OCIO::OpRcPtrVec fwd_inv_ops = fwd_ops;
+        fwd_inv_ops += inv_ops;
+
+        OCIO_CHECK_NO_THROW(fwd_inv_ops.finalize());
+        OCIO_CHECK_NO_THROW(fwd_inv_ops.optimize(OCIO::OPTIMIZATION_NONE));
+        OCIO_CHECK_EQUAL(fwd_inv_ops.size(), 2);  // no optmization was done
+
+        OCIO::OpRcPtrVec optOps = fwd_inv_ops.clone();
+        OCIO_CHECK_NO_THROW(optOps.finalize());
+        OCIO_CHECK_NO_THROW(optOps.optimize(OCIO::OPTIMIZATION_DEFAULT));
+        OCIO_CHECK_EQUAL(optOps.size(), 1);
+        OCIO_CHECK_EQUAL(optOps[0]->getInfo(), "<RangeOp>");
+
+        // Compare renders.
+        CompareRender(fwd_inv_ops, optOps, __LINE__, 1e-6f);
+    }
+
+    // Check inverse LUT1D followed by forward LUT1D.
+    {
+        OCIO::OpRcPtrVec inv_fwd_ops = inv_ops;
+        inv_fwd_ops += fwd_ops;
+
+        OCIO_CHECK_NO_THROW(inv_fwd_ops.finalize());
+        OCIO_CHECK_NO_THROW(inv_fwd_ops.optimize(OCIO::OPTIMIZATION_NONE));
+        OCIO_CHECK_EQUAL(inv_fwd_ops.size(), 2);  // no optmization was done
+
+        OCIO::OpRcPtrVec optOps = inv_fwd_ops.clone();
+        OCIO_CHECK_NO_THROW(optOps.finalize());
+        OCIO_CHECK_NO_THROW(optOps.optimize(OCIO::OPTIMIZATION_DEFAULT));
+        OCIO_CHECK_EQUAL(optOps.size(), 1);
+        OCIO_CHECK_EQUAL(optOps[0]->getInfo(), "<RangeOp>");
+
+        // Compare renders.
+        CompareRender(inv_fwd_ops, optOps, __LINE__, 1e-6f);
+    }
+}
+
 OCIO_ADD_TEST(OpOptimizers, lut1d_half_domain_keep_prior_range)
 {
     // A half-domain LUT should not allow removal of a prior range op.

From 9009c339b0a160bd80134e5dfe4ddbecf6f34bd7 Mon Sep 17 00:00:00 2001
From: Doug Walker <doug.walker@autodesk.com>
Date: Wed, 4 Jan 2023 16:44:23 -0500
Subject: [PATCH 45/81] Update documentation for 2.2 release (#1738)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Update documentation for 2.2 release

Signed-off-by: Doug Walker <doug.walker@autodesk.com>

* Fix typos

Signed-off-by: Doug Walker <doug.walker@autodesk.com>

* Remove V2_DOC_README.md

Signed-off-by: Doug Walker <doug.walker@autodesk.com>

* Tweaks to 2.2

Signed-off-by: Doug Walker <doug.walker@autodesk.com>

* Improve installation section

Signed-off-by: Doug Walker <doug.walker@autodesk.com>

* Add link to demo config

Signed-off-by: Doug Walker <doug.walker@autodesk.com>

* Fix typos

Signed-off-by: Doug Walker <doug.walker@autodesk.com>

* Improve file_rules section

Signed-off-by: Doug Walker <doug.walker@autodesk.com>

Signed-off-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 COMMITTERS.md                                 |   1 +
 CONTRIBUTING.md                               |  41 +-
 docs/V2_DOC_README.md                         | 106 -----
 docs/concepts/publications/publications.rst   |  27 +-
 docs/configurations/_index.rst                |   6 +-
 docs/configurations/aces_1.0.3.rst            |   8 +-
 docs/configurations/aces_cg.rst               |  42 ++
 docs/configurations/aces_studio.rst           |  41 ++
 docs/configurations/ocio_v2_demo.rst          |   4 +
 docs/guides/authoring/colorspaces.rst         |  18 +-
 docs/guides/authoring/overview.rst            |  19 +-
 docs/guides/authoring/rules.rst               |  57 ++-
 docs/guides/authoring/transforms.rst          |   1 +
 .../contributing/documentation_guidelines.rst | 102 ++++-
 .../contributing/repository_structure.rst     |  28 +-
 .../contributing/submitting_changes.rst       |  10 +-
 docs/guides/using_ocio/tool_overview.rst      |  48 ++-
 docs/index.rst                                |  25 +-
 docs/quick_start/downloads.rst                |  39 +-
 docs/quick_start/for_artists.rst              |   9 +-
 docs/quick_start/for_config_authors.rst       |   9 +-
 docs/quick_start/for_contributors.rst         |   5 +-
 docs/quick_start/for_devs.rst                 |  18 +-
 docs/quick_start/installation.rst             | 374 +++++++++++------
 docs/{upgrading_v2 => releases}/_index.rst    |   8 +-
 .../how_to.rst => releases/ocio_2_0.rst}      |  24 +-
 docs/releases/ocio_2_1.rst                    | 173 ++++++++
 docs/releases/ocio_2_2.rst                    | 395 ++++++++++++++++++
 docs/toc_redirect.rst                         |   2 +-
 include/OpenColorIO/OpenColorIO.h             |  26 +-
 include/OpenColorIO/OpenColorTypes.h          |   5 +-
 31 files changed, 1259 insertions(+), 412 deletions(-)
 delete mode 100644 docs/V2_DOC_README.md
 create mode 100644 docs/configurations/aces_cg.rst
 create mode 100644 docs/configurations/aces_studio.rst
 rename docs/{upgrading_v2 => releases}/_index.rst (74%)
 rename docs/{upgrading_v2/how_to.rst => releases/ocio_2_0.rst} (97%)
 create mode 100644 docs/releases/ocio_2_1.rst
 create mode 100644 docs/releases/ocio_2_2.rst

diff --git a/COMMITTERS.md b/COMMITTERS.md
index e7da40cb39..23ed878e73 100644
--- a/COMMITTERS.md
+++ b/COMMITTERS.md
@@ -20,3 +20,4 @@ The current OpenColorIO Committers are:
 | Doug Walker | @doug-walker |
 | Kevin Wheatley | @KevinJW |
 | Rémi Achard | @remia |
+| Cédrik Fuoco | @cedrik-fuoco-adsk |
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 684e09a7a1..0d0a442c4b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -19,7 +19,7 @@ explains our contribution process and procedures, so please review it first:
 For a description of the roles and responsibilities of the various
 members of the OpenColorIO community, see [GOVERNANCE](GOVERNANCE.md), and
 for further details, see the project's
-[Technical Charter](docs/aswf/Charter.md). Briefly, Contributors are anyone
+[Technical Charter](ASWF/Charter.md). Briefly, Contributors are anyone
 who submits content to the project, Committers review and approve such
 submissions, and the Technical Steering Committee provides general project
 oversight.
@@ -214,18 +214,20 @@ the work.
 a Committer other than the PR contributor may squash and merge changes into the 
 main branch.
 
-See also (from the OCIO Developer Guide):
-* [Getting started](http://opencolorio.org/developers/getting_started.html)
-* [Submitting Changes](http://opencolorio.org/developers/submitting_changes.html)
+For a more detailed description of the contribution process, please see the 
+Contributing Guide in the main OCIO documentation:
+
+* [Getting Started](https://opencolorio.readthedocs.io/en/latest/guides/contributing/contributing.html#getting-started)
+* [Submitting Changes](https://opencolorio.readthedocs.io/en/latest/guides/contributing/contributing.html#submitting-changes)
 
 ## Coding Standards
 
 Please see the OpenColorIO
-[Coding guidelines](http://opencolorio.org/developers/coding_guidelines.html)
+[Coding guidelines](https://opencolorio.readthedocs.io/en/latest/guides/contributing/contributing.html#coding-style-guide)
 for a reference on project code style and best practices.
 
 For standards on contributing to documentation, see the
-[Documentation guidelines](http://opencolorio.org/developers/documentation_guidelines.html).
+[Documentation guidelines](https://opencolorio.readthedocs.io/en/latest/guides/contributing/contributing.html#documentation-guidelines).
 
 ## Test Policy
 
@@ -245,27 +247,8 @@ The test should should be run, via ``ctest``, before submitting a pull request.
 
 ## Versioning Policy
 
-OpenColorIO uses [semantic versioning](https://semver.org), which labels each
-version with three numbers: Major.Minor.Patch, where:
-
-* **MAJOR** indicates incompatible API changes
-* **MINOR** indicates functionality added in a backwards-compatible manner
-* **PATCH** indicates backwards-compatible bug fixes
-
-## Creating a Release
-
-To create a new release from the main branch:
-
-1. Update the release notes in ``CHANGELOG.md`` with a high-level summary of
-   the features and improvements. Also include the summary in the Release
-   comments.
-
-2. Create a new release on the GitHub Releases page.
-
-3. Tag the release with name beginning with '``v``', e.g. '``v2.1.0``'.
-
-4. Download and sign the release tarball, as described
-   [here](https://wiki.debian.org/Creating%20signed%20GitHub%20releases),
+OpenColorIO labels each version with three numbers: Major.Minor.Patch, where:
 
-5. Attach the detached ``.asc`` signature file to the GitHub release as a
-   binary file.
+* **MAJOR** indicates major architectural changes
+* **MINOR** indicates an introduction of significant new features
+* **PATCH** indicates ABI-compatible bug fixes and minor enhancements
diff --git a/docs/V2_DOC_README.md b/docs/V2_DOC_README.md
deleted file mode 100644
index 45bb946eeb..0000000000
--- a/docs/V2_DOC_README.md
+++ /dev/null
@@ -1,106 +0,0 @@
-# OCIO v2 Documentation Effort
-
-Since the API autodoc effort is happening simultaneously it will be easier for
-a lot of contributors to do a light-weight build of the Sphinx docs, which
-doesn't require building OCIO in its entirety.
-
-## How to Build HTML Docs
-
-In addition to the general OCIO v2 dependencies, install Doxygen, Sphinx, and 
-the Python packages listed in the `docs/requirements.txt` file.
-
-Doxygen can be installed with a package manager, or by downloading a binary 
-distribution from [Doxygen's website](https://www.doxygen.nl/download.html). 
-Use one of the following commands to install Doxygen with a platform-specific 
-package manager:
-
-Linux (Debian, etc.):
-
-```
-apt-get install doxygen
-```
-
-Linux (Fedora, etc.):
-
-```
-yum install doxygen
-```
-
-macOS ([Homebrew](https://brew.sh/)):
-
-```
-brew install doxygen
-```
-
-Windows ([Chocolatey](https://chocolatey.org/)):
-
-```
-choco install doxygen.install
-```
-
-Sphinx and the other Python dependencies can be installed with a single `pip` 
-command on all platforms:
-
-```
-pip install -r docs/requirements.txt
-```
-
-To build only the RST docs (excluding API) locally:
-
-```
-cd docs
-mkdir _build
-sphinx-build -b html . _build
-<your web browser name> _build/index.html
-```
-
-To build all HTML docs (including API) locally:
-
-```
-mkdir _build
-cd _build
-cmake ../. -DOCIO_BUILD_DOCS=ON -DCMAKE_INSTALL_PREFIX=../_install
-make -j<your thread count> docs
-<your web browser name> docs/build-html/index.html
-```
-
-Python 3 is required to build the documentation. If you have multiple Python
-installs you'll need to make sure pip and CMake find the correct version. You 
-can manually inform cmake of which to use by adding this option to the above 
-`cmake` command, which configures the documentation build:
-
-```
--DPython_ROOT=<Path to Python 3 root directory>
-```
-
-To ease the documentation build and test on macOS one could also add these 
-three lines in the ~/.zshrc file:
-
-```
-alias python=/usr/local/bin/python3
-alias pip=/usr/local/bin/pip3
-alias <your web browser name>=open -a "<your web browser application name>"
-```
-
-## Quirks
-
-The vuepress theme that we've migrated to has some quirks to its design. For
-example it only allows two nested table of contents (TOC). So things have to be
-organized in a slightly different way than other sphinx projects.
-
-The root-level `toc_redirect.rst` points to where to find the different section
-TOCs. The name and contents of each sections TOC is defined in that
-sub-directory's `_index.rst` file.
-
-In this TOC the `:caption:` directive determines what the name of the section
-will be in the side-bar, and in the header of the website. The *H1* header
-determines the name of the page in the right/left arrows navigation bar. In a
-lot of cases this ends up doubling up the name on the page, but this seems
-unavoidable at the present time. If additional explanatory text is put in the
-`_index.rst` files then it shouldn't be as problematic.
-
-The site will show all *H1* headers in the side panel by default, these then
-expand when selected to show all *H2* headers.
-
-Due to the limited TOC and side-bar depth, we shouldn't be afraid of looong
-pages with many *H2* headings to break down the page into logical quadrants.
diff --git a/docs/concepts/publications/publications.rst b/docs/concepts/publications/publications.rst
index fe64c6f2f8..dfa8fcf778 100644
--- a/docs/concepts/publications/publications.rst
+++ b/docs/concepts/publications/publications.rst
@@ -4,11 +4,28 @@
 
 .. _publications:
 
-Publications
-============
+Presentations & Publications
+============================
 
-DigiPro 2020 **paper** `"The ASWF takes OpenColorIO to the Next Level" <https://drive.google.com/file/d/1y_0ZEftivHH0zoKwvKZx3gH_u_cDm1bD/view?usp=sharing>`_
+* OCIO wins the Pipeline Tool award at DigiPro 2022! `video <https://vimeo.com/742052701>`_
 
-DigiPro 2020 **video** `"The ASWF takes OpenColorIO to the Next Level" <https://vimeo.com/458011669>`_
+* ASWF Open Source Days 2022 `video <https://www.youtube.com/watch?v=WzFlz1HeNdI>`_
 
-`Cinematic Color <http://cinematiccolor.org/>`_
+* HPA Tech Retreat 2022 "Color Processing with OCIO v2 and the Academy/ASC Common LUT Format"
+  `Slides & Tutorial <https://drive.google.com/drive/folders/1ZRTIYE6kFb-7sTUtYIYFNJndhY6tmJee?usp=share_link>`_
+
+* OCIO v2 wins an Engineering Excellence award from the Hollywood Post Alliance! 
+  `article <https://www.hollywoodreporter.com/movies/movie-news/hpa-engineering-excellence-award-recipients-announced-1234995410/>`_
+
+* SIGGRAPH 2021 course "Color management with OpenColorIO V2" 
+  `video <https://vimeo.com/689093714>`__  `PDF <https://drive.google.com/file/d/1v37Bz7s1wbJNg-ULsBqnGPA9h7pWNfhU/view?usp=share_link>`__  `ACM <https://dl.acm.org/doi/10.1145/3450508.3464600>`_
+
+* ASWF Open Source Days 2021 `video <https://youtu.be/FSzLwSTJjWo>`_
+
+* DigiPro 2020 "The ASWF takes OpenColorIO to the Next Level" `video <https://vimeo.com/458011669>`__  `PDF <https://drive.google.com/file/d/1y_0ZEftivHH0zoKwvKZx3gH_u_cDm1bD/view?usp=sharing>`__  `ACM <https://dl.acm.org/doi/abs/10.1145/3403736.3403942>`_
+
+* ASWF Open Source Days 2020 `video <https://www.youtube.com/watch?v=7e0SSka8Ar8>`_
+
+* ASWF Open Source Days 2019 `video <https://youtu.be/L5dpFtgZuhQ>`_
+
+* `Cinematic Color <http://cinematiccolor.org/>`_
diff --git a/docs/configurations/_index.rst b/docs/configurations/_index.rst
index 2b7556688e..734cf350bd 100644
--- a/docs/configurations/_index.rst
+++ b/docs/configurations/_index.rst
@@ -10,8 +10,10 @@ Configurations
 .. toctree::
    :caption: Configurations
 
+   aces_studio
+   aces_cg
+   ocio_v2_demo
    aces_1.0.3
-   nuke_default
    spi_anim
    spi_vfx
-   ocio_v2_demo
+   nuke_default
diff --git a/docs/configurations/aces_1.0.3.rst b/docs/configurations/aces_1.0.3.rst
index b8ae17d946..ace8b95a0a 100644
--- a/docs/configurations/aces_1.0.3.rst
+++ b/docs/configurations/aces_1.0.3.rst
@@ -2,17 +2,21 @@
   SPDX-License-Identifier: CC-BY-4.0
   Copyright Contributors to the OpenColorIO Project.
 
+.. _aces_1.0.3:
+
 aces_1.0.3
 ==========
 
 This section describes the ACES 1.0.3 OpenColorIO configuration.
 
 .. note::
-    There is a more recent version of the ACES config for ACES 1.2 available here:
+    There is a more recent version of the ACES 1.0.3 config for ACES 1.2 available here:
 
     - https://github.com/colour-science/OpenColorIO-Configs/tree/feature/aces-1.2-config
 
-    An ACES config that takes advantage of the new OCIO v2 features is under development.
+.. note::
+    Please note that both of these configurations have been supplanted by the new 
+    :ref:`aces_studio` for OCIO v2.
 
 
 Information about ACES
diff --git a/docs/configurations/aces_cg.rst b/docs/configurations/aces_cg.rst
new file mode 100644
index 0000000000..d134b62b44
--- /dev/null
+++ b/docs/configurations/aces_cg.rst
@@ -0,0 +1,42 @@
+..
+  SPDX-License-Identifier: CC-BY-4.0
+  Copyright Contributors to the OpenColorIO Project.
+
+.. _aces_cg:
+
+ACES CG Config
+==============
+
+The ACES Computer Graphics (CG) config is a simple, lightweight config intended for use
+in typical digital content creation (DCC) apps that need robust choices for texture and
+rendering spaces and a basic selection of display and view transforms.
+
+Users who need a more extensive set of color spaces, including digital cinema camera
+color spaces and a wider set of displays and view should look at the :ref:`aces_studio`.
+
+The latest version of this config may be downloaded from the Releases page of its GitHub
+`repo. <https://github.com/AcademySoftwareFoundation/OpenColorIO-Config-ACES/releases>`_
+
+The ACES CG Config leverages the high quality ACES implementation built into OCIO itself
+and so requires no external LUT files.  In fact, even the config file is built into OCIO
+and users may access it from any application that uses OCIO 2.2 or higher by using the
+following string in place of the config path::
+    ocio://cg-config-v1.0.0_aces-v1.3_ocio-v2.1
+
+The default built-in config is currently the ACES CG Config, so the even simpler:
+``ocio://default`` may be used instead.  Note however, that the value of the default
+config may evolve over time.
+
+The OCIO Configs Working Group collected input from the community and simplified the
+naming scheme relative to the earlier OCIO v1 ACES configs.  However, aliases have been 
+added so that the original color space names continue to work (if there is an equivalent
+space in the new config).
+
+Please note that with OCIO v2 we are trying to be more rigorous about what constitutes a 
+"color space". For this reason, the new configs do not bake view transforms or looks into 
+the display color spaces.  Therefore, it is necessary to use a DisplayViewTransform rather 
+than a ColorSpaceTransform if you want to bake in an ACES Output Transform.  This is not 
+only more rigorous from a color management point of view, it also helps clarify to end-users 
+the important role of a view transform in the process.  Baking in a view transform is a 
+fundamentally different process than just converting between color space encodings, and it 
+should be perceived as such by users.
diff --git a/docs/configurations/aces_studio.rst b/docs/configurations/aces_studio.rst
new file mode 100644
index 0000000000..0f8132fc58
--- /dev/null
+++ b/docs/configurations/aces_studio.rst
@@ -0,0 +1,41 @@
+..
+  SPDX-License-Identifier: CC-BY-4.0
+  Copyright Contributors to the OpenColorIO Project.
+
+.. _aces_studio:
+
+ACES Studio Config
+==================
+
+The ACES Studio Config is the successor to the widely used :ref:`ACES config <aces_1.0.3>`
+for OCIO v1.
+
+It contains the complete set of ACES color spaces, displays, and views.  In addition, it
+contains some extra color spaces that are widely used in the VFX and post-production 
+industries.
+
+Users who need a simpler config that contains just the basics needed to use ACES color
+management in common DCC tools are encouraged to check out the :ref:`aces_cg`.
+
+The latest version of this config may be downloaded from the Releases page of its GitHub
+`repo. <https://github.com/AcademySoftwareFoundation/OpenColorIO-Config-ACES/releases>`_
+
+The ACES Studio Config leverages the high quality ACES implementation built into OCIO itself
+and so requires no external LUT files.  In fact, even the config file is built into OCIO
+and users may access it from any application that uses OCIO 2.2 or higher by using the
+following string in place of the config path::
+    ocio://studio-config-v1.0.0_aces-v1.3_ocio-v2.1
+
+The OCIO Configs Working Group collected input from the community and simplified the
+naming scheme relative to the earlier OCIO v1 ACES configs.  However, aliases have been 
+added so that the original color space names continue to work (if there is an equivalent
+space in the new config).
+
+Please note that with OCIO v2 we are trying to be more rigorous about what constitutes a 
+"color space". For this reason, the new configs do not bake view transforms or looks into 
+the display color spaces.  Therefore, it is necessary to use a DisplayViewTransform rather 
+than a ColorSpaceTransform if you want to bake in an ACES Output Transform.  This is not 
+only more rigorous from a color management point of view, it also helps clarify to end-users 
+the important role of a view transform in the process.  Baking in a view transform is a 
+fundamentally different process than just converting between color space encodings, and it 
+should be perceived as such by users.
diff --git a/docs/configurations/ocio_v2_demo.rst b/docs/configurations/ocio_v2_demo.rst
index 22c072c078..52f098f19a 100644
--- a/docs/configurations/ocio_v2_demo.rst
+++ b/docs/configurations/ocio_v2_demo.rst
@@ -10,6 +10,10 @@ ocio-v2_demo
 Note: this is not intended to be a complete production-ready config, its purpose
 is to introduce many of the new features in OCIO v2.
 
+Due to the limitations of the web template, it may be easier for you to read this
+config in a text editor.  The config file is located 
+`here. <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/docs/configurations/ocio-v2_demo.ocio>`_ 
+
 
 .. literalinclude:: ocio-v2_demo.ocio
    :language: yaml
diff --git a/docs/guides/authoring/colorspaces.rst b/docs/guides/authoring/colorspaces.rst
index 27684f1515..b5cfce0239 100644
--- a/docs/guides/authoring/colorspaces.rst
+++ b/docs/guides/authoring/colorspaces.rst
@@ -15,19 +15,19 @@ Colorspaces
 
 Required.
 
-This section is a list of all the colorspaces known to OCIO. A
-colorspace can be referred to elsewhere within the config (including
+This section is a list of the scene-referred colorspaces in the config.
+A colorspace may be referred to elsewhere within the config (including
 other colorspace definitions), and are used within OCIO-supporting
 applications.
 
-A color space may use the following keys:
+A colorspace may use the following keys:
 
 
 ``to_scene_reference`` and ``from_scene_reference``
 ---------------------------------------------------
 
 These keys specify the transforms that define the relationship between
-the color space and the scene-referred reference space.
+the colorspace and the scene-referred reference space.
 
 Note: In OCIO v1, the keys ``to_reference`` and ``from_reference`` were
 used (since there was only one reference space).  These are still supported
@@ -404,10 +404,10 @@ compatibility.
 
 Optional.
 
-These two options are used when OCIO transforms are applied on the
-GPU.
+These two options were used in OCIO v1 when transforms were applied on the
+GPU.  However, the new GPU renderer in OCIO v2 does not need these.
 
-It is also used to automatically generate a "shaper LUT" when
+However, they may still be used to automatically generate a "shaper LUT" when
 :ref:`baking LUT's <userguide-bakelut>` unless one is explicitly
 specified (not all output formats utilise this)
 
@@ -469,6 +469,8 @@ It's common to use literal ``|`` block syntax to preserve all newlines:
         This is the second.
 
 
+.. _config-display-colorspaces:
+
 Display Colorspaces
 *******************
 
@@ -477,7 +479,7 @@ Display Colorspaces
 
 Optional.
 
-This section is a list of all the display colorspaces known to OCIO. 
+This section is a list of all the display-referred colorspaces in the config. 
 A display colorspace is very similar to a colorspace except its transforms
 go from or to the display-referred reference space rather than the
 scene-referred reference space.
diff --git a/docs/guides/authoring/overview.rst b/docs/guides/authoring/overview.rst
index 7df06e669c..7d96a97508 100644
--- a/docs/guides/authoring/overview.rst
+++ b/docs/guides/authoring/overview.rst
@@ -18,6 +18,12 @@ This page alone will not help you to write a useful config file! See
 the :ref:`configurations` section for examples of complete, practical
 configs, and discussion of how they fit within a facilities workflow.
 
+Please note that you should use the OCIO Python or C++ API to generate
+the config.ocio file rather than writing the YAML by hand in a text editor.
+However, if you do ever modify YAML by hand rather than via the API, you
+should run :ref:`overview-ociocheck` on it to ensure that the syntax is
+correct.
+
 YAML basics
 ***********
 
@@ -29,7 +35,7 @@ has a good overview.
 
 OCIO configs typically use a small subset of YAML, so looking at
 existing configs is probably the quickest way to familiarise yourself
-(just remember the indentation is important!)
+(just remember the indentation is important!).
 
 Checking for errors
 *******************
@@ -59,8 +65,12 @@ An OCIO config has the following sections:
 * :ref:`config-displays-views` -- This section defines how color spaces should be viewed.
 * :ref:`config-looks` -- Looks are transforms used to adjust colors, such as to apply a
   creative effect.
-* :ref:`config-colorspaces` -- This section defines the universe of color space encodings
+* :ref:`config-colorspaces` -- This section defines the scene-referred color space encodings
   available within the config.
+* :ref:`config-display-colorspaces` -- This section defines the display-referred color space
+  encodings available within the config.
+* :ref:`config-named-transforms` -- Named Transforms are a way to provide transforms that
+  do not have a fixed relationship to a specific reference space, such as a utility curve.
 
 A collection of :ref:`config-transforms` is provided for use in the various sections
 of the config file.
@@ -99,11 +109,12 @@ Optional. A brief description of the configuration.
 ``name``
 ^^^^^^^^
 
-Optional. A unique name for the config.
+Optional. A unique name for the config.  Future versions of OCIO might use this as a
+sort of "namespace" for the color spaces defined in the rest of the config.
 
 .. code-block:: yaml
 
-    name: foo_2021-02-01
+    name: studio-config-v1.0.0_aces-v1.3_ocio-v2.1
 
 
 ``search_path``
diff --git a/docs/guides/authoring/rules.rst b/docs/guides/authoring/rules.rst
index 3c557dbf4a..8fbc35775d 100644
--- a/docs/guides/authoring/rules.rst
+++ b/docs/guides/authoring/rules.rst
@@ -16,7 +16,9 @@ File & Viewing rules
 ``file_rules``
 ^^^^^^^^^^^^^^
 
-Either file_rules or the default role are Required.
+.. warning::
+    Either the file_rules section or the default role are Required for configs of
+    version 2 or higher.
 
 Use the File Rules to assign a default color space to files based on their path.
 
@@ -24,10 +26,10 @@ Here is example showing the various types of rules that may be defined:
 
 .. code-block:: yaml
 
-  files_rules:
+  file_rules:
     - !<Rule> {name: LogC, extension: "*", pattern: "*LogC*", colorspace: ARRI LogC}
     - !<Rule> {name: OpenEXR, extension: "exr", pattern: "*", colorspace: ACEScg}
-    - !<Rule> {name: TIFF, regex: ".*\.TIF?F$", colorspace: sRGB}
+    - !<Rule> {name: TIFF, regex: ".*\\.TIF?F", colorspace: sRGB}
     - !<Rule> {name: ColorSpaceNamePathSearch}
     - !<Rule> {name: Default, colorspace: default}
 
@@ -42,36 +44,41 @@ other keys depend on the rule type.
 This is the basic rule type that uses Unix glob style pattern matching and is 
 thus very easy to use. It contains the keys:
 
-* ``name``: Name of the rule
+* ``name``: Name of the rule.
 * ``pattern``: Glob pattern to be used for the main part of the name/path.
-  This is case-sensitive.
+  This is case-sensitive.  It must be in double-quotes.  Set it to "*" if you only 
+  want the rule to consider the extension.
 * ``extension``: Glob pattern or string to be used for the file extension. Note that
   if glob tokens are not used, the extension will be used in a non-case-sensitive 
   way by default.  For example the simple string "exr" would match "exr" and "EXR".  
-  If you only want to match "exr", use the glob pattern "[e][x][r]".
+  If you only want to match "exr", use the glob pattern "[e][x][r]".  It must be 
+  in double-quotes.  Set it to "*" if you only want the rule to consider the pattern.
 * ``colorspace``: ColorSpace name to be returned.
 
-2. Regex Rules -- 
+2. RegEx Rules -- 
 This is similar to the basic rule but allows additional capabilities for 
 power-users. It contains the keys:
 
-* ``name``: Name of the rule
-* ``regex``: Regular expression to be evaluated.
+* ``name``: Name of the rule.
+* ``regex``: Regular expression to be evaluated.  It must be in double-quotes.
 * ``colorspace``: ColorSpace name to be returned.
 
+Note that a backslash character in a RegEx expression needs to be doubled up as ``\\``
+(as shown in the example above for the TIFF rule) to make it through the Yaml parsing.
+
 3. OCIO v1 style Rule -- 
 This rule allows the use of the OCIO v1 style, where the string is searched for 
 ColorSpace names from the config. This rule may occur 0 or 1 times in the list. 
 The position in the list prioritizes it with respect to the other rules. It has 
 the key:
 
-* ``name``: Must be "ColorSpaceNamePathSearch".
+* ``name``: Must be ``ColorSpaceNamePathSearch``.
 
 4. Default Rule -- 
-The file_rules must always end with this rule. If no prior rules match, this 
+The file_rules section must always end with this rule. If no prior rules match, this 
 rule specifies the ColorSpace applications will use. It has the keys:
 
-* ``name``: must be "Default".
+* ``name``: must be ``Default``.
 * ``colorspace``: ColorSpace name to be returned.
 
 Note: OCIO v1 defined a ``default`` role intended to specify a default color space
@@ -84,19 +91,24 @@ missing, an exception will be thrown when loading the config.
 
 Note that the strictparsing token does not affect the behavior of the File Rules 
 API. In other words, evaluating the rules will always result in a ColorSpace being 
-available to an application. However, the API alsos allow the application to know 
+available to an application. However, the API also allows the application to know 
 which rule was the matching one. So apps that want to work in "strict" mode should 
 first check if strictparsing is true and if so check to see if the matching rule 
 was the Default Rule. If so, it could then notify the user and take whatever action 
-is appropriate.
+is appropriate.  (As an alternative to checking which rule number was matched, the 
+API call ``filepathOnlyMatchesDefaultRule`` may be used instead.)
 
 Roles may be used rather than ColorSpace names in the rules.
 
-It is also legal for rules  to have additional key:value pairs where the value 
+It is also legal for rules to have additional key:value pairs where the value 
 may be an arbitrary string. The API provides access to getting/setting these 
 additional pairs and will preserve them on a Config read/write.  These may be
 used to define application-specific behavior.
 
+Note to developers: The older ``parseColorSpaceFromString`` API call is now deprecated
+and should be replaced with ``getColorSpaceFromFilepath``.
+
+
 ``strictparsing``
 ^^^^^^^^^^^^^^^^^
 
@@ -107,12 +119,21 @@ Optional. Valid values are ``true`` and ``false``. Default is ``true``
 
     strictparsing: true
 
-OCIO provides a mechanism for applications to extract the colorspace
-from a filename (the ``parseColorSpaceFromString`` API method)
+.. warning::
+    This attribute is from OCIO v1.  In OCIO v2, the FileRules system was
+    introduced and so the ``strictparsing`` attribute is less relevant now.
+    The ``parseColorSpaceFromString`` API call is now deprecated and the
+    proper way to obtain this information is ``getColorSpaceFromFilepath``.
+    The FileRules always return a default color space but the API
+    ``filepathOnlyMatchesDefaultRule`` may be used by applications that
+    want to take some special action if ``strictparsing`` is true.
+
+OCIO v1 provided a mechanism for applications to extract the colorspace
+from a filename (the ``parseColorSpaceFromString`` API method).
 
 So for a file like ``example_render_v001_lnf.0001.exr`` it will
 determine the colorspace ``lnf`` (it being the right-most substring
-containing a colorspace name)
+containing a colorspace name).
 
 However, if the colorspace cannot be determined and ``strictparsing:
 true``, it will return an empty string.
diff --git a/docs/guides/authoring/transforms.rst b/docs/guides/authoring/transforms.rst
index 51dbbe904a..8a5a36db83 100644
--- a/docs/guides/authoring/transforms.rst
+++ b/docs/guides/authoring/transforms.rst
@@ -344,6 +344,7 @@ Keys:
     must also be present and the result is clamped at the high end.
 
 
+.. _config-named-transforms:
 
 Named Transforms
 ****************
diff --git a/docs/guides/contributing/documentation_guidelines.rst b/docs/guides/contributing/documentation_guidelines.rst
index 6749cdc3bd..2cb6758b16 100644
--- a/docs/guides/contributing/documentation_guidelines.rst
+++ b/docs/guides/contributing/documentation_guidelines.rst
@@ -14,18 +14,39 @@ The documentation primarily lives in the ``docs/`` folder, within the
 main OpenColorIO repository.
 
 The rST source for the C++ API documentation is extracted from
-comments in the public header files in ``export/``
+comments in the public header files in ``include/``
 
-The Python API documentation is extracted from dummy .py files within
-the ``src/pyglue/DocStrings/`` folder
+Installation of requirements
+****************************
+
+Scripts are available, for each platform, to install the documentation 
+requirements.
+
+The ``install_docs_env.sh`` script in the share/ci/scripts/<Platform> directory
+will install the Python-related requirements for building the documentation
+(Sphinx, six, testresources, recommonmark, sphinx-press-theme, sphinx-tabs,
+and breathe) and Doxygen.  
+
+Use GitBash (`provided with Git for Windows <https://gitforwindows.org/>`_) to 
+execute the script on Windows.
+
+Python 3 is required to build the documentation. If you have multiple Python
+installs you'll need to make sure pip and CMake find the correct version. You 
+can manually inform CMake of which to use by adding this option to the below 
+`cmake` command, which configures the documentation build:
+
+    -DPython_ROOT=<Path to Python 3 root directory>
+
+For the Python packages, ensure their locations are in your ``PYTHONPATH`` 
+environment variable prior to configuring the build.
 
 Building the docs
 *****************
 
-Just like a :ref:`regular build from source <building-from-source>`,
-but specify the ``-D OCIO_BUILD_DOCS=yes`` argument to CMake.
+The build is just like a :ref:`regular build from source <building-from-source>`,
+but specify the ``-D OCIO_BUILD_DOCS=ON`` argument to CMake.
 
-Then run the ``make doc`` target. The default HTML output will be
+Then run the ``make docs`` target. The default HTML output will be
 created in ``build_dir/docs/build-html/``
 
 Note that CMake must be run before each invocation of ``make`` to copy
@@ -37,7 +58,50 @@ Initial run::
 
 Then after each change you wish to preview::
 
-    $ cmake -D OCIO_BUILD_DOCS=yes .. && make doc
+    $ cmake -D OCIO_BUILD_DOCS=ON .. && make docs
+
+Updating the Python docs
+************************
+
+If a contributor makes changes to any part of OCIO which affects the Python API docs 
+(so, public headers, Python bindings, any documentation process code, etc.) they should 
+do a local build with the new CMake option -DOCIO_BUILD_FROZEN_DOCS=ON, and add the 
+modified rST files under docs/api/python/frozen to their PR.
+
+Note: If you run the scripts on Linux, the freezing process should work well.  On other 
+platforms, the process may sometimes make spurious deltas to rST files unrelated to your 
+changes.  Please don't add these files to your PR.
+
+The OCIO conf.py module has a switch that detects when docs are being built on GH Actions 
+(CI env var == true) it will backup the frozen folder to a sibling backup folder on Sphinx 
+init, and following Sphinx build completion will do a file-by-file comparison of the new 
+frozen and the backup folders. If there are differences, the CI job may fail with an error 
+explaining where the differences were found and with instructions on how to fix them.
+
+The conf.py also has a switch that detects when it is being run on RTD, and in that case 
+will itself run Doxygen to generate the XML needed by breathe prior to building the docs, 
+and will also facilitate a CMake configure_file-like process (via Python) to handle 
+substitutions in headers and docs source files that CMake would usually handle, but can't 
+in this case. One potential plus to all of this is that if someone wants to just build 
+OCIO docs, they can technically do so by running sphinx-build in the docs directory, and 
+nothing more. Right now that only works when the READTHEDOCS env var == True, but it could 
+be easily exposed another way if needed.
+
+These features required several custom Sphinx extensions tuned for our project which are
+located under share/docs.
+
+Building the docs -- Excluding the API docs
+*******************************************
+
+If you don't need to build the API documentation, there is a quick and dirty way to 
+do a docs build.  This approach does not need to compile the C++ code but is not ideal
+since it modifies files in the source directory rather than the build directory:
+
+    export READTHEDOCS=True
+    cd docs  (in the source directory)
+    mkdir _build
+    sphinx-build -b html . _build
+    <your web browser name> _build/index.html
 
 Basics
 ******
@@ -79,6 +143,30 @@ Basics
 
       In order to bake a LUT, ...
 
+Quirks
+******
+
+The vuepress theme that we've migrated to has some quirks to its design. For
+example, it only allows two nested table of contents (TOC). So things have to be
+organized in a slightly different way than other sphinx projects.
+
+The root-level `toc_redirect.rst` points to where to find the different section
+TOCs. The name and contents of each sections TOC is defined in that
+sub-directory's `_index.rst` file.
+
+In this TOC the `:caption:` directive determines what the name of the section
+will be in the sidebar, and in the header of the website. The *H1* header
+determines the name of the page in the right/left arrows navigation bar. In a
+lot of cases this ends up doubling up the name on the page, but this seems
+unavoidable at the present time. If additional explanatory text is put in the
+`_index.rst` files then it shouldn't be as problematic.
+
+The site will show all *H1* headers in the side panel by default, these then
+expand when selected to show all *H2* headers.
+
+Due to the limited TOC and sidebar depth, we shouldn't be afraid of looong
+pages with many *H2* headings to break down the page into logical quadrants.
+
 Emacs rST mode
 **************
 
diff --git a/docs/guides/contributing/repository_structure.rst b/docs/guides/contributing/repository_structure.rst
index 372dafe7ed..fb04e99543 100644
--- a/docs/guides/contributing/repository_structure.rst
+++ b/docs/guides/contributing/repository_structure.rst
@@ -45,32 +45,32 @@ continuous integration (CI) workflows for all supported platforms. Each .yml
 file defines one GHA workflow, which can contain one or more jobs and the 
 repository events that trigger the workflow.
 
+ASWF
+****
+
+This subdirectory contains important ASWF governance documents like the 
+OpenColorIO charter, CLAs and DCO reference.
+
+The ``meetings`` subdirectory contains historical meeting minutes for 
+TSC (Technical Steering Committee) and working group meetings.  More 
+recent TSC and working group meeting notes are stored on the 
+`ASWF Wiki. <https://wiki.aswf.io/display/OCIO/Meetings>`_
+
 docs
 ****
 
 This directory contains the OpenColorIO documentation source and build 
 configuration. 
 
-The ``guides/contributing/tsc`` subdirectory contains meeting minutes for all 
-recorded TSC (Technical Steering Committee) and working group meetings. The 
-``guides/contributing/aswf`` subdirectory contains important ASWF governance 
-documents like the OpenColorIO charter, CLAs and DCO reference.
-
 See :ref:`documentation-guidelines` for information on contributing to OCIO
 documentation.
 
 ext
 ***
 
-This directory serves two purposes:
-
-1. House modified external dependencies which are permissible to store within 
-   the OpenColorIO repository. Permissible dependencies have compatible FOSS
-   licenses which have documented approval by the ASWF governing board.
-
-2. Location for external dependencies temporarily installed and statically 
-   linked by OpenColorIO when using the CMake ``OCIO_INSTALL_EXT_PACKAGES`` 
-   option with the value ``MISSING`` or ``ALL``.
+Houses modified external dependencies which are permissible to store within 
+the OpenColorIO repository. Permissible dependencies have compatible FOSS
+licenses which have documented approval by the ASWF governing board.
 
 include
 *******
diff --git a/docs/guides/contributing/submitting_changes.rst b/docs/guides/contributing/submitting_changes.rst
index 261b2409cf..a818742f1b 100644
--- a/docs/guides/contributing/submitting_changes.rst
+++ b/docs/guides/contributing/submitting_changes.rst
@@ -13,8 +13,8 @@ Code Review
 Ask early, and ask often!
 
 All new contributors are highly encouraged to post development ideas, questions,
-or any other thoughts to the :ref:`mailing_lists` before starting to code. This
-greatly improves the process and quality of the resulting library. Code
+or any other thoughts to :ref:`slack` or the :ref:`mailing_lists` before starting to
+code. This greatly improves the process and quality of the resulting library. Code
 reviews (particularly for non-trivial changes) are typically far simpler if the
 reviewers are aware of a development task beforehand. (And, who knows? Maybe they
 will have implementation suggestions as well!)
@@ -73,6 +73,12 @@ assistance.
     git add .
     git commit -s -m 'Implement my feature'
 
+* If your PR changes anything that appears in the public API documentation,
+  you should follow the directions below in "Updating the Python docs".
+  If this proves problematic for you, please reach out for help on Slack.
+  One of the other developers will probably be able to make the updates
+  for you in another PR.
+
 * Push your changes back to origin (your fork)::
 
     git push -u origin myFeature
diff --git a/docs/guides/using_ocio/tool_overview.rst b/docs/guides/using_ocio/tool_overview.rst
index 87ad7d3974..6e5be4c81e 100644
--- a/docs/guides/using_ocio/tool_overview.rst
+++ b/docs/guides/using_ocio/tool_overview.rst
@@ -24,6 +24,26 @@ Note that some tools depend on OpenEXR or OpenImageIO and other libraries:
 .. TODO: check app lib dependencies
 .. TODO: make a pretty table in RST.
 
+.. _overview-ocioarchive:
+
+ocioarchive
+***********
+
+This command-line tool allows you to convert a config and its external LUT files
+into an OCIOZ archive file.  A .ocioz file may be supplied to any command that
+takes the path to a config or set as the OCIO environment variable.
+
+This example creates a file called myarchive.ocioz::
+
+    $ ocioarchive myarchive --iconfig myconfig/config.ocio
+
+This command will expand it back out::
+
+    $ ocioarchive --extract myarchive.ocioz
+
+The --list option may be used to see the contents of a .ocioz file.
+
+
 .. _overview-ociocheck:
 
 ociocheck
@@ -33,7 +53,7 @@ This is a command-line tool which shows an overview of an OCIO config
 file, and checks for obvious errors.
 
 For example, the following shows the output of a config with a typo -
-the colorspace used for ``compositing_log`` is not incorrect::
+the colorspace used for ``compositing_log`` is incorrect::
 
     $ ociocheck --iconfig example.ocio
 
@@ -189,6 +209,17 @@ not the case.  Also, it does not leverage any of the new OCIO v2 features.
 .. TODO: Link to discussion of OpenImageIO source?
 
 
+.. _overview-pyociodisplay:
+
+pyociodisplay
+*************
+
+The pyociodisplay tool is a minimal image viewer implementation demonstrating use of 
+the OCIO GPU renderer in a Python application.  It requires downloading a few dependencies
+before use.  For more information, please see the 
+`README. <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/src/apps/pyociodisplay>`_ 
+
+
 .. _overview-ociolutimage:
 
 ociolutimage
@@ -240,6 +271,7 @@ ACES2065-1::
 The --list argument will print out all of the standard ACES color spaces that are 
 supported as --csc arguments.
 
+
 .. _overview-ocioperf:
 
 ocioperf
@@ -272,6 +304,7 @@ Examples::
 
 .. TODO: examples formatting
 
+
 .. _overview-ociowrite:
 
 ociowrite
@@ -298,3 +331,16 @@ Here is an example::
 
     $ export OCIO=/path/to/the/config.ocio
     $ ociowrite --colorspaces acescct aces2065-1 --file mytransform.ctf
+
+
+.. _overview-pyocioamf:
+
+pyocioamf
+*********
+
+The pyocioamf tool is an initial attempt to support the ACES Metadata File (AMF)
+`format. <https://docs.acescentral.com/guides/amf/>`_
+This Python script will take an AMF file and produce an OCIO CTF file that implements its color
+pipeline.  The CTF file may be applied to images using tools such as :ref:`overview-ocioconvert`.
+For more information, please see the 
+`README. <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/src/apps/pyocioamf>`_ 
diff --git a/docs/index.rst b/docs/index.rst
index e620b9f37b..1241802ea1 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -20,31 +20,30 @@ OCIO v1 represented the culmination of years of production experience earned on
 films as SpiderMan 2 (2004), Surf's Up (2007), Cloudy with a Chance of Meatballs
 (2009), Alice in Wonderland (2010), and many more.  
 
-OpenColorIO v2 has been in development since 2017 and was feature complete as of
+OpenColorIO v2 was in development from 2017 to 2020 and was feature complete as of
 SIGGRAPH 2020.  After a stabilization and bug-fixing period, an official 2.0.0 
-release was made in January 2021.  OCIO v2 is in the VFX Reference Platform for
-calendar year 2021.  The OCIO v2 code is in the RB-2.0 branch on 
-`GitHub <https://github.com/AcademySoftwareFoundation/OpenColorIO>`_
-and we encourage developers to start integrating and providing feedback.
-See :ref:`upgrading_to_v2` for more.
+release was made in January 2021.  Development has continued to be active since
+then with the 2.1 release in 2021 and the 2.2 release in 2022 adding even more
+capabilities.  Here is more information about our :ref:`upgrading_to_v2`.
 
 
 About the Documentation
 =======================
 
+The documentation has been updated to include basic coverage of the new OCIO v2 features
+but additional work is needed to provide more detail and tutorials about how to
+best leverage them.  We have an OCIO User Experience (UX) working group that is
+gradually working on more coverage.
+
 The documentation is a work-in-progress and we would love to have your help to 
-improve it!  An easy way to get involved is to join the #docs channel on 
+improve it!  An easy way to get involved is to join the #docs or #ux channel on 
 :ref:`slack`.
 
-The documentation has been updated to include most of the new OCIO v2 features
-but additional work is needed to provide more detail and tutorials about how to
-best leverage them.  We will be working on that over the coming weeks.
-
 Accessing Other Versions
 ************************
 
-You are reading the documentation for OCIO v2.  The documentation for the the previous
-stable release (1.1.1) is available `here <https://opencolorio.org/old/index.html>`_.
+You are reading the documentation for OCIO v2.  The documentation for the earlier
+1.x series of releases is available `here <https://opencolorio.org/old/index.html>`_.
 
 
 Community
diff --git a/docs/quick_start/downloads.rst b/docs/quick_start/downloads.rst
index a2b71cf601..d39f449c57 100644
--- a/docs/quick_start/downloads.rst
+++ b/docs/quick_start/downloads.rst
@@ -7,44 +7,13 @@
 Downloads
 =========
 
-* Sample OCIO Configurations -- `.zip <https://github.com/imageworks/OpenColorIO-Configs/zipball/master>`__ `.tar.gz <https://github.com/imageworks/OpenColorIO-Configs/tarball/master>`__
+* `OCIO v2 ACES Configurations <https://github.com/AcademySoftwareFoundation/OpenColorIO-Config-ACES/releases>`_
+* OCIO v1 Legacy Configurations -- `.zip <https://github.com/imageworks/OpenColorIO-Configs/zipball/master>`__ `.tar.gz <https://github.com/imageworks/OpenColorIO-Configs/tarball/master>`__
 * Reference Images v1.0v4 -- `.tgz <https://code.google.com/p/opencolorio/downloads/detail?name=ocio-images.1.0v4.tgz>`__
-* Core Library v2.0.0 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/archive/v2.0.0.zip>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/archive/v2.0.0.tar.gz>`__
-* Core Library dev -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/main>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/main>`__
-
-Per-version updates: :ref:`changelog-main`.
-
-Build instructions: :ref:`building-from-source`.
+* `OCIO Library source code <https://github.com/AcademySoftwareFoundation/OpenColorIO/releases>`_
 
 .. _contributor-license-agreements:
 
 Contributor License Agreements
 ******************************
-Please see the `OCIO GitHub repository <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/docs/aswf>`__
-
-Deprecated Downloads
-********************
-* Reference Images v1.0v3 `tgz <https://code.google.com/p/opencolorio/downloads/detail?name=ocio-images.1.0v3.tgz>`__
-* Reference Images v1.0v2 `tgz <https://code.google.com/p/opencolorio/downloads/detail?name=ocio-images.1.0v2.tgz>`__
-* Reference Images v1.0v1 `tgz <https://code.google.com/p/opencolorio/downloads/detail?name=ocio-images.1.0v1.tgz>`__
-
-* Core Library v1.1.1 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/archive/v1.1.1.zip>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/archive/v1.1.1.tar.gz>`__
-* Core Library v1.1.0 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v1.1.0>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v1.1.0>`__
-* Core Library v1.0.9 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v1.0.9>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v1.0.9>`__
-* Core Library v1.0.8 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v1.0.8>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v1.0.8>`__
-* Core Library v1.0.7 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v1.0.7>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v1.0.7>`__
-* Core Library v1.0.6 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v1.0.6>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v1.0.6>`__
-* Core Library v1.0.5 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v1.0.5>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v1.0.5>`__
-* Core Library v1.0.4 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v1.0.4>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v1.0.4>`__
-* Core Library v1.0.3 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v1.0.3>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v1.0.3>`__
-* Core Library v1.0.2 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v1.0.2>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v1.0.2>`__
-* Core Library v1.0.1 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v1.0.1>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v1.0.1>`__
-* Core Library v1.0.0 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v1.0.0>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v1.0.0>`__
-
-* Color Config v0.7v4 `tgz <https://code.google.com/p/opencolorio/downloads/detail?name=ocio-configs.0.7v4.tgz>`__ (OCIO v0.7.6+)
-* Core Library v0.8.7 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v0.8.7>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v0.8.7>`__
-* Core Library v0.7.9 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v0.7.9>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v0.7.9>`__
-* Core Library v0.6.1 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v0.6.1>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v0.6.1>`__
-* Core Library v0.5.16 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v0.5.16>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v0.5.16>`__
-* Core Library v0.5.8 -- `.zip <https://github.com/AcademySoftwareFoundation/OpenColorIO/zipball/v0.5.8>`__ `.tar.gz <https://github.com/AcademySoftwareFoundation/OpenColorIO/tarball/v0.5.8>`__
-
+Please see the `OCIO GitHub repository <https://github.com/AcademySoftwareFoundation/OpenColorIO/blob/main/docs/aswf/cla_dco.rst>`__
diff --git a/docs/quick_start/for_artists.rst b/docs/quick_start/for_artists.rst
index 7716e4eb63..d3ee1613aa 100644
--- a/docs/quick_start/for_artists.rst
+++ b/docs/quick_start/for_artists.rst
@@ -14,7 +14,12 @@ each application.
 Note that OCIO configurations are required to do any 'real' work, and are
 available separately on the :ref:`downloads` section of this site. Example
 images are also available. For assistance customizing .ocio configurations,
-contact `ocio-user <https://lists.aswf.io/g/ocio-user>`__\.
+contact `ocio-user <https://lists.aswf.io/g/ocio-user>`_ or the #configs
+channel on :ref:`slack`
+
+If your application supports OCIO 2.2 or later, you may take advantage of the
+configurations built into OCIO itself.  For example, by setting the config path to:
+``ocio://default``.
 
 - Step 1:  set the ``$OCIO`` environment-variable to ``/path/to/your/config.ocio``
 - Step 2:  Launch supported application.
@@ -23,6 +28,6 @@ If you are on a platform that is not envvar friendly, most applications also
 provide a menu option to select a different OCIO configuration after launch.
 
 Please be sure to select a configuration that matches your color workflow (VFX 
-work typically requires a different profile than animated features). If you need
+work sometimes requires a different profile than animated features). If you need
 assistance picking a profile, email 
 `ocio-user <https://lists.aswf.io/g/ocio-user>`__\.
diff --git a/docs/quick_start/for_config_authors.rst b/docs/quick_start/for_config_authors.rst
index 87348ac2bb..45fa3cc722 100644
--- a/docs/quick_start/for_config_authors.rst
+++ b/docs/quick_start/for_config_authors.rst
@@ -7,11 +7,10 @@
 Quick Start for Config Authors
 ==============================
 
-Get started by following the :ref:`installation` instructions.  Note that if
-you want to try out the new OpenColorIO v2 features, you'll need to build
-from source.  You will want to make sure to build the command-line tools as
-well as the library itself, so you should install OpenImageIO or OpenEXR before
-building (OCIO is now able to build the latter itself).
+As a config author, you'll want access to the OCIO command-line tools such as 
+``ociocheck``.  Please see the :ref:`installation` instructions for the various
+options, for example Homebrew on macOS and Vcpkg on Windows.  On Linux you may
+need to build from source.
 
 Grab the available configuration files (and the sample images, if you want) from
 :ref:`downloads` so you'll have some examples to study.
diff --git a/docs/quick_start/for_contributors.rst b/docs/quick_start/for_contributors.rst
index d123150228..3b33442d1c 100644
--- a/docs/quick_start/for_contributors.rst
+++ b/docs/quick_start/for_contributors.rst
@@ -15,9 +15,10 @@ testing, example configs, and documentation as well as actual coding.
   issues with the ``good first issue`` label.
 
 * If you want to help develop config files, start by looking at :ref:`quick_start_config_authors`
-  and join the #configs channel on :ref:`slack`.
+  and join the #configs channel on :ref:`slack`.  From there, you may want to join the
+  OCIO Configs working group.
 
-* If you want to help with documentation, join the #docs channel on :ref:`slack`.
+* If you want to help with documentation, join the #docs and #ux channel on :ref:`slack`.
 
 * If you want to help test, join the #general channel on :ref:`slack` and
   let us know you want to help test.
diff --git a/docs/quick_start/for_devs.rst b/docs/quick_start/for_devs.rst
index fb98472ed2..358a5ac2d5 100644
--- a/docs/quick_start/for_devs.rst
+++ b/docs/quick_start/for_devs.rst
@@ -8,10 +8,12 @@ Quick Start for Developers
 ==========================
 
 Get started by following the :ref:`installation` instructions to build OCIO from
-source.  You will want to make sure to build the command-line tools as well as the
-library itself, so you should install OpenImageIO or OpenEXR before building
-(OCIO is now able to build the latter itself).  Note that active development of
-OCIO v2 is happening in the main branch.
+source.  In addition to the library itself, you will want to make sure to build the 
+command-line tools, including ``ocioconvert``.
+
+Note that active development of OCIO v2 is happening in the main branch.
+
+Watch the OCIO SIGGRAPH course listed in :ref:`publications`.
 
 Grab the available configuration files (and the sample images, if you want) from
 :ref:`downloads`.
@@ -22,12 +24,16 @@ the :ref:`overview-ocioconvert` tool to process an image from one color space to
 Try using :ref:`overview-ociodisplay` to view an image through a color space conversion.
 
 Take a look at the example code in the :ref:`developers-usageexamples` and then study
-the code for the command-line tools, which is under src/apps.  
+the code for the command-line tools, which is under ``src/apps``.  
 
-There are helper classes under src/libutils/apphelpers for common application tasks
+There are helper classes under ``src/libutils/apphelpers`` for common application tasks
 such as generating color space menus, building a viewing pipeline for a viewport,
 and writing a color picker for scene-linear color spaces.
 
 Look through the :ref:`concepts_overview` to get a sense of the big picture.
 
 Check out :ref:`upgrading_to_v2` for an overview of the new features in OCIO v2.
+
+Join the #dev and #ux channel on :ref:`slack`.  From the #ux channel, you may want to
+get involved with the OCIO User Experience (UX) working group which discusses best
+practices for implementing OCIO support in applications.
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index d265f5f0e3..600fa89e12 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -5,30 +5,45 @@
 .. _installation:
 
 Installation
-************
+============
+
+Installation may be done either by installing pre-built binaries from various package 
+managers or by building from source.  (The OCIO project currently does not provide
+pre-built libraries as downloadable artifacts other than through package managers.)
+
+Please note that the version available through a given package manager may be significantly 
+outdated when compared to the current official OCIO release, so please verify the version 
+you install is current.  (Unfortunately, even two years after the introduction of 
+OpenColorIO v2, several package managers continue to install OCIO 1.1.1.)
 
-Although OpenColorIO is available through a variety of package managers, please note that the 
-version available through a given package manager may be significantly outdated when compared to 
-the current official OCIO release. Even two years after the introduction of OpenColorIO v2, several 
-package managers continue to install OCIO 1.1.1.
+Alternatives
+************
 
 .. _Python:
 
 Python
-^^^^^^
+++++++
+
+If you only need the Python binding, the simplest solution is to take advantage of the pre-built 
+wheels in the Python Package Index (PyPI) `here <https://pypi.org/project/opencolorio>`_. It 
+can be installed as follows, once you have Python installed.
 
-If you only need the Python bindings, the simplest solution is to take advantage of the pre-built 
-wheels in the Python Package Installer (PyPi) `here <https://pypi.org/project/opencolorio>`__. It 
-can be installed by using this command:: 
+**PyPI**::
 
     pip install opencolorio
 
+The pre-built wheels are listed `here <https://pypi.org/project/opencolorio/#files>`_.  Note that
+source code is provided, so it may be possible for ``pip`` to compile the binding on your machine if
+the matrix of Python version and platform version does not have the combination you need.  More
+detailed instructions are available for how to use 
+`pip. <https://packaging.python.org/en/latest/tutorials/installing-packages/>`_
+
 OpenImageIO
-^^^^^^^^^^^
++++++++++++
 
-If you only need to apply color conversions to images, please note that OpenImageIO's oiiotool has 
-most of the functionality of the ocioconvert command-line tool (although not everything, such as 
-GPU processing). OpenImageIO is available via several package managers (including brew and vcpkg).
+If you only need to apply color conversions to images, please note that OpenImageIO's ``oiiotool`` has 
+most of the functionality of the ``ocioconvert`` command-line tool (although not everything, such as 
+GPU processing). OpenImageIO is available via several package managers (including Homebrew and Vcpkg).
 
 **Homebrew**::
 
@@ -38,16 +53,17 @@ GPU processing). OpenImageIO is available via several package managers (includin
 
     vcpkg install openimageio[opencolorio,tools]:x64-windows --recurse
 
-Installing OpenColorIO using existing packages
-**********************************************
+
+Installing OpenColorIO using Package Managers
+=============================================
 
 Linux
-^^^^^
+*****
 
-When it comes to Linux distributions, relatively few of the Linux distribution repositories have been 
-updated to OCIO v2. The **latest Fedora** is one good option as it offers the most 
-recent release of OpenColorIO v2. Information about the package can be found on 
-`fedoraproject website <https://packages.fedoraproject.org/pkgs/OpenColorIO/OpenColorIO/index.html>`__.
+When it comes to Linux distributions, relatively few of the Linux distribution repositories 
+have been updated to OCIO v2. The **latest Fedora** is one good option as it offers a 
+recent release of OpenColorIO v2. Information about the package can be found on the
+`Fedora project website <https://packages.fedoraproject.org/pkgs/OpenColorIO/OpenColorIO/index.html>`__.
 
 For the other distributions, information about which release of OpenColorIO is available can be 
 verified on `pkgs.org <https://pkgs.org/search/?q=OpenColorIO>`__.
@@ -55,72 +71,88 @@ verified on `pkgs.org <https://pkgs.org/search/?q=OpenColorIO>`__.
 **The recommendation is to build OpenColorIO from source**. You may build from source using the 
 instructions below. See :ref:`building-from-source`.
 
-Windows 7 or newer using vcpkg
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Windows using Vcpkg
+*******************
 
-Vcpkg can be used to install OpenColorIO on Windows. In order to do that, Vcpkg must be installed 
+Vcpkg can be used to install OpenColorIO on Windows 7 or higher. To do that, Vcpkg must be installed 
 by following the `official instructions <https://vcpkg.io/en/getting-started.html>`__. Once Vcpkg 
 is installed, OpenColorIO and some of the tools can be installed with the following command::
 
     vcpkg install opencolorio[tools]:x64-windows
 
-Note that this package **does not** install ocioconvert, ociodisplay, ociolutimage and the Python 
-bindings.
+Note that this package **does not** install ``ocioconvert``, ``ociodisplay``, ``ociolutimage``, 
+and the Python binding.
 
-The three missing tools can be built from source by following the steps in the :ref:`Windows 7 or newer` 
-section while the Python bindings can be install using the pip command in the :ref:`Python` section.
+If you need the extra command-line tools, you'll need to install :ref:`from source. <Windows>` 
+However, the Python binding can be installed as described in the :ref:`Python` section.
 
-OS X using Homebrew
-^^^^^^^^^^^^^^^^^^^
+MacOS using Homebrew
+*******************
 
-You can use the Homebrew package manager to install OpenColorIO on OS X.
+You can use the Homebrew package manager to install OpenColorIO on macOS.
 
 First install Homebrew as per the instructions on the `Homebrew
-homepage <http://mxcl.github.com/homebrew/>`__ (or see the `Homebrew wiki
-<https://github.com/mxcl/homebrew/wiki/Installation>`__ for more
-detailed instructions)
+homepage <https://brew.sh/>`__ (or see the `Homebrew documentation
+<https://docs.brew.sh/>`__ for more detailed instructions).
 
 Then simply run the following command to install::
 
     brew install opencolorio
 
-Homebrew does not install the Python binding or the command-line tools that depend on OpenImageIO 
-such as ocioconvert, ociodisplay and ociolutimage.
+Homebrew does not install the Python binding or the command-line tools that depend on either
+OpenImageIO or OpenEXR such as ``ocioconvert``, ``ociodisplay``, and ``ociolutimage``.
 
 .. _building-from-source:
 
-Building from source
-********************
+Building from Source
+====================
+
+The basic requirements to build OCIO from source on Linux, macOS, and Windows are a
+supported C++ compiler, CMake, and an internet connection.  The OCIO Cmake scripts are
+able to download, build, and install all other required components.  However, more 
+advanced users are also able to have the build use their own version of dependencies.
 
 Dependencies
-^^^^^^^^^^^^
+************
 
-The basic requirements for building OCIO are the following.  Note that, by
-default, cmake will try to install all of the items labelled with * and so
-it is not necessary to install those items manually:
+OCIO depends on the following components.  By default, the OCIO CMake scripts will 
+download and build the items labelled with a \*, so it is not necessary to install those 
+items manually:
 
-- cmake >= 3.12
+Required components:
+
+- C++ 11-17 compiler (gcc, clang, msvc)
+- CMake >= 3.13
 - \*Expat >= 2.4.1 (XML parser for CDL/CLF/CTF)
 - \*yaml-cpp >= 0.7.0 (YAML parser for Configs)
 - \*Imath >= 3.0 (for half domain LUTs)
 - \*pystring >= 1.1.3
+- \*minizip-ng >= 3.0.7 (for config archiving)
+- \*ZLIB >= 1.2.13 (for config archiving)
 
-Some optional components also depend on:
+Optional OCIO functionality also depends on:
 
 - \*Little CMS >= 2.2 (for ociobakelut ICC profile baking)
-- \*pybind11 >= 2.9.2 (for the Python bindings)
-- Python >= 2.7 (for the Python bindings)
-- Python 3.7 or 3.8 (for the docs, with the following PyPi packages)
-    - Sphinx
-    - six
-    - testresources
-    - recommonmark
-    - sphinx-press-theme
-    - sphinx-tabs
-    - breathe
-- NumPy (for complete Python test suite)
+- \*OpenGL GLUT & GLEW (for ociodisplay)
+- \*OpenEXR >= 3.0 (for apps including ocioconvert)
+- OpenImageIO >= 2.1.9 (for apps including ocioconvert)
+- \*OpenFX >= 1.4 (for the OpenFX plug-ins)
+- OpenShadingLanguage >= 1.11 (for the OSL unit tests)
 - Doxygen (for the docs)
-- OpenImageIO >= 2.1.9 or OpenEXR >= 3.0 (for apps including ocioconvert)
+- NumPy (optionally used in the Python test suite)
+- \*pybind11 >= 2.9.2 (for the Python binding)
+- Python >= 2.7 (for the Python binding only)
+- Python 3.7 - 3.9 (for building the documentation)
+
+Building the documentation requires the following packages, available via PyPI:
+
+- Sphinx
+- six
+- testresources
+- recommonmark
+- sphinx-press-theme
+- sphinx-tabs
+- breathe
 
 Example bash scripts are provided in 
 `share/ci/scripts <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/share/ci/scripts>`_ 
@@ -132,14 +164,14 @@ available for all supported platforms. Use GitBash
 this script on Windows.
 
 Automated Installation
-----------------------
+++++++++++++++++++++++
 
-Listed dependencies with a preceeding * can be automatically installed at 
-build time using the ``OCIO_INSTALL_EXT_PACKAGES`` option in your cmake 
-command (requires an internet connection).  This is the default.  C/C++ 
-libraries are pulled from external repositories, built, and statically-linked 
-into libOpenColorIO. Python packages are installed with ``pip``. All installs 
-of these components are fully contained within your build directory.
+Dependencies listed above with a preceeding * can be automatically installed at 
+build time using the ``OCIO_INSTALL_EXT_PACKAGES`` option in your ``cmake`` 
+command (requires an internet connection).  The C/C++ libraries are pulled from 
+external repositories, built, and are (typically) statically-linked into an OCIO
+dynamic library.  All installs of these components are fully contained within your 
+build directory.
 
 Three ``OCIO_INSTALL_EXT_PACKAGES`` options are available::
 
@@ -153,37 +185,130 @@ Three ``OCIO_INSTALL_EXT_PACKAGES`` options are available::
   current system.
 
 Existing Install Hints
-----------------------
-
-When using existing system libraries, the following CMake variables can be 
-defined to hint at non-standard install locations and preference of shared
-or static linking:
-
-- ``-Dexpat_ROOT=<path>`` (include and/or library root dir)
-- ``-Dexpat_STATIC_LIBRARY=ON`` (prefer static lib)
-- ``-Dyaml-cpp_ROOT=<path>`` (include and/or library root dir)
-- ``-Dyaml-cpp_STATIC_LIBRARY=ON`` (prefer static lib)
-- ``-DImath_ROOT=<path>`` (include and/or library root dir)
-- ``-DImath_STATIC_LIBRARY=ON`` (prefer static lib)
-- ``-DHalf_ROOT=<path>`` (include and/or library root dir)
-- ``-DHalf_STATIC_LIBRARY=ON`` (prefer static lib)
-- ``-Dpystring_ROOT=<path>`` (include and/or library root dir)
-- ``-Dpystring_STATIC_LIBRARY=ON`` (prefer static lib)
-- ``-Dlcms2_ROOT=<path>`` (include and/or library root dir)
-- ``-Dlcms2_STATIC_LIBRARY=ON`` (prefer static lib)
-- ``-Dpybind11_ROOT=<path>`` (include and/or library root dir)
-- ``-DPython_EXECUTABLE=<path>`` (Python executable)
-
-To hint at Python package locations, add paths to the ``PYTHONPATH`` 
-environment variable prior to configuring the build.
+++++++++++++++++++++++
+
+When using libraries already on your system, the CMake variable 
+``-D <Package Name>_ROOT=<Path>`` may be used to specify the path to the include and 
+library root directory rather than have CMake try to find it.  The package names used 
+by OCIO are as follows (note that these are case-sensitive):
+
+Required:
+
+- ``expat``
+- ``yaml-cpp``
+- ``Imath``
+- ``pystring``
+- ``ZLIB``
+- ``minizip-ng``
+
+Optional:
+
+- ``OpenEXR``
+- ``OpenImageIO``
+- ``lcms2``
+- ``pybind11``
+- ``openfx``
+- ``OSL``
+- ``Sphinx``
+- ``GLEW``
+- ``GLUT``
+- ``Python``
+
+There are scenarios in which some of the dependencies may not be compiled into an 
+OCIO dynamic library.  This is more likely when OCIO does not download the packages
+itself.  In these cases, it may be helpful to additionally specify the CMake variable
+``-D <Package Name>_STATIC_LIBRARY=ON``. The following package names support this hint:
+``expat``, ``yaml-cpp``, ``Imath``, ``lcms2``, ``ZLIB``, and ``minizip-ng``.
+
+Rather than using ``_ROOT``, and possibly ``_STATIC_LIBRARY``, you may instead use
+``-D <Package Name>_LIBRARY=<Path>`` and ``-D <Package Name>_INCLUDE_DIR=<Path>``.
+In this case, the library path will control whether a static or dynamic library is used.
+It may also be used to handle situations where the library and/or include files are not
+in the typical location relative to the root directory.
+
+The OCIO `CMake find modules <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/share/cmake/modules>`_ 
+may be consulted for more detail on the handling of a given package and the CMake
+variables it uses.
+
+Please note that if you provide your own ``minizip-ng``, rather than having OCIO's CMake
+download and build it, you will likely need to set its CMake variables the same way
+that OCIO does (e.g., enable ZLib and turn off most other options).  Using a ``minizip-ng``
+from various package managers (e.g., Homebrew) probably won't work.  Please see the
+settings that begin with ``-DMZ_`` that are used in the OCIO 
+`minizip-ng find module. <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/share/cmake/modules/Findminizip-ng.cmake>`_ 
+
+Please note that if you build a static OCIO library, it will not contain the libraries 
+for the external packages and so you will need to list those separately when linking your
+client application.  If you had OCIO download and build these packages, you will
+find them under your build directory in ``ext/dist/lib``.  The libraries that are
+needed are: ``expat``, ``yaml-cpp``, ``Imath``, ``pystring``, ``ZLIB``, and ``minizip-ng``.
+
+The OCIO ``make install`` step will install CMake configuration files that may be used
+by applications that consume OCIO to find and utilize OCIO during their own build process.
+
+For the Python packages required for the documentation, ensure that their locations
+are in your ``PYTHONPATH`` environment variable prior to configuring the build.
+
+This custom variable is also available:
+
+- ``-DPython_EXECUTABLE=<path>`` (Python executable for pybind11)
+
+
+.. _enabling-optional-components:
+
+Enabling optional components
+****************************
+
+CMake Options
++++++++++++++
+
+There are many options available in `CMake. 
+<https://cmake.org/cmake/help/latest/guide/user-interaction/index.html#guide:User%20Interaction%20Guide>`_
+
+Several of the most common ones are:
+
+- ``-DCMAKE_BUILD_TYPE=Release`` (Set to Debug, if necessary)
+- ``-DBUILD_SHARED_LIBS=ON`` (Set to OFF to build OCIO as a static library)
+
+Here are the most common OCIO-specific CMake options (the default values are shown):
+
+- ``-DOCIO_BUILD_APPS=ON`` (Set to OFF to not build command-line tools)
+- ``-DOCIO_USE_OIIO_FOR_APPS=OFF`` (Set ON to build tools with OpenImageIO rather than OpenEXR)
+- ``-DOCIO_BUILD_PYTHON=ON`` (Set to OFF to not build the Python binding)
+- ``-DOCIO_BUILD_OPENFX=OFF`` (Set to ON to build the OpenFX plug-ins)
+- ``-DOCIO_USE_SSE=ON`` (Set to OFF to turn off SSE CPU performance optimizations)
+- ``-DOCIO_BUILD_TESTS=ON`` (Set to OFF to not build the unit tests)
+- ``-DOCIO_BUILD_GPU_TESTS=ON`` (Set to OFF to not build the GPU unit tests)
+- ``-DOCIO_USE_HEADLESS=OFF`` (Set to ON to do headless GPU reendering)
+- ``-DOCIO_WARNING_AS_ERROR=ON`` (Set to OFF to turn off warnings as errors)
+- ``-DOCIO_BUILD_DOCS=OFF`` (Set to ON to build the documentation)
+- ``-DOCIO_BUILD_FROZEN_DOCS=OFF`` (Set to ON to update the Python documentation)
+
+Several command-line tools (such as ``ocioconvert``) require reading or writing image files.
+If ``OCIO_USE_OIIO_FOR_APPS=OFF``, these will be built using OpenEXR rather than OpenImageIO
+and therefore you will be limited to using OpenEXR files with these tools rather than the
+wider range of image file formats supported by OIIO.  (Using OpenEXR for these tools works
+around the issue of a circular dependency between OCIO and OIIO that can complicate some
+build chains.)
+
+The CMake output prints information regarding which image library will be used for the
+command-line tools (as well as a lot of other info about the build configuration).
+
+
+Documentation
++++++++++++++
+
+Instructions for installing the documentation pre-requisites and building the docs 
+are in the section on :ref:`contributing documentation. <documentation-guidelines>`
+
 
 .. _osx-and-linux:
 
-OS X and Linux
-^^^^^^^^^^^^^^
+MacOS and Linux
+***************
 
 While there is a huge range of possible setups, the following steps
-should work on OS X and most Linux distros. To keep things simple, this guide 
+should work on macOS and most Linux distros. To keep things simple, this guide 
 will use the following example paths - these will almost definitely be 
 different for you:
 
@@ -196,12 +321,12 @@ First make the build directory and cd to it::
     $ mkdir /tmp/ociobuild
     $ cd /tmp/ociobuild
 
-Next step is to run cmake, which looks for things such as the
+Next step is to run ``cmake``, which looks for things such as the
 compiler's required arguments, optional requirements like Python,
 OpenImageIO etc
 
 For this example we will show how to install OCIO to a custom location 
-(instead of the default ``/usr/local``), we will thus run cmake with
+(instead of the default ``/usr/local``), we will thus run ``cmake`` with
 ``CMAKE_INSTALL_PREFIX``.
 
 Still in ``/tmp/ociobuild``, run::
@@ -209,8 +334,8 @@ Still in ``/tmp/ociobuild``, run::
     $ cmake -DCMAKE_INSTALL_PREFIX=/software/ocio /source/ocio
 
 The last argument is the location of the OCIO source code (containing
-the main CMakeLists.txt file). You should see something along the
-lines of::
+the main CMakeLists.txt file). You should see it conclude with something
+along the lines of::
 
     -- Configuring done
     -- Generating done
@@ -231,25 +356,33 @@ the specified location::
     $ make install
 
 If nothing went wrong, ``/software/ocio`` should look something like
-this::
+this (on Linux or macOS)::
 
     $ cd /software/ocio
     $ ls
-    bin/     include/ lib/
+    bin/  include/  lib/  share/
     $ ls bin/
     ociobakelut ociocheck  (and others ...)
     $ ls include/
-    OpenColorIO/   PyOpenColorIO/ pkgconfig/
+    OpenColorIO/
     $ ls lib/
-    libOpenColorIO.a      libOpenColorIO.dylib
+    cmake/  libOpenColorIO.dylib  (and some more specific versions ...) 
+    libOpenColorIOimageioapphelpers.a  libOpenColorIOoglapphelpers.a
+    pkgconfig/  python<version>/
+    $ ls lib/pkgconfig
+    OpenColorIO.pc
+    $ ls lib/python<version>/site-packages
+    PyOpenColorIO.so
+    $ ls share/ocio
+    setup_ocio.sh
 
-.. _Windows 7 or newer:
+.. _Windows:
 
-Windows 7 or newer
-^^^^^^^^^^^^^^^^^^
+Windows
+*******
 
 While build environments may vary between users, the recommended way to build OCIO from source on 
-Windows is to use the scripts provided in the Windows 
+Windows 7 or newer is to use the scripts provided in the Windows 
 `share <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/share/dev/windows>`_ 
 section of the OCIO repository. There are two scripts currently available. 
 
@@ -270,7 +403,7 @@ Run this command to execute the ocio_deps.bat script::
 The second script is called 
 `ocio.bat <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/share/dev/windows/ocio.bat>`_ 
 and it provide a way to configure and build OCIO from source. Moreover, this script executes the 
-install step of cmake as well as the unit tests. The main use case is the following::
+install step of ``cmake`` as well as the unit tests. The main use case is the following::
 
     ocio.bat --b <path to build folder> --i <path to install folder> 
     --vcpkg <path to vcpkg installation> --ocio <path to ocio repository> --type Release
@@ -282,39 +415,14 @@ For more information, please look at each script's documentation::
 
     ocio_deps.bat --help
 
-.. _enabling-optional-components:
-
-Enabling optional components
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The OpenColorIO library is probably not all you want - the Python
-libraries bindings, the Nuke nodes and several applications are only
-built if their dependencies are found.
-
-In the case of the Python bindings, the dependencies are the Python
-headers for the version you wish to use. These may be picked up by
-default - if so, when you run cmake you would see::
-
-    -- Python 2.6 okay, will build the Python bindings against .../include/python2.6
-
-If not, you can point cmake to correct Python executable using the
-``-D PYTHON=...`` cmake flag::
-
-    $ cmake -D PYTHON=/my/custom/python2.6 /source/ocio
-
-The applications included with OCIO have various dependencies - to
-determine these, look at the CMake output when first run::
-
-    -- Not building ocioconvert. Requirement(s) found: OIIO:FALSE
-
-
 .. _quick-env-config:
 
 Quick environment configuration
-*******************************
+===============================
 
 The quickest way to set the required :ref:`environment-setup` is to
 source the ``share/ocio/setup_ocio.sh`` script installed with OCIO.
+On Windows, use the corresponding setup_ocio.bat file.
 
 For a simple single-user setup, add the following to ``~/.bashrc``
 (assuming you are using bash, and the example install directory of
@@ -336,7 +444,7 @@ configuration script etc), for example::
 .. _environment-setup:
 
 Environment variables
-*********************
+=====================
 
 Note: For other user facing environment variables, see :ref:`using_env_vars`.
 
@@ -345,7 +453,7 @@ Note: For other user facing environment variables, see :ref:`using_env_vars`.
    This variable needs to point to the global OCIO config file, e.g
    ``config.ocio``
 
-.. envvar:: DYLD_LIBRARY_PATH
+.. envvar:: DYLD_LIBRARY_PATH (macOS)
 
     The ``lib/`` folder (containing ``libOpenColorIO.dylib``) must be
     on the ``DYLD_LIBRARY_PATH`` search path, or you will get an error
@@ -356,11 +464,11 @@ Note: For other user facing environment variables, see :ref:`using_env_vars`.
         Reason: image not found
 
     This applies to anything that links against OCIO, including the
-    ``PyOpenColorIO`` Python bindings.
+    ``PyOpenColorIO`` Python binding.
 
-.. envvar:: LD_LIBRARY_PATH
+.. envvar:: LD_LIBRARY_PATH (Linux)
 
-    Equivalent to the ``DYLD_LIBRARY_PATH`` on Linux
+    The Linux equivalent of the macOS ``DYLD_LIBRARY_PATH``.
 
 .. envvar:: PYTHONPATH
 
diff --git a/docs/upgrading_v2/_index.rst b/docs/releases/_index.rst
similarity index 74%
rename from docs/upgrading_v2/_index.rst
rename to docs/releases/_index.rst
index cc45f6ef7a..9d61474e7f 100644
--- a/docs/upgrading_v2/_index.rst
+++ b/docs/releases/_index.rst
@@ -4,10 +4,12 @@
 
 .. _upgrading_to_v2:
 
-Upgrading to v2
-===============
+Releases
+========
 
 .. toctree::
    :caption: Upgrading to v2
 
-   how_to
\ No newline at end of file
+   ocio_2_2
+   ocio_2_1
+   ocio_2_0
diff --git a/docs/upgrading_v2/how_to.rst b/docs/releases/ocio_2_0.rst
similarity index 97%
rename from docs/upgrading_v2/how_to.rst
rename to docs/releases/ocio_2_0.rst
index 0a2ba7f6a0..2892605d03 100644
--- a/docs/upgrading_v2/how_to.rst
+++ b/docs/releases/ocio_2_0.rst
@@ -3,15 +3,15 @@
   Copyright Contributors to the OpenColorIO Project.
 
 
-Introduction
-============
+OCIO 2.0 Release
+================
 
 OpenColorIO v2 is a major update, three years in development, and contains a
 large number of new features.  This section describes the new features and 
 their relevance for both config authors and application developers.
 
 Timeline
-========
+********
 
 OpenColorIO v2 was proposed to the community at SIGGRAPH 2017 and reached 
 "feature complete" at SIGGRAPH 2020.  The official 2.0.0 release was made in 
@@ -19,7 +19,7 @@ January 2021 after a period of stabilization and refinement.  OCIO v2 is in
 the VFX Reference Platform for calendar year 2021.
 
 Demo Config
-===========
+***********
 There is a config file that illustrates many of the new OCIO v2 features
 described below.  This is available in the configs section of the documentation
 as :ref:`ocio-v2_demo`.
@@ -641,3 +641,19 @@ processed.
 
 **Developers**: This may be useful, for example, when displaying data color spaces on
 an HDR monitor.
+
+
+Release Notes
+=============
+
+For more detail, please see the GitHub release pages:
+
+`OCIO 2.0.0 <https://github.com/AcademySoftwareFoundation/OpenColorIO/releases/tag/v2.0.0>`_
+
+`OCIO 2.0.1 <https://github.com/AcademySoftwareFoundation/OpenColorIO/releases/tag/v2.0.1>`_
+
+`OCIO 2.0.2 <https://github.com/AcademySoftwareFoundation/OpenColorIO/releases/tag/v2.0.2>`_
+
+`OCIO 2.0.3 <https://github.com/AcademySoftwareFoundation/OpenColorIO/releases/tag/v2.0.3>`_
+
+`OCIO 2.0.4 <https://github.com/AcademySoftwareFoundation/OpenColorIO/releases/tag/v2.0.4>`_
diff --git a/docs/releases/ocio_2_1.rst b/docs/releases/ocio_2_1.rst
new file mode 100644
index 0000000000..5fcf6b4924
--- /dev/null
+++ b/docs/releases/ocio_2_1.rst
@@ -0,0 +1,173 @@
+..
+  SPDX-License-Identifier: CC-BY-4.0
+  Copyright Contributors to the OpenColorIO Project.
+
+
+OCIO 2.1 Release
+================
+
+Timeline
+********
+
+OpenColorIO 2.1 was delivered in August 2021 and is in the VFX Reference Platform for
+calendar year 2022.
+
+New Features
+============
+
+ACES 1.3 Gamut Compression
+**************************
+
+ACES 1.3 introduced a scene-referred gamut compression algorithm that compresses
+colors captured by a camera into the AP1 gamut.  This avoids artifacts that may
+occur downstream (such as in the ACES Output Transforms) for scene elements such 
+as colored lights.  This supersedes the earlier "Blue Light Artifact Fix" LMT.
+
+`ACES RGC User Guide 
+<https://www.dropbox.com/scl/fi/acuu6e626s14zlitpu2w8/ACES-1.3-Reference-Gamut-Compression-User-Guide.paper?dl=0&rlkey=bbbhneq62f6r85cptpcywu08n>`_
+
+`ACES RGC Implementation Guide 
+<https://www.dropbox.com/scl/fi/cousf6tispmegu4fpmpr2/ACES-Gamut-Compression-Implementation-Guide.paper?dl=0&rlkey=ul38rayi7imiiy4wjo3cektb2>`_
+
+The implementation includes a BuiltinTransform for the ACES Reference Gamut Compression.
+Here's what that looks like in Config YAML:: 
+
+    !<BuiltinTransform> {style: ACES-LMT - ACES 1.3 Reference Gamut Compression}
+
+There is also a FixedFunctionTransform for the underlying gamut compression algorithm.
+As described in the links above, it takes seven parameter values:: 
+
+    [ Limit_Cyan, Limit_Magenta, Limit_Yellow, Threshold_Cyan, Threshold_Magenta, Threshold_Yellow, Roll-off ].
+
+Here is how that looks in Config YAML with the parameters set for the ACES 1.3 Reference Gamut
+Compression::
+
+    !<FixedFunctionTransform> {style: ACES_GamutComp13, params: [1.147, 1.264, 1.312, 0.815, 0.803, 0.88, 1.2]}
+
+And here's how it looks in a CTF file::
+
+    <FixedFunction inBitDepth="32f" outBitDepth="32f" style="GamutComp13Fwd" params="1.147 1.264 1.312 0.815 0.803 0.88 1.2" />
+
+**Config authors**: These new transforms may be used in configs with version 2.1 or higher 
+and CTF files with version 2.1 or higher.
+
+
+OpenFX OCIO plug-ins
+********************
+
+A framework to support OpenFX plug-ins has been added and example source code may be found 
+in the vendor/openfx directory.  The initial set consists of two plug-ins, one for applying
+a ColorSpaceTransform and another for applying a DisplayViewTransform.
+
+Some screenshots of the UI may be found in `PR #1371. 
+<https://github.com/AcademySoftwareFoundation/OpenColorIO/pull/1371>`_
+
+**End users**: There are currently no pre-built executables available, so you will need to
+compile these yourself.  
+
+**Developers**: Set the CMake variable ``-D OCIO_BUILD_OPENFX=ON`` to build the plug-ins.
+
+
+Support for PyPI (pip install)
+******************************
+
+Python wheel generation has been added and support for the `Python Package Index (PyPI). 
+<https://pypi.org/project/opencolorio/>`_
+This allows you to easily install the OCIO Python bindings without needing to build from
+source or use one of the OS-specific package managers.  The command is simply::
+
+    pip install opencolorio
+
+
+Support for emitting Open Shading Language (OSL)
+************************************************
+
+The OCIO GPU Renderer may now emit shaders in `Open Shading Language 
+<https://github.com/AcademySoftwareFoundation/OpenShadingLanguage>`_ format.  There is a 
+new GpuLanguage enum: ``LANGUAGE_OSL_1``.  
+
+Note that the OCIO library does not need to be compiled against the OSL library but running 
+the OSL unit tests does require OSL and OIIO.  These will be built automatically if CMake is 
+able to find them or you set ``-D OSL_ROOT`` to indicate the installed location.
+
+Currently, Lut1D and Lut3D Transforms are not supported and will throw an exception if used.
+All other transforms are supported.  However, dynamic properties are currently locked to 
+their initial values and may not be adjusted.
+
+
+Support for Apple Metal Shading Language and OpenGL ES
+******************************************************
+
+The OCIO GPU Renderer may now emit shaders in these other new languages.  The new 
+GpuLanguage enums are: 
+
+* ``GPU_LANGUAGE_MSL_2_0``
+* ``GPU_LANGUAGE_GLSL_ES_3_0``
+* ``GPU_LANGUAGE_GLSL_ES_1_0``
+
+
+Test suite for the Academy/ASC Common LUT Format (CLF)
+******************************************************
+
+The set of CLF test files was expanded and a set of Python scripts was added under 
+``share/clf`` which were developed by the ACES Common LUT Format Implementation Working Group.
+The scripts allow use of the OpenColorIO renderers to process a sample image through each
+CLF file in the test suite and then compare the reference images to a set of corresponding
+images from another CLF implementation to see if the differences are within the tolerance
+allowed by the `CLF Implementation Guide. <https://docs.acescentral.com/guides/clf/>`_
+
+
+Imath 3 support
+***************
+
+OCIO and its CI system now support Imath 3 for the Half data type dependency.
+
+
+Noteworthy API updates
+**********************
+
+getColorSpaceFromFilePath
++++++++++++++++++++++++++
+
+``Config::getColorSpaceFromFilepath()`` is now the proper way to extract a color space from
+a file path, regardless of whether the config is version 1, or version 2 or higher.  It 
+works regardless of whether the config has FileRules defined.  Therefore, the older
+``Config::parseColorSpaceFromString()`` method (which does not work with FileRules) is 
+officially deprecated.
+
+Note that ``Config::getColorSpaceFromFilepath()`` always returns a color space, regardless
+of whether the config's ``strictparsing`` attribute is true or false.   This was a request from 
+app developers that always need a default (which is a common scenario).  Please see 
+`Issue #1398 <https://github.com/AcademySoftwareFoundation/OpenColorIO/issues/1398>`_ 
+for more details on how to take special action if ``strictparsing`` is true.
+
+
+getDefaultView
+++++++++++++++
+There is a new ``Config::getDefaultView(display, colorspaceName)`` method that takes
+advantage of the ViewingRules feature that was introduced in OCIO 2.0.  It returns the
+default view that is most appropriate for a given color space.  If the color space is
+known, developers should call this rather than ``Config::getDefaultView(display)``.
+
+The motivation is that the best view transform to use varies with the color space.  For
+example, a scene-referred color space typically wants some kind of tone-map such as an
+ACES Output Transform.  However, a display-referred color space will look wrong with such
+a view transform since it has already been tone-mapped.
+
+One scenario where this new method is ideal is when an application needs to generate 
+thumbnails for a large number of images that are in a variety of color spaces.  Note that
+the config must have the Viewing Rules set in order to return different views, but if the
+rules are not set, it will still return the same result as ``Config::getDefaultView(display)``,
+so it may be used with either v1 or v2 configs.
+
+
+Release Notes
+=============
+
+For more detail, please see the GitHub release pages:
+
+`OCIO 2.1.0 <https://github.com/AcademySoftwareFoundation/OpenColorIO/releases/tag/v2.1.0>`_
+
+`OCIO 2.1.1 <https://github.com/AcademySoftwareFoundation/OpenColorIO/releases/tag/v2.1.1>`_
+
+`OCIO 2.1.2 <https://github.com/AcademySoftwareFoundation/OpenColorIO/releases/tag/v2.1.2>`_
diff --git a/docs/releases/ocio_2_2.rst b/docs/releases/ocio_2_2.rst
new file mode 100644
index 0000000000..beb5624017
--- /dev/null
+++ b/docs/releases/ocio_2_2.rst
@@ -0,0 +1,395 @@
+..
+  SPDX-License-Identifier: CC-BY-4.0
+  Copyright Contributors to the OpenColorIO Project.
+
+
+OCIO 2.2 Release
+================
+
+Timeline
+********
+
+OpenColorIO 2.2 was delivered in October 2022 and is in the VFX Reference Platform for
+calendar year 2023.
+
+
+New Features
+============
+
+Built-in Configs
+****************
+
+The OCIO Configs Working Group has put a lot of effort into building new ACES configs that
+take advantage of the features of OCIO v2 and that are informed by the learnings from the 
+previous ACES configs.  The new configs are available from the project's GitHub repo: 
+`OpenColorIO-Config-ACES 
+<https://github.com/AcademySoftwareFoundation/OpenColorIO-Config-ACES/releases/tag/v1.0.0>`_
+
+To make them easily accessible, they are built in to the library itself and may be
+accessed directly from within applications that incorporate the OCIO 2.2 library. 
+
+For Users
++++++++++
+
+Wherever you are able to provide a file path to a config, you may now provide a URI-type 
+string to instead use one of the built-in configs. For example, you may use these strings 
+for the OCIO environment variable.
+
+To use the :ref:`aces_cg`, use this string for the config path::
+    ocio://cg-config-v1.0.0_aces-v1.3_ocio-v2.1
+
+To use the :ref:`aces_studio`, use this string for the config path::
+    ocio://studio-config-v1.0.0_aces-v1.3_ocio-v2.1
+
+This string will give you the current default config, which is currently the ACES CG Config::
+    ocio://default
+
+In future releases, it is expected that updated versions of these configs will be added, 
+each with a unique name string. However, the previous strings will continue to be 
+supported for backwards compatibility.
+
+For Developers
+++++++++++++++
+
+The new URI-type strings may be passed into the existing APIs such as ``CreateFromEnv`` 
+and ``CreateFromFile``.  There is also a new ``CreateFromBuiltinConfig``.  No changes 
+should be needed, but please test that users of your application may use a path containing 
+a URI-type string rather than a path to a config.ocio file.
+
+If you'd like to offer the built-in configs from your user interface, there is an API to 
+access them through the new ``BuiltinConfigRegistry``.  Here is basic Python code to 
+access the registry and print the strings to present in a UI::
+
+    registry = ocio.BuiltinConfigRegistry().getBuiltinConfigs()
+    for item in registry:
+        # The short_name is the URI-style name.
+        # The ui_name is the name to use in a user interface.
+        short_name, ui_name, isRecommended, isDefault = item
+
+        # Don't present built-in configs to users if they are no longer recommended.
+        if isRecommended:
+            print(ui_name)
+
+Prints:
+
+    Academy Color Encoding System - CG Config [COLORSPACES v1.0.0] [ACES v1.3] [OCIO v2.1]
+
+    Academy Color Encoding System - Studio Config [COLORSPACES v1.0.0] [ACES v1.3] [OCIO v2.1]
+
+If your application saves the path to a config and the user enters ``ocio://default``, it 
+is recommended that you don't save that as is.  Instead, call 
+``getDefaultBuiltinConfigName`` to get the name of the current default.  This is 
+guaranteed to give the same results, whereas the default config could change somewhat 
+between releases.  Prepend ``ocio://`` in order to save it as a valid config path.  Here's 
+the Python code::
+
+    "ocio://" + ocio.BuiltinConfigRegistry().getDefaultBuiltinConfigName()
+
+
+Config Archiving
+****************
+
+An OCIO config, including its external LUT files, may now be packaged into a single file.  
+This makes it easier to distribute configs and may slightly discourage tampering with the 
+contents.  An archived config may be created with the new ``ocioarchive`` command-line 
+tool and is stored in a file with the extension ``.ocioz``.  It is basically a compressed 
+ZIP archive of the config and its LUTs.
+
+To be archivable, the external LUT files must be contained within (or below) the config's 
+working directory. (In OCIO terminology, the "working directory" is the directory that 
+contains the config.ocio file.)
+
+A config is not archivable if any of the following are true:
+
+* It contains FileTransforms with a src outside the working directory
+* The search path contains paths outside the working directory
+* The search path contains paths that start with a context variable
+* The working directory is not set (if archiving a Config object)
+
+Context variables are allowed but the intent is that they may only resolve to paths that
+are within or below the working directory.  This is because the archiving function will
+only archive files that are within the working directory in order to ensure that if it is
+later expanded, that it will not create any files outside this directory.
+
+For example, a context variable on the search path intended to contain the name of a 
+sub-directory under the working directory must have the form ``./$DIR_NAME`` rather than 
+just ``$DIR_NAME`` to be considered archivable. This is imperfect since there is no way to
+prevent the context variable from creating a path outside the working dir, but it should
+at least draw attention to the fact that the archive would fail if used with context vars
+that try to abuse the intended functionality.
+
+All files within or below the working directory that end with an extension corresponding 
+to a LUT type supported by OCIO are added to the archive.  This approach was taken since 
+the goal was for the archive to work with any combination of context variables that the 
+original config works with.  Files with other extensions or no extension are not archived.
+
+
+For Users
++++++++++
+
+Wherever you are able to provide a file path to a config, you may now provide a path to a 
+.ocioz file. For example, you may use this for the OCIO environment variable.
+
+The ``ocioarchive`` command-line tool will allow you to archive a config or to expand an 
+existing archive.  It will also allow you to list the contents of an archived config.  Run 
+``ocioarchive --help`` to get a print-out of the various options.
+
+The print out from the ``ociocheck`` command-line tool now includes a new line-item 
+indicating whether the config is archivable.
+
+For Developers
+++++++++++++++
+
+The new .ocioz files may be passed into the existing APIs such as ``CreateFromEnv`` and 
+``CreateFromFile``.  No changes should be needed, but please test that users of your 
+application may use a path containing a .ocioz file rather than a path to a config.ocio 
+file.
+
+The Config class has new ``isArchivable`` and ``archive`` methods.  There is also an 
+``ExtractOCIOZArchive`` function.
+
+
+Abstracting the Source of External LUT Files
+********************************************
+
+The new ConfigIOProxy class allows the calling program to supply the config and any 
+associated LUT files directly, without relying on the standard file system.  This opens 
+the door to expanded ways in which OCIO may be used.
+
+The new config archiving feature was implemented using this mechanism.
+
+For Developers
+++++++++++++++
+
+Please refer to the ``ConfigIOProxy`` class.  By implementing the ``getLutData``, 
+``getConfigData``, and ``getFastLutFileHash`` methods, you have control over how the 
+config is provided to OCIO.  No file system access to a config is required.
+
+The ``CreateFromConfigIOProxy`` factory allows for the creation of a Config object from a 
+ConfigIOProxy object.
+
+
+Converting To or From a Known Color Space
+*****************************************
+
+An OCIO config defines its own self-contained universe of color spaces.  But there are not 
+any requirements for color spaces which must always be included or how they must be named.  
+This poses difficulties for many applications which need to convert to or from certain 
+known standard color spaces.  For example, a renderer might have a physical sun and sky 
+model which produces colors in a CIE space and it needs to convert those into the 
+rendering space defined by a user's custom OCIO config.  Or an application may use an SDK 
+to debayer images from a digital cinema camera.  The SDK produces images in a specific 
+color space which then needs to be processed into something viewable through a user's 
+custom OCIO config.
+
+For Developers
+++++++++++++++
+
+OCIO v2 introduced the Interchange Roles to help address this problem but these had 
+previously been optional and are unlikely to be included in OCIO v1 configs (although it 
+would be perfectly legal to add them).
+
+OCIO 2.2 introduces the new functions ``GetProcessorToBuiltinColorSpace`` and 
+``GetProcessorFromBuiltinColorSpace`` that will allow you to convert to or from any of the 
+color spaces in the built-in Default config (this is currently the ACES CG config 
+described above).  This built-in config includes common spaces such as "Linear Rec.709 
+(sRGB)", "sRGB - Texture", "ACEScg", and "ACES2065-1".
+
+If the source config defines the necessary Interchange Role (typically 
+``aces_interchange``), then the conversion will be well-defined and equivalent to calling 
+``GetProcessorFromConfigs`` with the source config and the Built-in config
+
+However, if the Interchange Roles are not present, heuristics will be used to try and 
+identify a common color space in the source config that may be used to allow the 
+conversion to proceed. If the heuristics fail to find a suitable space, an exception is 
+thrown. The heuristics may evolve, so the results returned by this function for a given 
+source config and color space may change in future releases of the library. However, the 
+Interchange Roles are required in config versions 2.2 and higher, so it is hoped that the 
+need for the heuristics will decrease over time.
+
+The current heuristics should work on any config (including an OCIO v1 config) that was 
+generated by editing one of the ACES configs or any config that uses one of the following 
+as its reference space:
+
+* ACES2065-1
+* ACEScg
+* Scene-linear Rec.709 (sRGB)
+* Scene-linear Rec.2020
+* Scene-linear P3-D65
+
+And that has a color space either for any of the above spaces or for an sRGB texture space 
+that has "sRGB" (case-insensitive) in its color space name or one of its aliases.
+
+Note that the heuristics create a Processor and evaluate color values that must match 
+within a certain tolerance.  No color space is selected purely based on its name alone.  
+If the heuristics fail to find a recognized color space, an exception is thrown.
+
+
+Making the interchange roles required for config versions 2.2 or higher
+***********************************************************************
+
+For Users
++++++++++
+
+Users were surveyed during the OCIO 2.2 development process as to whether the Interchange 
+Roles should become mandatory.  The response was overwhelmingly in favor of doing this, 
+largely because it allows robust interchange of color spaces between configs or to 
+external known standard color spaces.
+
+Therefore, as described in the previous section, for config files of version 2.2 or 
+higher, it is mandatory to define the ``aces_interchange`` role.  If the config includes 
+display color spaces, the ``cie_xyz_d65_interchange`` role is also required.  
+
+Note that the ``cie_xyz_d65_interchange`` is only used in connection with display color 
+spaces (that is, with the display-referred connection space).  It is not used for 
+scene-referred color spaces, and indeed it is an error if a scene-referred space is 
+assigned to that role.
+
+The ``ociocheck`` command-line tool has been updated to make these checks.  In addition, 
+its reporting on other roles has been modified to be more lenient regarding roles which 
+are no longer considered essential.
+
+For Developers
+++++++++++++++
+
+The Config::validate method will log an error if the Config object does not meet these 
+requirements.  Note that an exception is not thrown since it was felt that the Config's 
+``upgradeToLatestVersion`` method must always produce a valid config.
+
+
+Determining if a Color Space is Linear
+**************************************
+
+There have been many requests from developers that would like a standard way to determine 
+if a color space is linear, since this impacts what sort of processing is suitable.  OCIO 
+v2 introduced a new ``encoding`` attribute for color spaces which contains this 
+information.  However, this is optional and may not be set by all config authors.  And it 
+won't be present in OCIO v1 configs, which are still widely used.
+
+For Developers
+++++++++++++++
+
+OCIO 2.2 adds a new ``isColorSpaceLinear`` method to the Config class which may be used 
+for this purpose.  
+
+Note that since OCIO has both a scene-referred and a display-referred reference space, the 
+method also takes a ReferenceSpaceType enum to indicate which reference space the 
+linearity determination is with respect to.  Typically developers will set this to 
+``REFERENCE_SPACE_SCENE``.
+
+The following algorithm is used to make the determination:
+
+* If the color space ``isdata`` attribute is true, return false.
+* If the reference space type of the color space differs from the requested reference 
+space type, return false.
+* If the color space's encoding attribute is present, return true if it matches the 
+expected reference space type (i.e., "scene-linear" for ``REFERENCE_SPACE_SCENE`` or 
+"display-linear" for ``REFERENCE_SPACE_DISPLAY``) and false otherwise.
+* If the color space has no ``to_reference`` or ``from_reference`` transform, return true.
+* Evaluate several points through the color space's transform and check if the output only 
+differs by a scale factor (which may be different per channel, e.g. allowing an arbitrary 
+matrix transform, with no offset).
+
+Note that the last step is a heuristic that may or may not be accurate.  However, note 
+that the ``encoding`` attribute takes precedence and so config authors have the ultimate 
+control over the linearity determination.
+
+
+Getting a Processor for a NamedTransform
+****************************************
+
+For Developers
+++++++++++++++
+
+A new config object was introduced in OCIO v2 called Named Transforms.  These are used 
+when there is a need to apply a mathematical function which is not a conversion between 
+two specific color spaces.  The most common example is applying a transfer function curve 
+to convert linear data to non-linear, or vice-versa.
+
+The new ACES configs include Named Transforms, so it is important for application 
+developers to start supporting this type of config object.  The preferred method for doing 
+so is to add a new tool, similar to FileTransform that applies a Named Transform.  
+
+What is new in OCIO 2.2 is that the code for applying these is now simpler with the 
+introduction of several new getProcessor methods on the Config class that will return a 
+Processor directly from a NamedTransform object.  
+
+In addition, the NamedTransform class has a GetTransform method that returns a (regular) 
+Transform object for a given direction.  It will create the transform from the inverse 
+direction if the transform for the requested direction is missing.
+
+
+Circular OCIO / OIIO Build Dependency Solution
+**********************************************
+
+A long-standing complaint has been regarding the circular build dependency between OCIO 
+and OpenImageIO.  This is due to the fact that OIIO wants to use OCIO for color management 
+and OCIO wants to use OIIO in its command-line tools ``ocioconvert``, ``ociolutimage``, 
+and ``ociodisplay`` for reading and writing image files.  These tools will not be built if
+OIIO is not available when configuring the build.
+
+Furthermore, some package installers will not install these command-line tools due to the
+dependency on OIIO.
+
+By default, OCIO will now build these tools with OpenEXR rather than relying on OIIO.
+
+For Users
++++++++++
+
+If you have a version of OCIO that was not compiled with tools such as ``ocioconvert`` and 
+you want to use OCIO to process images, you could try using OpenImageIO's ``oiiotool``.  
+(Although note that ``ocioconvert`` has a few features that are not in ``oiiotool``, such 
+as GPU processing support.)  Similarly if you have ``ocioconvert``, but it is compiled 
+with OpenEXR rather than OpenImageIO, you may use ``oiiotool`` to convert other image file 
+formats to/from OpenEXR.
+
+If you want to use ``oiiotool`` but it does not support a particular type of conversion, 
+you may be able to use ``ociowrite`` to export a CTF file and then use that with the 
+``--ociofiletransform`` option in ``oiiotool``.
+
+For Developers
+++++++++++++++
+
+In OCIO 2.2, by default, the build will now use OpenEXR rather than OpenImageIO for the 
+command-line tools that read or write images.  This will limit the functionality of the 
+aforementioned command-line tools to only working with OpenEXR files.  If you want support 
+for more file formats in these tools, you will still need to have OIIO available when 
+building OCIO and set the CMake variable ``-D OCIO_USE_OIIO_FOR_APPS=ON``.
+
+
+Miscellaneous Improvements
+**************************
+
+Here are some other improvements in OCIO 2.2:
+
+* Support for more types of ICC Monitor Profiles -- All of the parametric curve types are 
+now supported.
+
+* New hash function for calculating cache IDs -- The md5 algorithm has been replaced with 
+xxhash, which provides a considerable speed-up for various operations.  The APIs that 
+return cache ID strings will obviously return different strings now, but please note that 
+these are not guaranteed to be unchanged across releases.  (The 128-bit version of xxhash 
+was used, which is the same length as for md5.)
+
+* The command-line tools ``ocioconvert``, ``ociowrite``, and ``ocioperf`` now support 
+using an inverse DisplayViewTransform.
+
+* Add DisplayViewTransform and NamedTransform support to Baker.
+
+* Several new Built-in Transforms are available for version 2.2 config files, including 
+ARRI LogC4.
+
+* Preliminary support for ACES Metadata File (AMF) -- A prototype Python tool has been 
+added named ``pyocioamf`` that converts an AMF file into the OCIO native transform format 
+CTF. It uses a prototype ACES Reference config file that is serving as a database of ACES 
+Transform IDs for interpreting the AMF file. 
+
+* Support for PyPI installation from source rather than pre-built binaries.
+
+
+Release Notes
+=============
+
+For more detail, please see the GitHub release pages:
+
+`OCIO 2.2.0 <https://github.com/AcademySoftwareFoundation/OpenColorIO/releases/tag/v2.2.0>`_
diff --git a/docs/toc_redirect.rst b/docs/toc_redirect.rst
index 3785c12f92..2dd5537956 100644
--- a/docs/toc_redirect.rst
+++ b/docs/toc_redirect.rst
@@ -30,7 +30,7 @@
 .. toctree::
    :hidden:
 
-   upgrading_v2/_index
+   releases/_index
 
 .. toctree::
    :hidden:
diff --git a/include/OpenColorIO/OpenColorIO.h b/include/OpenColorIO/OpenColorIO.h
index 514fcc3db0..a7d073f788 100644
--- a/include/OpenColorIO/OpenColorIO.h
+++ b/include/OpenColorIO/OpenColorIO.h
@@ -459,14 +459,25 @@ class OCIOEXPORT Config
     const char * getEnvironmentVarDefault(const char * name) const;
     void clearEnvironmentVars();
 
+    /**
+     * \brief The EnvironmentMode controls the behavior of loadEnvironment.
+     *  * ENV_ENVIRONMENT_LOAD_PREDEFINED - Only update vars already added to the Context.
+     *  * ENV_ENVIRONMENT_LOAD_ALL        - Load all env. vars into the Context.
+     *
+     * \note Loading ALL the env. vars may reduce performance and reduce cache efficiency.
+     *
+     * Client programs generally will not use these methods because the EnvironmentMode is
+     * set automatically when a Config is loaded.  If the Config has an "environment"
+     * section, the mode is set to LOAD_PREDEFINED, and otherwise set to LOAD_ALL.
+     */
     void setEnvironmentMode(EnvironmentMode mode) noexcept;
     EnvironmentMode getEnvironmentMode() const noexcept;
+    /// Initialize the environment/context variables in the Config's Context.
     void loadEnvironment() noexcept;
 
     const char * getSearchPath() const;
     /**
-     * \brief Set all search paths as a concatenated string, ':' to separate the
-     *     paths. 
+     * \brief Set all search paths as a concatenated string, use ':' to separate the paths. 
      * 
      * See \ref addSearchPath for a more robust and platform-agnostic method of
      * setting the search paths.
@@ -3442,8 +3453,9 @@ class OCIOEXPORT Context
     void setWorkingDir(const char * dirname);
     const char * getWorkingDir() const;
 
-    /// Add (or update) a context variable. But it removes it if the value argument
-    /// is null.
+    /// Add (or update) a context variable. But it removes it if the value argument is null.
+    /// Note that a Context StringVar is the same thing as a Config EnvironmentVar and these
+    /// are both often referred to as a "context var".
     void setStringVar(const char * name, const char * value) noexcept;
     /// Get the context variable value. It returns an empty string if the context 
     /// variable is null or does not exist.
@@ -3459,13 +3471,11 @@ class OCIOEXPORT Context
     /// Add to the instance all the context variables from ctx.
     void addStringVars(const ConstContextRcPtr & ctx) noexcept;
 
-    
+    /// See \ref Config::setEnvironmentMode.
     void setEnvironmentMode(EnvironmentMode mode) noexcept;
-
-    
     EnvironmentMode getEnvironmentMode() const noexcept;
 
-    /// Seed all string vars with the current environment.
+    /// Seed string vars with the current environment, based on the EnvironmentMode setting.
     void loadEnvironment() noexcept;
 
     /// Resolve all the context variables from the string. It could be color space
diff --git a/include/OpenColorIO/OpenColorTypes.h b/include/OpenColorIO/OpenColorTypes.h
index 9976ac821d..c400568eaa 100644
--- a/include/OpenColorIO/OpenColorTypes.h
+++ b/include/OpenColorIO/OpenColorTypes.h
@@ -456,11 +456,12 @@ enum GpuLanguage
     GPU_LANGUAGE_MSL_2_0            ///< Metal Shading Language
 };
 
+/// Controls which environment variables are loaded into a Context object.
 enum EnvironmentMode
 {
     ENV_ENVIRONMENT_UNKNOWN = 0,
-    ENV_ENVIRONMENT_LOAD_PREDEFINED,
-    ENV_ENVIRONMENT_LOAD_ALL
+    ENV_ENVIRONMENT_LOAD_PREDEFINED, ///< Only load vars in the config's environment section
+    ENV_ENVIRONMENT_LOAD_ALL         ///< Load all env. vars (note: may reduce performance)
 };
 
 /// A RangeTransform may be set to clamp the values, or not.

From 794acce0ffc291dd67e4d1f9a9302abef3197cf7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?=
 <105517825+cedrik-fuoco-adsk@users.noreply.github.com>
Date: Thu, 5 Jan 2023 10:26:45 -0500
Subject: [PATCH 46/81] Adsk Contrib - OCIO cmake improvements (#1736)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* - Refactoring how OCIO search for minizip-ng. The first step is to search for an external minizip-ng. If not found, search for minizip-ng with MZ_COMPAT=ON (libminizip). If it is not found either, download and install minizip-ng with MZ_COMPAT=OFF.
- Removing the minizip-ng part for the includes for minizip-ng headers.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Update comments

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Improved find_package in Config mode (adding it back)
Added missing scripts to install minizip-ng and zlib for the analysis workflow

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Adding +x permissions for install_minizip_ng and zlib
Fixing path to find zlib in install_minizip-ng.sh

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Changing target name to match the one used by minizip-ng library (+ using the imported target instead of creating a new one when minizip-ng is found)

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* First pass for the OpenColorIOConfig.cmake file with the required dependencies only.
A few extra fixes for OpenEXR, ZLIB and Minizip-ng.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Adding a informative message when building static ocio instead of per module.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Adding an extra step to test the consumer app with static OpenColorIO for Windows, Linux and MacOS.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Re-using the same test instead of creating a new one by removing the condition on build-shared.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Fixing spacing, typo and adding back the NOT in the condition. (it was removed for debugging purpose)

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Changing directory where we share OCIO custom find modules (now in <install dir>/share/cmake/modules).
Adding a cmake macro when installing OCIO since it is needed by some custom find modules.
The find modules are only installed when building OCIO as a static library.
The config.cmake.in files now only looks for the dependency when OCIO was built as a static library.
Tentative (ci-workflow): Adding cmake-consumer test for static builds.
Overhaul of the Findzlib module to be more inline with the FindZLIB from cmake and to be more robust.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Adding missing backward slashes and saving the build path in an existing step instead of creating a new step.
Adding a check for CMake version for a section in Findzlib.cmake.
Re-phrasing some comments.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Prevent the download of the dependencies in the scenario where static OCIO is linked to a consumer project. Since OCIO_INSTALL_EXT_PACKAGES is not defined, our find module tries to download the dependencies, but we don't want that mecanism for consumer project.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Changing path where OCIO install its own custom find module.
Removing custom Findzlib and making modifications to use CMake FindZLIB.
Created a InstallZLIB module which does the download and install part if OCIO_INSTALL_EXT_PACAKGES is ALL or MISSING.
Tweaked config.cmake.in to use CMake FindZLIB.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Adding more details in comments

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Removing an extra "shell" property in CI workflow.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Fix issues discovered with the failing CI workflow:

Cmake-consumer test now prefer the static version of the dependencies.
Fix an issue where yaml-cpp is not found by a consumer app (variable spelled incorectly in Findyaml-cpp).
Fix an issue expat library is not found by a consumer app on Windows.
Adding support for OCIO's  <pkg_name>_STATIC_LIBRARY for ZLIB while supporting ZLIB_USE_STATIC_LIBS (CMake 3.24+) from CMake's FindZLIB.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Removing pystring_STATIC_LIBRARY does not exists in CI workflow.
Detecting if the build is Debug in the find module that OCIO installs since they can't rely on variables set by OCIO CMakefiles.
Fix issue with yaml-cpp library naming in debug.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Fix issues on Windows with expat and minizip-ng library naming when building static OCIO.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Added option for macOS CI job in order to get more info on a failed job. Will be reverted once the issue is found.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Fixing a issue on macOS.
Improving ZLIB usage comments.
Bumping minizip-ng to the latest version - 3.0.7.
Bumping ZLIB to the latest version 2.1.13 to fix a vulnerability (CVE-2022-37434)

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Proposing to remove Findminizip module since external minizip-ng build need to be done with the same option as the internal build.
Otherwise, linking and symbols issues are going to happend if the other libraries are not linked in correctly.
It is going to simplify the maintainability of OCIO.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Removing findminizip.cmake from the install since it does not exist anymore.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
Signed-off-by: Cédrik Fuoco <105517825+cedrik-fuoco-adsk@users.noreply.github.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 .github/workflows/ci_workflow.yml             |  95 +++++-
 CMakeLists.txt                                |  29 ++
 share/cmake/modules/FindExtPackages.cmake     | 106 ++++++-
 share/cmake/modules/FindImath.cmake           |  11 +-
 share/cmake/modules/FindOpenEXR.cmake         |  17 +-
 share/cmake/modules/Findexpat.cmake           |  60 +++-
 share/cmake/modules/Findlcms2.cmake           |   2 +-
 share/cmake/modules/Findminizip-ng.cmake      | 270 +++++++++---------
 share/cmake/modules/Findminizip.cmake         | 172 -----------
 share/cmake/modules/Findopenfx.cmake          |   2 +-
 share/cmake/modules/Findpybind11.cmake        |   2 +-
 share/cmake/modules/Findpystring.cmake        |  10 +-
 share/cmake/modules/Findyaml-cpp.cmake        |  38 +--
 .../{Findzlib.cmake => InstallZLIB.cmake}     |  78 ++---
 src/cmake/Config.cmake.in                     | 123 ++++++++
 15 files changed, 611 insertions(+), 404 deletions(-)
 delete mode 100644 share/cmake/modules/Findminizip.cmake
 rename share/cmake/modules/{Findzlib.cmake => InstallZLIB.cmake} (73%)

diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml
index 890021741b..77527cd723 100644
--- a/.github/workflows/ci_workflow.yml
+++ b/.github/workflows/ci_workflow.yml
@@ -287,11 +287,12 @@ jobs:
                 --target install \
                 --config ${{ matrix.build-type }} \
                 -- -j$(nproc)
+          echo "ocio_build_path=$(pwd)" >> $GITHUB_ENV
         working-directory: _build
       - name: Test
         run: ctest -V -C ${{ matrix.build-type }}
         working-directory: _build
-      - name: Test CMake Consumer
+      - name: Test CMake Consumer with shared OCIO
         if: matrix.build-shared == 'ON'
         run: |
           cmake . \
@@ -301,6 +302,34 @@ jobs:
                 --config ${{ matrix.build-type }}
           ./consumer
         working-directory: _build/tests/cmake-consumer-dist
+      - name: Test CMake Consumer with static OCIO
+        if: matrix.build-shared == 'OFF'
+        # The yaml-cpp_VERSION is set below because Findyaml-cpp.cmake needs it but is unable to 
+        # extract it from the headers, like the other modules.
+        #
+        # Prefer the static version of each dependencies by using <pkg>_STATIC_LIBRARY. 
+        # Alternatively, this can be done by setting <pkg>_LIBRARY and <pkg>_INCLUDE_DIR to 
+        # the static version of the package.
+        run: |
+          cmake . \
+                -DCMAKE_PREFIX_PATH=../../../_install \
+                -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \
+                -Dexpat_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dexpat_STATIC_LIBRARY=ON \
+                -DImath_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -DImath_STATIC_LIBRARY=ON \
+                -Dpystring_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dyaml-cpp_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dyaml-cpp_STATIC_LIBRARY=ON \
+                -Dyaml-cpp_VERSION=0.7.0 \
+                -DZLIB_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -DZLIB_STATIC_LIBRARY=ON \
+                -Dminizip-ng_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dminizip-ng_STATIC_LIBRARY=ON
+          cmake --build . \
+                --config ${{ matrix.build-type }}
+          ./consumer
+        working-directory: _build/tests/cmake-consumer-dist
 
   # ---------------------------------------------------------------------------
   # macOS
@@ -424,11 +453,12 @@ jobs:
                 --target install \
                 --config ${{ matrix.build-type }} \
                 -- -j$(sysctl -n hw.ncpu)
+          echo "ocio_build_path=$(pwd)" >> $GITHUB_ENV
         working-directory: _build
       - name: Test
         run: ctest -V -C ${{ matrix.build-type }}
         working-directory: _build
-      - name: Test CMake Consumer
+      - name: Test CMake Consumer with shared OCIO
         if: matrix.build-shared == 'ON'
         run: |
           cmake . \
@@ -438,6 +468,34 @@ jobs:
                 --config ${{ matrix.build-type }}
           ./consumer
         working-directory: _build/tests/cmake-consumer-dist
+      - name: Test CMake Consumer with static OCIO
+        if: matrix.build-shared == 'OFF'
+        # The yaml-cpp_VERSION is set below because Findyaml-cpp.cmake needs it but is unable to 
+        # extract it from the headers, like the other modules.
+        #
+        # Prefer the static version of each dependencies by using <pkg>_STATIC_LIBRARY. 
+        # Alternatively, this can be done by setting <pkg>_LIBRARY and <pkg>_INCLUDE_DIR to 
+        # the static version of the package.       
+        run: |
+          cmake . \
+                -DCMAKE_PREFIX_PATH=../../../_install \
+                -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \
+                -Dexpat_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dexpat_STATIC_LIBRARY=ON \
+                -DImath_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -DImath_STATIC_LIBRARY=ON \
+                -Dpystring_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dyaml-cpp_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dyaml-cpp_STATIC_LIBRARY=ON \
+                -Dyaml-cpp_VERSION=0.7.0 \
+                -DZLIB_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -DZLIB_STATIC_LIBRARY=ON \
+                -Dminizip-ng_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dminizip-ng_STATIC_LIBRARY=ON
+          cmake --build . \
+                --config ${{ matrix.build-type }}
+          ./consumer
+        working-directory: _build/tests/cmake-consumer-dist
 
   # ---------------------------------------------------------------------------
   # Windows
@@ -567,13 +625,14 @@ jobs:
           cmake --build . \
                 --target install \
                 --config ${{ matrix.build-type }}
+          echo "ocio_build_path=$(pwd)" >> $GITHUB_ENV
         shell: bash
         working-directory: _build
       - name: Test
         run: ctest -V -C ${{ matrix.build-type }}
         shell: bash
         working-directory: _build
-      - name: Test CMake Consumer
+      - name: Test CMake Consumer with shared OCIO
         if: matrix.build-shared == 'ON'
         run: |
           cmake . \
@@ -585,3 +644,33 @@ jobs:
           ./${{ matrix.build-type }}/consumer
         shell: bash
         working-directory: _build/tests/cmake-consumer-dist
+      - name: Test CMake Consumer with static OCIO
+        if: matrix.build-shared == 'OFF'
+        # The yaml-cpp_VERSION is set below because Findyaml-cpp.cmake needs it but is unable to 
+        # extract it from the headers, like the other modules.
+        #
+        # Prefer the static version of each dependencies by using <pkg>_STATIC_LIBRARY. 
+        # Alternatively, this can be done by setting <pkg>_LIBRARY and <pkg>_INCLUDE_DIR to 
+        # the static version of the package.
+        run: |
+          cmake . \
+                -DCMAKE_PREFIX_PATH=../../../_install \
+                -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \
+                -Dexpat_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dexpat_STATIC_LIBRARY=ON \
+                -DImath_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -DImath_STATIC_LIBRARY=ON \
+                -Dpystring_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dyaml-cpp_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dyaml-cpp_STATIC_LIBRARY=ON \
+                -Dyaml-cpp_VERSION=0.7.0 \
+                -DZLIB_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -DZLIB_STATIC_LIBRARY=ON \
+                -Dminizip-ng_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dminizip-ng_STATIC_LIBRARY=ON
+          cmake --build . \
+                --config ${{ matrix.build-type }}
+          export PATH=../../../_install/bin:$PATH
+          ./${{ matrix.build-type }}/consumer
+        shell: bash
+        working-directory: _build/tests/cmake-consumer-dist
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d7ea2b176f..3911a154ce 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -251,6 +251,15 @@ if(OCIO_BUILD_STATIC)
 	endif()
 endif()
 
+if (NOT BUILD_SHARED_LIBS AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+   message(STATUS "Note that building the static version of OpenColorIO does not embed the dependencies\  
+   into the library file. The needed dependencies must be linked to the consumer
+   application or shared library that uses static OpenColorIO.
+
+   The following mandatory dependencies MUST be linked to the consumer application or shared library:
+   expat, yaml-cpp, Imath, pystring, minizip-ng and ZLIB")
+endif()
+
 
 ###############################################################################
 # Find or install external dependencies
@@ -299,6 +308,8 @@ set(OCIO_TARGETS_EXPORT_NAME "${PROJECT_NAME}Targets.cmake")
 set(OCIO_VERSION_CONFIG "${CMAKE_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake")
 set(OCIO_PROJECT_CONFIG "${CMAKE_BINARY_DIR}/${PROJECT_NAME}Config.cmake")
 set(OCIO_CONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
+set(OCIO_CUSTOM_FIND_MODULE_DIR "${CMAKE_INSTALL_PREFIX}/share/OpenColorIO/cmake/modules")
+set(OCIO_CUSTOM_MACROS_MODULE_DIR "${CMAKE_INSTALL_PREFIX}/share/OpenColorIO/cmake/macros")
 
 # Version fetched from the top level project()
 write_basic_package_version_file(
@@ -321,6 +332,24 @@ install(
     FILE ${OCIO_TARGETS_EXPORT_NAME}
 )
 
+if (NOT BUILD_SHARED_LIBS)
+    # Install custom macros used in the find modules.
+    install(FILES
+        ${CMAKE_CURRENT_LIST_DIR}/share/cmake/macros/VersionUtils.cmake
+        DESTINATION ${OCIO_CUSTOM_MACROS_MODULE_DIR}
+    )
+
+    # Install custom Find modules.
+    install(FILES
+        ${CMAKE_CURRENT_LIST_DIR}/share/cmake/modules/Findexpat.cmake
+        ${CMAKE_CURRENT_LIST_DIR}/share/cmake/modules/FindImath.cmake
+        ${CMAKE_CURRENT_LIST_DIR}/share/cmake/modules/Findpystring.cmake
+        ${CMAKE_CURRENT_LIST_DIR}/share/cmake/modules/Findminizip-ng.cmake
+        ${CMAKE_CURRENT_LIST_DIR}/share/cmake/modules/Findyaml-cpp.cmake
+        DESTINATION ${OCIO_CUSTOM_FIND_MODULE_DIR}
+    )
+endif()
+
 install(
     FILES "${OCIO_PROJECT_CONFIG}" "${OCIO_VERSION_CONFIG}"
     DESTINATION "${OCIO_CONFIG_INSTALL_DIR}"
diff --git a/share/cmake/modules/FindExtPackages.cmake b/share/cmake/modules/FindExtPackages.cmake
index b777e5bb8f..5455a08ce6 100644
--- a/share/cmake/modules/FindExtPackages.cmake
+++ b/share/cmake/modules/FindExtPackages.cmake
@@ -21,6 +21,17 @@ set(CMAKE_FIND_PACKAGE_NO_PACKAGE_REGISTRY ON CACHE BOOL
 set(CMAKE_FIND_PACKAGE_NO_SYSTEM_PACKAGE_REGISTRY ON CACHE BOOL
     "Disable CMake System Package Registry when finding packages")
 
+if (APPLE)
+    # Store the previous value of CMAKE_FIND_FRAMEWORK and CMAKE_FIND_APPBUNDLE.
+    set(_PREVIOUS_CMAKE_FIND_FRAMEWORK ${CMAKE_FIND_FRAMEWORK})
+    set(_PREVIOUS_CMAKE_FIND_APPBUNDLE ${CMAKE_FIND_APPBUNDLE})
+
+    # Prioritize other paths before Frameworks and Appbundle for find_path, find_library and 
+    # find_package.
+    set(CMAKE_FIND_FRAMEWORK LAST)
+    set(CMAKE_FIND_APPBUNDLE LAST)
+endif()
+
 ###############################################################################
 ### Packages and versions ###
 
@@ -41,14 +52,93 @@ find_package(pystring 1.1.3 REQUIRED)
 set(_Imath_ExternalProject_VERSION "3.1.5")
 find_package(Imath 3.0 REQUIRED)
 
-# ZLIB
-# https://github.com/madler/zlib
-set(_zlib_ExternalProject_VERSION "1.2.12")
-find_package(zlib REQUIRED)
+###############################################################################
+### ZLIB (https://github.com/madler/zlib)
+###
+### The following variables can be set:
+### ZLIB_ROOT               Location of ZLIB library file and includes folder.
+###                         Alternatively, ZLIB_LIBRARY and ZLIB_INCLUDE_DIR can be used.
+###
+### ZLIB_LIBRARY            Location of ZLIB library file.
+### ZLIB_INCLUDE_DIR        Location of ZLIB includes folder.
+###
+### ZLIB_VERSION            ZLIB Version (CMake 3.26+)
+### ZLIB_VERSION_STRING     ZLIB Version (CMake < 3.26)
+###
+###############################################################################
+# ZLIB 1.2.13 is used since it fixes a critical vulnerability.
+# See https://nvd.nist.gov/vuln/detail/CVE-2022-37434
+# See https://github.com/madler/zlib/releases/tag/v1.2.13
+set(_ZLIB_FIND_VERSION "1.2.13")
+set(_ZLIB_ExternalProject_VERSION ${_ZLIB_FIND_VERSION})
+
+if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
+    # ZLIB_USE_STATIC_LIBS is supported only from CMake 3.24+.
+    if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24.0") 
+        if (ZLIB_STATIC_LIBRARY)
+            set(ZLIB_USE_STATIC_LIBS "${ZLIB_STATIC_LIBRARY}")
+        endif()
+    else() # For CMake < 3.24 since ZLIB_USE_STATIC_LIBS is not available.
+        if(NOT ZLIB_LIBRARY)
+            if(DEFINED CMAKE_FIND_LIBRARY_PREFIXES)
+                set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES "${CMAKE_FIND_LIBRARY_PREFIXES}")
+            else()
+                set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES)
+            endif()
+
+            if(DEFINED CMAKE_FIND_LIBRARY_SUFFIXES)
+                set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_FIND_LIBRARY_SUFFIXES}")
+            else()
+                set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES)
+            endif()
+
+            # Prefix/suffix for windows.
+            if(WIN32)
+                list(APPEND CMAKE_FIND_LIBRARY_PREFIXES "" "lib")
+                list(APPEND CMAKE_FIND_LIBRARY_SUFFIXES ".dll.a")
+            endif()
+
+            # Check if static lib is preferred.
+            if(ZLIB_STATIC_LIBRARY OR ZLIB_USE_STATIC_LIBS)
+                if(WIN32)
+                    set(CMAKE_FIND_LIBRARY_SUFFIXES .lib .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
+                else()
+                    set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
+                endif()
+            endif()
+        endif()
+    endif()
+
+    set(_ZLIB_REQUIRED REQUIRED)
+    # Override REQUIRED if package can be installed
+    if(OCIO_INSTALL_EXT_PACKAGES STREQUAL MISSING)
+        set(_ZLIB_REQUIRED "")
+    endif()
+
+    find_package(ZLIB ${_ZLIB_FIND_VERSION} ${_ZLIB_REQUIRED})
+
+    # Restore the original find library ordering
+    if(DEFINED _ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES)
+        set(CMAKE_FIND_LIBRARY_SUFFIXES "${_ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}")
+    else()
+        set(CMAKE_FIND_LIBRARY_SUFFIXES)
+    endif()
+
+    if(DEFINED _ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES)
+        set(CMAKE_FIND_LIBRARY_PREFIXES "${_ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES}")
+    else()
+        set(CMAKE_FIND_LIBRARY_PREFIXES)
+    endif()
+endif()
+
+if(NOT ZLIB_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+    include(InstallZLIB)
+endif()
+###############################################################################
 
 # minizip-ng
 # https://github.com/zlib-ng/minizip-ng
-find_package(minizip-ng 3.0.6 REQUIRED)
+find_package(minizip-ng 3.0.7 REQUIRED)
 
 if(OCIO_BUILD_APPS)
 
@@ -157,3 +247,9 @@ if(OCIO_BUILD_TESTS)
         message(WARNING "Could NOT find OpenImageIO. Skipping build of the OSL unit tests.")
     endif()
 endif()
+
+if (APPLE)
+    # Restore CMAKE_FIND_FRAMEWORK and CMAKE_FIND_APPBUNDLE values.
+    set(CMAKE_FIND_FRAMEWORK ${_PREVIOUS_CMAKE_FIND_FRAMEWORK})
+    set(CMAKE_FIND_APPBUNDLE ${_PREVIOUS_CMAKE_FIND_APPBUNDLE})
+endif()
diff --git a/share/cmake/modules/FindImath.cmake b/share/cmake/modules/FindImath.cmake
index 1510bbdf70..30f5b4e465 100644
--- a/share/cmake/modules/FindImath.cmake
+++ b/share/cmake/modules/FindImath.cmake
@@ -25,6 +25,15 @@
 ###############################################################################
 ### Try to find package ###
 
+# BUILD_TYPE_DEBUG variable is currently set in one of the OCIO's CMake files. 
+# Now that some OCIO's find module are installed with the library itself (with static build), 
+# a consumer app don't have access to the variables set by an OCIO's CMake files. Therefore, some 
+# OCIO's find modules must detect the build type by itselves. 
+set(BUILD_TYPE_DEBUG OFF)
+if(CMAKE_BUILD_TYPE MATCHES "[Dd][Ee][Bb][Uu][Gg]")
+   set(BUILD_TYPE_DEBUG ON)
+endif()
+
 if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
     set(_Imath_REQUIRED_VARS Imath_LIBRARY)
     set(_Imath_LIB_VER "${Imath_FIND_VERSION_MAJOR}_${Imath_FIND_VERSION_MINOR}")
@@ -131,7 +140,7 @@ endif()
 ###############################################################################
 ### Install package from source ###
 
-if(NOT Imath_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+if(NOT Imath_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
     include(ExternalProject)
     include(GNUInstallDirs)
 
diff --git a/share/cmake/modules/FindOpenEXR.cmake b/share/cmake/modules/FindOpenEXR.cmake
index 8c14635413..a8b8448fd6 100644
--- a/share/cmake/modules/FindOpenEXR.cmake
+++ b/share/cmake/modules/FindOpenEXR.cmake
@@ -45,6 +45,10 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
     if(OpenEXR_FOUND)
         get_target_property(OpenEXR_LIBRARY OpenEXR::OpenEXR LOCATION)
         get_target_property(OpenEXR_INCLUDE_DIR OpenEXR::OpenEXR INTERFACE_INCLUDE_DIRECTORIES)
+        
+        # IMPORTED_GLOBAL property must be set to TRUE since alisasing a non-global imported target
+        # is not possible until CMake 3.18+.
+        set_target_properties(OpenEXR::OpenEXR PROPERTIES IMPORTED_GLOBAL TRUE)
     endif()
 
     # Override REQUIRED if package can be installed
@@ -77,7 +81,7 @@ macro(set_target_location target_name)
     endif()
 endmacro()
 
-if(NOT OpenEXR_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+if(NOT OpenEXR_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
     include(ExternalProject)
     include(GNUInstallDirs)
 
@@ -85,13 +89,14 @@ if(NOT OpenEXR_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
     set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
 
     # Required dependency
-    # OCIO custom module to find ZLIB. (Findzlib)
-    find_package(zlib)
     if(NOT ZLIB_FOUND)
-        message(STATUS "ZLib is required to build OpenEXR.")
-        return()
+        find_package(ZLIB)
+        if(NOT ZLIB_FOUND)
+            message(STATUS "ZLib is required to build OpenEXR.")
+            return()
+        endif()
     endif()
-
+    
     find_package(Threads)
     if(NOT Threads_FOUND)
         message(STATUS "Threads is required to build OpenEXR.")
diff --git a/share/cmake/modules/Findexpat.cmake b/share/cmake/modules/Findexpat.cmake
index 59e5c2e102..8e23cbba0b 100644
--- a/share/cmake/modules/Findexpat.cmake
+++ b/share/cmake/modules/Findexpat.cmake
@@ -25,6 +25,15 @@
 ###############################################################################
 ### Try to find package ###
 
+# BUILD_TYPE_DEBUG variable is currently set in one of the OCIO's CMake files. 
+# Now that some OCIO's find module are installed with the library itself (with static build), 
+# a consumer app don't have access to the variables set by an OCIO's CMake files. Therefore, some 
+# OCIO's find modules must detect the build type by itselves. 
+set(BUILD_TYPE_DEBUG OFF)
+if(CMAKE_BUILD_TYPE MATCHES "[Dd][Ee][Bb][Uu][Gg]")
+   set(BUILD_TYPE_DEBUG ON)
+endif()
+
 if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
     set(_expat_REQUIRED_VARS expat_LIBRARY)
 
@@ -87,21 +96,58 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
                 expat/include
         )
 
+
+        # Expat uses prefix "lib" on all platform by default.
+        # Library name doesn't change in debug.
+
+        # For Windows, see https://github.com/libexpat/libexpat/blob/R_2_2_8/expat/win32/README.txt.
+        # libexpat<postfix=[w][d][MD|MT]>.lib
+        # The "w" indicates the UTF-16 version of the library.
+
         # Lib names to search for
-        set(_expat_LIB_NAMES expat libexpat)
+        set(_expat_LIB_NAMES libexpat expat)
+
         if(WIN32 AND BUILD_TYPE_DEBUG)
-            # Prefer Debug lib names (Windows only)
-            list(INSERT _expat_LIB_NAMES 0 expatd)
+            # Prefer Debug lib names. The library name changes only on Windows.
+            list(INSERT _expat_LIB_NAMES 0 libexpatd libexpatwd expatd expatwd)
+        elseif(WIN32)
+            # libexpat(w).dll/.lib
+            list(APPEND _expat_LIB_NAMES libexpatw expatw)
         endif()
 
         if(expat_STATIC_LIBRARY)
+            # Looking for both "lib" prefix and CMAKE_STATIC_LIBRARY_PREFIX.
             # Prefer static lib names
-            set(_expat_STATIC_LIB_NAMES 
+            set(_expat_STATIC_LIB_NAMES
+                "libexpat${CMAKE_STATIC_LIBRARY_SUFFIX}"
                 "${CMAKE_STATIC_LIBRARY_PREFIX}expat${CMAKE_STATIC_LIBRARY_SUFFIX}")
+
             if(WIN32 AND BUILD_TYPE_DEBUG)
-                # Prefer static Debug lib names (Windows only)
+                # Prefer static Debug lib names. The library name changes only on Windows.
+                list(INSERT _expat_STATIC_LIB_NAMES 0
+                    "libexpatdMD${CMAKE_STATIC_LIBRARY_SUFFIX}" 
+                    "libexpatdMT${CMAKE_STATIC_LIBRARY_SUFFIX}" 
+                    "libexpatd${CMAKE_STATIC_LIBRARY_SUFFIX}"
+                    "libexpatwdMD${CMAKE_STATIC_LIBRARY_SUFFIX}" 
+                    "libexpatwdMT${CMAKE_STATIC_LIBRARY_SUFFIX}" 
+                    "libexpatwd${CMAKE_STATIC_LIBRARY_SUFFIX}"
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}expatdMD${CMAKE_STATIC_LIBRARY_SUFFIX}" 
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}expatdMT${CMAKE_STATIC_LIBRARY_SUFFIX}" 
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}expatd${CMAKE_STATIC_LIBRARY_SUFFIX}"
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}expatwdMD${CMAKE_STATIC_LIBRARY_SUFFIX}" 
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}expatwdMT${CMAKE_STATIC_LIBRARY_SUFFIX}" 
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}expatwd${CMAKE_STATIC_LIBRARY_SUFFIX}")
+            elseif (WIN32)
                 list(INSERT _expat_STATIC_LIB_NAMES 0
-                    "${CMAKE_STATIC_LIBRARY_PREFIX}expatd${CMAKE_STATIC_LIBRARY_SUFFIX}")
+                    "libexpatMD${CMAKE_STATIC_LIBRARY_SUFFIX}" 
+                    "libexpatMT${CMAKE_STATIC_LIBRARY_SUFFIX}"
+                    "libexpatwMD${CMAKE_STATIC_LIBRARY_SUFFIX}" 
+                    "libexpatwMT${CMAKE_STATIC_LIBRARY_SUFFIX}"
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}expatMD${CMAKE_STATIC_LIBRARY_SUFFIX}" 
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}expatMT${CMAKE_STATIC_LIBRARY_SUFFIX}"
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}expatwMD${CMAKE_STATIC_LIBRARY_SUFFIX}" 
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}expatwMT${CMAKE_STATIC_LIBRARY_SUFFIX}"
+                    )
             endif()
         endif()
 
@@ -162,7 +208,7 @@ endif()
 ###############################################################################
 ### Install package from source ###
 
-if(NOT expat_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+if(NOT expat_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
     include(ExternalProject)
     include(GNUInstallDirs)
 
diff --git a/share/cmake/modules/Findlcms2.cmake b/share/cmake/modules/Findlcms2.cmake
index b7049ce4f4..f35fcf14f4 100644
--- a/share/cmake/modules/Findlcms2.cmake
+++ b/share/cmake/modules/Findlcms2.cmake
@@ -97,7 +97,7 @@ endif()
 ###############################################################################
 ### Install package from source ###
 
-if(NOT lcms2_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+if(NOT lcms2_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
     include(ExternalProject)
 
     set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
diff --git a/share/cmake/modules/Findminizip-ng.cmake b/share/cmake/modules/Findminizip-ng.cmake
index 0915ad8b18..b961d1c9ac 100644
--- a/share/cmake/modules/Findminizip-ng.cmake
+++ b/share/cmake/modules/Findminizip-ng.cmake
@@ -3,12 +3,6 @@
 #
 # Locate or install minizip-ng
 #
-# This module will try to do the following:
-# 1) Locate minizip-ng
-# 2) If minizip-ng cannot be found, it will try to find minizip (minizip-ng with MZ_COMPAT=ON)
-# 3) If minizip-ng with MZ_COMPAT=ON cannot be found, minizip-ng will be downloaded, built and
-#    installed.
-#
 # Variables defined by this module:
 #   minizip-ng_FOUND        - If FALSE, do not try to link to minizip-ng
 #   minizip-ng_LIBRARY      - minizip-ng library to link to
@@ -32,6 +26,15 @@
 ###############################################################################
 ### Try to find package ###
 
+# BUILD_TYPE_DEBUG variable is currently set in one of the OCIO's CMake files. 
+# Now that some OCIO's find module are installed with the library itself (with static build), 
+# a consumer app don't have access to the variables set by an OCIO's CMake files. Therefore, some 
+# OCIO's find modules must detect the build type by itselves. 
+set(BUILD_TYPE_DEBUG OFF)
+if(CMAKE_BUILD_TYPE MATCHES "[Dd][Ee][Bb][Uu][Gg]")
+   set(BUILD_TYPE_DEBUG ON)
+endif()
+
 if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
     if(NOT DEFINED minizip-ng_ROOT)
         # Search for minizip-ng-config.cmake
@@ -106,23 +109,22 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
                 minizip-ng/include
         )
 
-        # Lib names to search for
-        set(_minizip-ng_LIB_NAMES minizip-ng)
+        # Minizip-ng uses prefix "lib" on all platform by default.
+        # Library name doesn't change in debug.
 
-        if(BUILD_TYPE_DEBUG)
-            # Prefer Debug lib names (Windows only)
-            list(INSERT _minizip-ng_LIB_NAMES 0 minizip-ngd)
-        endif()
+        # Lib names to search for.
+        # Search for "minizip" since the library could be named "minizip" if it was built 
+        # with MZ_COMPAT=ON.
+        set(_minizip-ng_LIB_NAMES libminizip-ng minizip-ng libminizip minizip)
 
         if(minizip-ng_STATIC_LIBRARY)
-            # Prefer static lib names
+            # Looking for both "lib" prefix and CMAKE_STATIC_LIBRARY_PREFIX.
+            # Prefer static lib names.
             set(_minizip-ng_STATIC_LIB_NAMES 
-                "${CMAKE_STATIC_LIBRARY_PREFIX}minizip-ng${CMAKE_STATIC_LIBRARY_SUFFIX}")
-            if(WIN32 AND BUILD_TYPE_DEBUG)
-                # Prefer static Debug lib names (Windows only)
-                list(INSERT _minizip-ng_STATIC_LIB_NAMES 0
-                    "${CMAKE_STATIC_LIBRARY_PREFIX}minizip-ngd${CMAKE_STATIC_LIBRARY_SUFFIX}")
-            endif()
+                "libminizip-ng${CMAKE_STATIC_LIBRARY_SUFFIX}"
+                "${CMAKE_STATIC_LIBRARY_PREFIX}minizip-ng${CMAKE_STATIC_LIBRARY_SUFFIX}"
+                "libminizip${CMAKE_STATIC_LIBRARY_SUFFIX}"
+                "${CMAKE_STATIC_LIBRARY_PREFIX}minizip${CMAKE_STATIC_LIBRARY_SUFFIX}")
         endif()
 
         # Find library
@@ -138,7 +140,23 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
         )
 
         # Get version from header or pkg-config
-        set(minizip-ng_VERSION "${minizip-ng_FIND_VERSION}")
+        if(minizip-ng_INCLUDE_DIR)
+            list(GET minizip-ng_INCLUDE_DIR 0 _minizip-ng_INCLUDE_DIR)
+            if(EXISTS "${_minizip-ng_INCLUDE_DIR}/mz.h")
+                set(_minizip-ng_CONFIG "${_minizip-ng_INCLUDE_DIR}/mz.h")
+            endif()
+        endif()
+
+        if(_minizip-ng_CONFIG)
+            file(STRINGS "${_minizip-ng_CONFIG}" _minizip-ng_VER_SEARCH 
+                REGEX "^[ \t]*#define[ \t]+MZ_VERSION[ \t]+\\(\"[.0-9]+\"\\).*$")
+            if(_minizip-ng_VER_SEARCH)
+                string(REGEX REPLACE ".*#define[ \t]+MZ_VERSION[ \t]+\\(\"([.0-9]+)\"\\).*" 
+                    "\\1" minizip-ng_VERSION "${_minizip-ng_VER_SEARCH}")
+            endif()
+        elseif(PC_minizip-ng_FOUND)
+            set(minizip-ng_VERSION "${PC_minizip-ng_VERSION}")
+        endif()
     endif()
 
     # Override REQUIRED if package can be installed
@@ -156,128 +174,120 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
     )
 endif()
 
-if(NOT minizip-ng_FOUND)
-    # Looking for an external minizip-ng that might be built using MZ_COMPAT=ON.
-    # But do not download it if it cannot be found.
-    find_package(minizip ${minizip-ng_FIND_VERSION} REQUIRED)
+###############################################################################
+### Create target
+if(NOT TARGET MINIZIP::minizip-ng)
+    add_library(MINIZIP::minizip-ng UNKNOWN IMPORTED GLOBAL)
+    set(_minizip-ng_TARGET_CREATE TRUE)
 endif()
 
-if(NOT minizip_FOUND AND NOT TARGET MINIZIP::minizip)
-    ###############################################################################
-    ### Create target
-
-    if(NOT TARGET MINIZIP::minizip-ng)
-        add_library(MINIZIP::minizip-ng UNKNOWN IMPORTED GLOBAL)
-        set(_minizip-ng_TARGET_CREATE TRUE)
-    endif()
-
-    ###############################################################################
-    ### Install package from source ###
-    if(NOT minizip-ng_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
-        include(ExternalProject)
-        include(GNUInstallDirs)
-
-        set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
-        set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
-
-        # Set find_package standard args
-        set(minizip-ng_FOUND TRUE)
-        set(minizip-ng_VERSION ${minizip-ng_FIND_VERSION})
-
-        set(minizip-ng_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}/minizip-ng")
+###############################################################################
+### Install package from source ###
+
+if(NOT minizip-ng_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+    include(ExternalProject)
+    include(GNUInstallDirs)
+
+    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
+    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
+
+    # Set find_package standard args
+    set(minizip-ng_FOUND TRUE)
+    set(minizip-ng_VERSION ${minizip-ng_FIND_VERSION})
+
+    set(minizip-ng_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}/minizip-ng")
+
+    # Minizip-ng use a hardcoded lib prefix instead of CMAKE_STATIC_LIBRARY_PREFIX
+    set(_minizip-ng_LIB_PREFIX "lib")
+
+    set(minizip-ng_LIBRARY
+        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${_minizip-ng_LIB_PREFIX}minizip-ng${_minizip-ng_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+
+    if(_minizip-ng_TARGET_CREATE)
+        set(minizip-ng_CMAKE_ARGS
+            ${minizip-ng_CMAKE_ARGS}
+            -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
+            -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
+            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
+            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
+            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
+            -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
+            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
+            # Since the other modules create a subfolder for the includes by default and since
+            # minizip-ng does not, a suffix is added to CMAKE_INSTALL_INCLUDEDIR in order to
+            # install the headers under a subdirectory named "minizip-ng".
+            # Note that this does not affect external builds for minizip-ng.
+            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}/minizip-ng
+            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
+            -DBUILD_SHARED_LIBS=OFF
+            -DMZ_OPENSSL=OFF
+            -DMZ_LIBBSD=OFF
+            -DMZ_BUILD_TESTS=OFF
+            -DMZ_COMPAT=OFF
+            -DMZ_BZIP2=OFF
+            -DMZ_LZMA=OFF
+            -DMZ_LIBCOMP=OFF
+            -DMZ_ZSTD=OFF
+            -DMZ_PKCRYPT=OFF
+            -DMZ_WZAES=OFF
+            -DMZ_SIGNING=OFF
+            -DMZ_ZLIB=ON
+            -DMZ_ICONV=OFF
+            -DMZ_FETCH_LIBS=OFF
+            -DMZ_FORCE_FETCH_LIBS=OFF
+            -DZLIB_LIBRARY=${ZLIB_LIBRARIES}
+            -DZLIB_INCLUDE_DIR=${ZLIB_INCLUDE_DIRS}
+        )
 
-        # Minizip-ng use a hardcoded lib prefix instead of CMAKE_STATIC_LIBRARY_PREFIX
-        set(_minizip-ng_LIB_PREFIX "lib")
+        if(CMAKE_TOOLCHAIN_FILE)
+            set(minizip-ng_CMAKE_ARGS
+                ${minizip-ng_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
+        endif()
 
-        set(minizip-ng_LIBRARY
-            "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${_minizip-ng_LIB_PREFIX}minizip-ng${_minizip-ng_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+        if(APPLE)
+            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
 
-        if(_minizip-ng_TARGET_CREATE)
             set(minizip-ng_CMAKE_ARGS
                 ${minizip-ng_CMAKE_ARGS}
-                -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
-                -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
-                -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-                -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
-                -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
-                -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
-                -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
-                -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
-                -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-                # Since the other modules create a subfolder for the includes by default and since
-                # minizip-ng does not, a suffix is added to CMAKE_INSTALL_INCLUDEDIR in order to
-                # install the headers under a subdirectory named "minizip-ng".
-                # Note that this does not affect external builds for minizip-ng.
-                -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}/minizip-ng
-                -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
-                -DBUILD_SHARED_LIBS=OFF
-                -DMZ_OPENSSL=OFF
-                -DMZ_LIBBSD=OFF
-                -DMZ_BUILD_TESTS=OFF
-                -DMZ_COMPAT=OFF
-                -DMZ_BZIP2=OFF
-                -DMZ_LZMA=OFF
-                -DMZ_LIBCOMP=OFF
-                -DMZ_ZSTD=OFF
-                -DMZ_PKCRYPT=OFF
-                -DMZ_WZAES=OFF
-                -DMZ_SIGNING=OFF
-                -DMZ_ZLIB=ON
-                -DMZ_ICONV=OFF
-                -DMZ_FETCH_LIBS=OFF
-                -DMZ_FORCE_FETCH_LIBS=OFF
-                -DZLIB_LIBRARY=${ZLIB_LIBRARIES}
-                -DZLIB_INCLUDE_DIR=${ZLIB_INCLUDE_DIRS}
+                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
+                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
             )
+        endif()
 
-            if(CMAKE_TOOLCHAIN_FILE)
-                set(minizip-ng_CMAKE_ARGS
-                    ${minizip-ng_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
-            endif()
-
-            if(APPLE)
-                string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
-
-                set(minizip-ng_CMAKE_ARGS
-                    ${minizip-ng_CMAKE_ARGS}
-                    -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
-                    -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
-                )
-            endif()
-
-            if (ANDROID)
-                set(minizip-ng_CMAKE_ARGS
-                    ${minizip-ng_CMAKE_ARGS}
-                    -DANDROID_PLATFORM=${ANDROID_PLATFORM}
-                    -DANDROID_ABI=${ANDROID_ABI}
-                    -DANDROID_STL=${ANDROID_STL})
-            endif()
+        if (ANDROID)
+            set(minizip-ng_CMAKE_ARGS
+                ${minizip-ng_CMAKE_ARGS}
+                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
+                -DANDROID_ABI=${ANDROID_ABI}
+                -DANDROID_STL=${ANDROID_STL})
         endif()
+    endif()
 
-        # Hack to let imported target be built from ExternalProject_Add
-        file(MAKE_DIRECTORY ${minizip-ng_INCLUDE_DIR})
-
-        ExternalProject_Add(minizip-ng_install
-            GIT_REPOSITORY "https://github.com/zlib-ng/minizip-ng.git"
-            GIT_TAG "${minizip-ng_VERSION}"
-            GIT_CONFIG advice.detachedHead=false
-            GIT_SHALLOW TRUE
-            PREFIX "${_EXT_BUILD_ROOT}/libminizip-ng"
-            BUILD_BYPRODUCTS ${minizip-ng_LIBRARY}
-            CMAKE_ARGS ${minizip-ng_CMAKE_ARGS}
-            EXCLUDE_FROM_ALL TRUE
-            BUILD_COMMAND ""
-            INSTALL_COMMAND
-                ${CMAKE_COMMAND} --build .
-                                --config ${CMAKE_BUILD_TYPE}
-                                --target install
-                                --parallel
-        )
+    # Hack to let imported target be built from ExternalProject_Add
+    file(MAKE_DIRECTORY ${minizip-ng_INCLUDE_DIR})
+
+    ExternalProject_Add(minizip-ng_install
+        GIT_REPOSITORY "https://github.com/zlib-ng/minizip-ng.git"
+        GIT_TAG "${minizip-ng_VERSION}"
+        GIT_CONFIG advice.detachedHead=false
+        GIT_SHALLOW TRUE
+        PREFIX "${_EXT_BUILD_ROOT}/libminizip-ng"
+        BUILD_BYPRODUCTS ${minizip-ng_LIBRARY}
+        CMAKE_ARGS ${minizip-ng_CMAKE_ARGS}
+        EXCLUDE_FROM_ALL TRUE
+        BUILD_COMMAND ""
+        INSTALL_COMMAND
+            ${CMAKE_COMMAND} --build .
+                            --config ${CMAKE_BUILD_TYPE}
+                            --target install
+                            --parallel
+    )
 
-        add_dependencies(MINIZIP::minizip-ng minizip-ng_install)
-        message(STATUS "Installing minizip-ng: ${minizip-ng_LIBRARY} (version \"${minizip-ng_VERSION}\")")
-    endif()
+    add_dependencies(MINIZIP::minizip-ng minizip-ng_install)
+    message(STATUS "Installing minizip-ng: ${minizip-ng_LIBRARY} (version \"${minizip-ng_VERSION}\")")
 endif()
 
 ###############################################################################
diff --git a/share/cmake/modules/Findminizip.cmake b/share/cmake/modules/Findminizip.cmake
deleted file mode 100644
index 1a75747177..0000000000
--- a/share/cmake/modules/Findminizip.cmake
+++ /dev/null
@@ -1,172 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright Contributors to the OpenColorIO Project.
-#
-# Locate minizip-ng with MZ_COMPAT=ON.
-# 
-# This module DOES NOT install minizip-ng with MZ_COMPAT=ON if it is not found.
-#
-# Note: This option changes the name for the library file to "libminizip", but it is still
-#       actually minizip-ng. OCIO uses the API from minizip-ng.
-#
-# Variables defined by this module:
-#   minizip_FOUND        - If FALSE, do not try to link to minizip
-#   minizip_LIBRARY      - minizip library to link to
-#   minizip_INCLUDE_DIR  - Where to find mz.h and other headers
-#   minizip_VERSION      - The version of the library
-#
-#   This module set the variables below because this is still minizip-ng. The librarie become
-#   "minizip" because of the cmake option MZ_COMPAT=ON.
-#
-#   minizip-ng_FOUND        - If FALSE, do not try to link to minizip-ng
-#   minizip-ng_LIBRARY      - minizip-ng library to link to
-#   minizip-ng_INCLUDE_DIR  - Where to find mz.h and other headers
-#   minizip-ng_VERSION      - The version of the library
-#
-# Targets defined by this module:
-#   MINIZIP::minizip-ng - IMPORTED target, if found
-#
-###############################################################################
-### Try to find package ###
-
-if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
-
-    if(NOT DEFINED minizip_ROOT)
-        # Search for minizip-config.cmake
-        find_package(minizip ${minizip_FIND_VERSION} CONFIG QUIET)
-    endif()
-
-    if (minizip_FOUND)
-        get_target_property(minizip_INCLUDE_DIR MINIZIP::minizip INTERFACE_INCLUDE_DIRECTORIES)
-        get_target_property(minizip_LIBRARY MINIZIP::minizip LOCATION)
-
-        if (NOT minizip_LIBRARY)
-            # Lib names to search for
-            set(_minizip_LIB_NAMES minizip)
-
-            if(BUILD_TYPE_DEBUG)
-                # Prefer Debug lib names (Windows only)
-                list(INSERT _minizip_LIB_NAMES 0 minizipd)
-            endif()
-
-            if(minizip_STATIC_LIBRARY)
-                # Prefer static lib names
-                set(_minizip_STATIC_LIB_NAMES 
-                    "${CMAKE_STATIC_LIBRARY_PREFIX}minizip${CMAKE_STATIC_LIBRARY_SUFFIX}")
-                if(WIN32 AND BUILD_TYPE_DEBUG)
-                    # Prefer static Debug lib names (Windows only)
-                    list(INSERT _minizip_STATIC_LIB_NAMES 0
-                        "${CMAKE_STATIC_LIBRARY_PREFIX}minizipd${CMAKE_STATIC_LIBRARY_SUFFIX}")
-                endif()
-            endif()
-
-            # Find library
-            find_library(minizip_LIBRARY
-                NAMES
-                    ${_minizip_STATIC_LIB_NAMES}
-                    ${_minizip_LIB_NAMES}
-                HINTS
-                    ${minizip_ROOT}
-                    ${PC_minizip_LIBRARY_DIRS}
-                PATH_SUFFIXES
-                    lib64 lib 
-            )
-        endif()
-    else()
-        list(APPEND _minizip_REQUIRED_VARS minizip_INCLUDE_DIR)
-
-        # Search for minizip.pc
-        find_package(PkgConfig QUIET)
-        pkg_check_modules(PC_minizip QUIET "minizip>=${minizip_FIND_VERSION}")
-
-        # Find include directory
-        find_path(minizip_INCLUDE_DIR
-            NAMES
-                mz.h
-            HINTS
-                ${minizip_ROOT}
-                ${PC_minizip_INCLUDE_DIRS}
-            PATH_SUFFIXES
-                include
-                minizip/include
-        )
-
-        # Lib names to search for
-        set(_minizip_LIB_NAMES minizip)
-
-        if(BUILD_TYPE_DEBUG)
-            # Prefer Debug lib names (Windows only)
-            list(INSERT _minizip_LIB_NAMES 0 minizipd)
-        endif()
-
-        if(minizip_STATIC_LIBRARY)
-            # Prefer static lib names
-            set(_minizip_STATIC_LIB_NAMES 
-                "${CMAKE_STATIC_LIBRARY_PREFIX}minizip${CMAKE_STATIC_LIBRARY_SUFFIX}")
-            if(WIN32 AND BUILD_TYPE_DEBUG)
-                # Prefer static Debug lib names (Windows only)
-                list(INSERT _minizip_STATIC_LIB_NAMES 0
-                    "${CMAKE_STATIC_LIBRARY_PREFIX}minizipd${CMAKE_STATIC_LIBRARY_SUFFIX}")
-            endif()
-        endif()
-
-        # Find library
-        find_library(minizip_LIBRARY
-            NAMES
-                ${_minizip_STATIC_LIB_NAMES}
-                ${_minizip_LIB_NAMES}
-            HINTS
-                ${minizip_ROOT}
-                ${PC_minizip_LIBRARY_DIRS}
-            PATH_SUFFIXES
-                lib64
-                lib 
-        )
-
-        # Get version from header or pkg-config
-        set(minizip_VERSION "${minizip_FIND_VERSION}")
-    endif()
-
-    # Override REQUIRED if package can be installed
-    if(OCIO_INSTALL_EXT_PACKAGES STREQUAL MISSING)
-        set(minizip_FIND_REQUIRED FALSE)
-    endif()
-
-    include(FindPackageHandleStandardArgs)
-    find_package_handle_standard_args(minizip
-        REQUIRED_VARS 
-            minizip_LIBRARY
-            minizip_INCLUDE_DIR
-        VERSION_VAR
-            minizip_VERSION
-    )
-endif()
-
-###############################################################################
-### Create target
-
-if(minizip_FOUND AND NOT TARGET MINIZIP::minizip-ng)
-    add_library(MINIZIP::minizip-ng UNKNOWN IMPORTED GLOBAL)
-    set(_minizip_TARGET_CREATE TRUE)
-endif()
-
-###############################################################################
-
-###############################################################################
-### Configure target ###
-
-if(minizip_FOUND AND _minizip_TARGET_CREATE)
-    set_target_properties(MINIZIP::minizip-ng PROPERTIES
-        IMPORTED_LOCATION "${minizip_LIBRARY}"
-        INTERFACE_INCLUDE_DIRECTORIES "${minizip_INCLUDE_DIR}"
-    )
-
-    # This is still minizip-ng even though the library is called minizip because of MZ_COMPAT=ON.
-    set(minizip-ng_LIBRARY ${minizip_LIBRARY})
-    set(minizip-ng_INCLUDE_DIR ${minizip_INCLUDE_DIR})
-    set(minizip-ng_FOUND ${minizip_FOUND})
-    set(minizip-ng_VERSION ${minizip_VERSION})
-
-    mark_as_advanced(minizip_INCLUDE_DIR minizip_LIBRARY minizip_VERSION)
-
-    target_link_libraries(MINIZIP::minizip-ng INTERFACE ZLIB::ZLIB)
-endif()
\ No newline at end of file
diff --git a/share/cmake/modules/Findopenfx.cmake b/share/cmake/modules/Findopenfx.cmake
index f989e0f7ae..8b0eefb344 100644
--- a/share/cmake/modules/Findopenfx.cmake
+++ b/share/cmake/modules/Findopenfx.cmake
@@ -58,7 +58,7 @@ endif()
 ###############################################################################
 ### Install package from source ###
 
-if(NOT openfx_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+if(NOT openfx_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
     include(ExternalProject)
     include(GNUInstallDirs)
 
diff --git a/share/cmake/modules/Findpybind11.cmake b/share/cmake/modules/Findpybind11.cmake
index 316496096c..e6c2f35a86 100644
--- a/share/cmake/modules/Findpybind11.cmake
+++ b/share/cmake/modules/Findpybind11.cmake
@@ -134,7 +134,7 @@ endif()
 ###############################################################################
 ### Install package from source ###
 
-if(NOT pybind11_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+if(NOT pybind11_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
     include(ExternalProject)
     include(GNUInstallDirs)
 
diff --git a/share/cmake/modules/Findpystring.cmake b/share/cmake/modules/Findpystring.cmake
index 702ac1e812..bbd5e1fd27 100644
--- a/share/cmake/modules/Findpystring.cmake
+++ b/share/cmake/modules/Findpystring.cmake
@@ -66,7 +66,7 @@ endif()
 ###############################################################################
 ### Install package from source ###
 
-if(NOT pystring_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+if(NOT pystring_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
     include(ExternalProject)
 
     set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
@@ -127,14 +127,6 @@ if(NOT pystring_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
                 -DANDROID_STL=${ANDROID_STL})
         endif()
 
-        if(NOT BUILD_SHARED_LIBS)
-            #TODO: Find a way to merge in the static libs when built with internal pystring
-            message(WARNING
-                "Building STATIC libOpenColorIO using the in-built pystring. "
-                "pystring symbols are NOT included in the output binary!"
-            )
-        endif()
-
         # Hack to let imported target be built from ExternalProject_Add
         file(MAKE_DIRECTORY ${pystring_INCLUDE_DIR})
 
diff --git a/share/cmake/modules/Findyaml-cpp.cmake b/share/cmake/modules/Findyaml-cpp.cmake
index 023e14fc07..59558ce6a9 100644
--- a/share/cmake/modules/Findyaml-cpp.cmake
+++ b/share/cmake/modules/Findyaml-cpp.cmake
@@ -25,6 +25,15 @@
 ###############################################################################
 ### Try to find package ###
 
+# BUILD_TYPE_DEBUG variable is currently set in one of the OCIO's CMake files. 
+# Now that some OCIO's find module are installed with the library itself (with static build), 
+# a consumer app don't have access to the variables set by an OCIO's CMake files. Therefore, some 
+# OCIO's find modules must detect the build type by itselves.  
+set(BUILD_TYPE_DEBUG OFF)
+if(CMAKE_BUILD_TYPE MATCHES "[Dd][Ee][Bb][Uu][Gg]")
+   set(BUILD_TYPE_DEBUG ON)
+endif()
+
 if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
     set(_yaml-cpp_REQUIRED_VARS yaml-cpp_LIBRARY)
 
@@ -66,22 +75,21 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
 
         # Lib names to search for
         set(_yaml-cpp_LIB_NAMES yaml-cpp)
-        if(WIN32 AND BUILD_TYPE_DEBUG AND NOT MINGW)
-            # Prefer Debug lib names (Windows only)
+        if(BUILD_TYPE_DEBUG)
+            # Prefer Debug lib names.
             list(INSERT _yaml-cpp_LIB_NAMES 0 yaml-cppd)
         endif()
 
         if(yaml-cpp_STATIC_LIBRARY)
             # Prefer static lib names
-            if(WIN32 AND NOT MINGW)
-                set(_yaml-cpp_LIB_SUFFIX "md")
-            endif()
             set(_yaml-cpp_STATIC_LIB_NAMES 
-                "libyaml-cpp${_yaml-cpp_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
-            if(WIN32 AND BUILD_TYPE_DEBUG AND NOT MINGW)
-                # Prefer static Debug lib names (Windows only)
+                "${CMAKE_STATIC_LIBRARY_PREFIX}yaml-cpp${CMAKE_STATIC_LIBRARY_SUFFIX}")
+            
+            # Starting from 0.7.0, all platforms uses the suffix "d" for debug.
+            # See https://github.com/jbeder/yaml-cpp/blob/master/CMakeLists.txt#L141
+            if(BUILD_TYPE_DEBUG)
                 list(INSERT _yaml-cpp_STATIC_LIB_NAMES 0
-                    "libyaml-cpp${_yaml-cpp_LIB_SUFFIX}d${CMAKE_STATIC_LIBRARY_SUFFIX}")
+                    "${CMAKE_STATIC_LIBRARY_PREFIX}yaml-cppd${CMAKE_STATIC_LIBRARY_SUFFIX}")
             endif()
         endif()
 
@@ -91,7 +99,7 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
                 ${_yaml-cpp_STATIC_LIB_NAMES}
                 ${_yaml-cpp_LIB_NAMES}
             HINTS 
-                ${_yaml-cpp_ROOT}
+                ${yaml-cpp_ROOT}
                 ${PC_yaml-cpp_LIBRARY_DIRS}
             PATH_SUFFIXES 
                 lib64
@@ -130,7 +138,7 @@ endif()
 ###############################################################################
 ### Install package from source ###
 
-if(NOT yaml-cpp_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+if(NOT yaml-cpp_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
     include(ExternalProject)
     include(GNUInstallDirs)
 
@@ -213,14 +221,6 @@ if(NOT yaml-cpp_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
                 -DANDROID_STL=${ANDROID_STL})
         endif()
 
-        if(NOT BUILD_SHARED_LIBS)
-            #TODO: Find a way to merge in the static libs when built with internal yamlcpp
-            message(WARNING
-                "Building STATIC libOpenColorIO using the in-built yaml-cpp. "
-                "yaml-cpp symbols are NOT included in the output binary!"
-            )
-        endif()
-
         set(yaml-cpp_GIT_TAG "yaml-cpp-${yaml-cpp_VERSION}")
 
         # Hack to let imported target be built from ExternalProject_Add
diff --git a/share/cmake/modules/Findzlib.cmake b/share/cmake/modules/InstallZLIB.cmake
similarity index 73%
rename from share/cmake/modules/Findzlib.cmake
rename to share/cmake/modules/InstallZLIB.cmake
index 546cea341a..954563ec4e 100644
--- a/share/cmake/modules/Findzlib.cmake
+++ b/share/cmake/modules/InstallZLIB.cmake
@@ -1,75 +1,58 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 #
-# Locate or install minizip-ng
+# Install ZLIB
+#
+# Except for the variable ZLIB_VERSION_STRING, OCIO sets the same variables as the 
+# CMake's FindZLIB when installing ZLIB manually.
 #
 # Variables defined by this module:
-#   ZLIB_FOUND          - If FALSE, do not try to link to minizip-ng
+#   ZLIB_FOUND          - If FALSE, do not try to link to zlib
 #   ZLIB_LIBRARIES      - ZLIB library to link to
 #   ZLIB_INCLUDE_DIRS   - Where to find zlib.h and other headers
 #   ZLIB_VERSION        - The version of the library
 #
 # Targets defined by this module:
-#   ZLIB::ZLIB - IMPORTED target, if found
-#
-# This module is named GetZLIB because it is not used with find_package().
-# It must be included using include(). 
-#
-# The reason is that CMake provide a FindZLIB already and the current file is
-# using it.
+#   ZLIB::ZLIB          - Properties:
+#                         IMPORTED_LOCATION ${ZLIB_LIBRARIES}
+#                         INTERFACE_INCLUDE_DIRECTORIES ${ZLIB_INCLUDE_DIRS}
 #
 ###############################################################################
-### Try to find package ###
-
-# Assign the rigtt name for ZLIB depending on the OS.
-if(WIN32)
-    set(_ZLIB_LIB_NAME "zlib")
-    set(_ZLIB_STATIC_LIB_NAME "zlibstatic")
-else()
-    set(_ZLIB_LIB_NAME "z")
-    set(_ZLIB_STATIC_LIB_NAME "z")
-endif()
-
-if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
-    set(_ZLIB_REQUIRED_VARS ZLIB_LIBRARIES)
-
-    if(NOT DEFINED ZLIB_ROOT)
-        # Save old value of CMAKE_MODULE_PATH 
-        set(_ZLIB__CMAKE_MODULE_PATH_OLD_ ${CMAKE_MODULE_PATH})
-        # Force find_package to use CMAKE module and not custom modules.
-        set(CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules") 
-
-        # Use CMake FindZLIB module
-        find_package(ZLIB ${zlib_FIND_VERSION})
-
-        # Restore CMAKE_MODULE_PATH
-        set(CMAKE_MODULE_PATH ${_ZLIB__CMAKE_MODULE_PATH_OLD_})
-    endif()
-endif()
 
 ###############################################################################
 ### Create target
-
+###############################################################################
 if(NOT TARGET ZLIB::ZLIB)
+    add_library(ZLIB::ZLIB UNKNOWN IMPORTED GLOBAL)
     set(_ZLIB_TARGET_CREATE TRUE)
 endif()
 
 ###############################################################################
-### Install package from source ###
-if(NOT ZLIB_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+### Install package from source
+###############################################################################
+if(NOT ZLIB_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
     include(ExternalProject)
     include(GNUInstallDirs)
 
     set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
     set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
 
+    if(WIN32)
+        set(_ZLIB_LIB_NAME "zlib")
+        set(_ZLIB_STATIC_LIB_NAME "zlibstatic")
+    else()
+        set(_ZLIB_LIB_NAME "z")
+        set(_ZLIB_STATIC_LIB_NAME "z")
+    endif()
+
     # Set find_package standard args
     set(ZLIB_FOUND TRUE)
-    if(_zlib_ExternalProject_VERSION)
-        set(ZLIB_VERSION ${_zlib_ExternalProject_VERSION})
+    if(_ZLIB_ExternalProject_VERSION)
+        set(ZLIB_VERSION ${_ZLIB_ExternalProject_VERSION})
     else()
-        set(ZLIB_VERSION ${zlib_FIND_VERSION})
+        set(ZLIB_VERSION ${ZLIB_FIND_VERSION})
     endif()
+
     set(ZLIB_INCLUDE_DIRS "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}")
 
     # Windows need the "d" suffix at the end.
@@ -149,22 +132,19 @@ if(NOT ZLIB_FOUND AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
         COMMAND ${CMAKE_COMMAND} -E remove -f ${_EXT_DIST_ROOT}/${_ZLIB_INSTALL_LIBDIR}/zlib.lib ${_EXT_DIST_ROOT}/bin/zlib.dll
     )
 
-    add_library(ZLIB::ZLIB STATIC IMPORTED GLOBAL)
     add_dependencies(ZLIB::ZLIB ZLIB_install)
-    set_property(TARGET ZLIB::ZLIB PROPERTY
-        IMPORTED_LOCATION "${ZLIB_LIBRARIES}"
-    )
-    target_include_directories(ZLIB::ZLIB INTERFACE "${CMAKE_INSTALL_BINDIR}/include")
-
+    
+    # Setting those variables to follow the same naming as the other OCIO custom find modules.
     set(ZLIB_LIBRARY ${ZLIB_LIBRARIES})
     set(ZLIB_INCLUDE_DIR ${ZLIB_INCLUDE_DIRS})
 
     message(STATUS "Installing ZLIB: ${ZLIB_LIBRARIES} (version \"${ZLIB_VERSION}\")")
 endif()
 
+
 ###############################################################################
 ### Configure target ###
-
+###############################################################################
 if(_ZLIB_TARGET_CREATE)
     set_target_properties(ZLIB::ZLIB PROPERTIES
         IMPORTED_LOCATION ${ZLIB_LIBRARIES}
diff --git a/src/cmake/Config.cmake.in b/src/cmake/Config.cmake.in
index feabdeb2e3..6a4932a836 100644
--- a/src/cmake/Config.cmake.in
+++ b/src/cmake/Config.cmake.in
@@ -2,6 +2,129 @@
 
 include(CMakeFindDependencyMacro)
 
+if (NOT @BUILD_SHARED_LIBS@) # NOT @BUILD_SHARED_LIBS@
+    if (APPLE)
+        # Store the previous value of CMAKE_FIND_FRAMEWORK and CMAKE_FIND_APPBUNDLE.
+        set(_PREVIOUS_CMAKE_FIND_FRAMEWORK ${CMAKE_FIND_FRAMEWORK})
+        set(_PREVIOUS_CMAKE_FIND_APPBUNDLE ${CMAKE_FIND_APPBUNDLE})
+
+        # Prioritize other paths before Frameworks and Appbundle for find_path, find_library and 
+        # find_package.
+        set(CMAKE_FIND_FRAMEWORK LAST)
+        set(CMAKE_FIND_APPBUNDLE LAST)
+    endif()
+
+    # Get the install directory.
+    set(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_DIR}")
+    # Get the install directory. Since the current file is under 
+    # <install directory>/lib/cmake/OpenColorIO going back three directory.
+    foreach(i RANGE 1 3)
+        get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+        if(_IMPORT_PREFIX STREQUAL "/")
+        set(_IMPORT_PREFIX "")
+        break()
+        endif()
+    endforeach()
+
+    # Append OCIO custom find module path.
+    list(APPEND CMAKE_MODULE_PATH "${_IMPORT_PREFIX}/share/OpenColorIO/cmake/modules")
+    list(APPEND CMAKE_MODULE_PATH "${_IMPORT_PREFIX}/share/OpenColorIO/cmake/macros")
+
+    ########################
+    # Required dependencies 
+    ########################
+
+    if (NOT expat::expat)
+        find_dependency(expat @expat_VERSION@)
+    endif()
+
+    if (NOT Imath::Imath)
+        find_dependency(Imath @Imath_VERSION@)
+    endif()
+
+    if (NOT pystring::pystring)
+        find_dependency(pystring @pystring_VERSION@)
+    endif()
+
+    if (NOT yaml-cpp)
+        find_dependency(yaml-cpp @yaml-cpp_VERSION@)
+    endif()
+
+    if (NOT ZLIB::ZLIB)
+        # ZLIB_VERSION is available starting CMake 3.26+.
+        # ZLIB_VERSION_STRING is still available for backward compatibility.
+        # See https://cmake.org/cmake/help/git-stage/module/FindZLIB.html
+
+        # ZLIB_USE_STATIC_LIBS is supported only from CMake 3.24+.
+        if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24.0") 
+            if (ZLIB_STATIC_LIBRARY)
+                set(ZLIB_USE_STATIC_LIBS "${ZLIB_STATIC_LIBRARY}")
+            endif()
+        else() # For CMake < 3.24 since ZLIB_USE_STATIC_LIBS is not available.
+            if(NOT ZLIB_LIBRARY)
+                if(DEFINED CMAKE_FIND_LIBRARY_PREFIXES)
+                    set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES "${CMAKE_FIND_LIBRARY_PREFIXES}")
+                else()
+                    set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES)
+                endif()
+
+                if(DEFINED CMAKE_FIND_LIBRARY_SUFFIXES)
+                    set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_FIND_LIBRARY_SUFFIXES}")
+                else()
+                    set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES)
+                endif()
+
+                # Prefix/suffix for windows.
+                if(WIN32)
+                    list(APPEND CMAKE_FIND_LIBRARY_PREFIXES "" "lib")
+                    list(APPEND CMAKE_FIND_LIBRARY_SUFFIXES ".dll.a")
+                endif()
+
+                # Check if static lib is preferred.
+                if(ZLIB_STATIC_LIBRARY OR ZLIB_USE_STATIC_LIBS)
+                    if(WIN32)
+                        set(CMAKE_FIND_LIBRARY_SUFFIXES .lib .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
+                    else()
+                        set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
+                    endif()
+                endif()
+            endif()
+        endif()
+
+        if (@ZLIB_VERSION@) # @ZLIB_VERSION@
+            find_dependency(ZLIB @ZLIB_VERSION@)
+        else()
+            find_dependency(ZLIB @ZLIB_VERSION_STRING@)
+        endif()
+
+        # Restore the original find library ordering
+        if(DEFINED _ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES)
+            set(CMAKE_FIND_LIBRARY_SUFFIXES "${_ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}")
+        else()
+            set(CMAKE_FIND_LIBRARY_SUFFIXES)
+        endif()
+
+        if(DEFINED _ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES)
+            set(CMAKE_FIND_LIBRARY_PREFIXES "${_ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES}")
+        else()
+            set(CMAKE_FIND_LIBRARY_PREFIXES)
+        endif()
+    endif()
+
+    if (NOT MINIZIP::minizip-ng)
+        find_dependency(minizip-ng @minizip-ng_VERSION@)
+    endif()
+
+    # Remove OCIO custom find module path.
+    list(REMOVE_AT CMAKE_MODULE_PATH -1)
+
+    if (APPLE)
+        # Restore CMAKE_FIND_FRAMEWORK and CMAKE_FIND_APPBUNDLE values.
+        set(CMAKE_FIND_FRAMEWORK ${_PREVIOUS_CMAKE_FIND_FRAMEWORK})
+        set(CMAKE_FIND_APPBUNDLE ${_PREVIOUS_CMAKE_FIND_APPBUNDLE})
+    endif()
+endif()
+
 include(${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake)
 
 include(FindPackageHandleStandardArgs)

From 41afaecdccc0f9531fc09ab27affa84c045827c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Achard?= <remiachard@gmail.com>
Date: Fri, 27 Jan 2023 17:31:08 +0000
Subject: [PATCH 47/81] Update macOS runners to 10.11, bump actions version
 (#1714)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rémi Achard <remiachard@gmail.com>
Co-authored-by: Michael Dolan <michdolan@gmail.com>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 .github/workflows/analysis_workflow.yml | 12 +++++-----
 .github/workflows/ci_workflow.yml       | 14 ++++++------
 .github/workflows/wheel_workflow.yml    | 30 ++++++++++++-------------
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/analysis_workflow.yml b/.github/workflows/analysis_workflow.yml
index 3b7443da57..6c9c32cf8c 100644
--- a/.github/workflows/analysis_workflow.yml
+++ b/.github/workflows/analysis_workflow.yml
@@ -90,7 +90,7 @@ jobs:
       CC: ${{ matrix.cc-compiler }}
     steps:
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       - name: Install docs env
         run: share/ci/scripts/linux/yum/install_docs_env.sh
         if: matrix.build-docs == 'ON'
@@ -189,11 +189,11 @@ jobs:
             python-version: 3.9
     steps:
       - name: Setup Python
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       - name: Install docs env
         run: share/ci/scripts/macos/install_docs_env.sh
         if: matrix.build-docs == 'ON'
@@ -295,11 +295,11 @@ jobs:
             python-version: 3.9
     steps:
       - name: Setup Python
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       - name: Install docs env
         run: share/ci/scripts/windows/install_docs_env.sh
         shell: bash
@@ -418,7 +418,7 @@ jobs:
       CC: gcc
     steps:
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
         with:
           fetch-depth: 50
       - name: Install docs env
diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml
index 77527cd723..ff87ed3c47 100644
--- a/.github/workflows/ci_workflow.yml
+++ b/.github/workflows/ci_workflow.yml
@@ -251,7 +251,7 @@ jobs:
       CC: ${{ matrix.cc-compiler }}
     steps:
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       # minizip-ng requires CMake 3.13+ but VFX2019 image ships with 3.12
       - name: Upgrade VFX2019 CMake
         run: pip install cmake==3.13.3
@@ -336,7 +336,7 @@ jobs:
   # ---------------------------------------------------------------------------
 
   macos:
-    name: 'macOS 10.15 
+    name: 'macOS 11
       <AppleClang 11.0 
        config=${{ matrix.build-type }}, 
        shared=${{ matrix.build-shared }}, 
@@ -349,7 +349,7 @@ jobs:
     if: |
       github.event_name == 'push' ||
       github.event.pull_request.head.repo.full_name != github.repository
-    runs-on: macos-10.15
+    runs-on: macos-11
     strategy:
       matrix:
         build: [1, 2, 3, 4, 5, 6]
@@ -417,11 +417,11 @@ jobs:
             python-version: 2.7
     steps:
       - name: Setup Python
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       - name: Install docs env
         run: share/ci/scripts/macos/install_docs_env.sh
         if: matrix.build-docs == 'ON'
@@ -585,11 +585,11 @@ jobs:
             python-version: 2.7
     steps:
       - name: Setup Python
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       - name: Install docs env
         run: share/ci/scripts/windows/install_docs_env.sh
         shell: bash
diff --git a/.github/workflows/wheel_workflow.yml b/.github/workflows/wheel_workflow.yml
index 9db00dec80..737a736019 100644
--- a/.github/workflows/wheel_workflow.yml
+++ b/.github/workflows/wheel_workflow.yml
@@ -51,7 +51,7 @@ jobs:
 
     steps:
 
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
 
     - name: Build SDist
       run: pipx run build --sdist
@@ -59,7 +59,7 @@ jobs:
     - name: Check metadata
       run: pipx run twine check dist/*
 
-    - uses: actions/upload-artifact@v2
+    - uses: actions/upload-artifact@v3
       with:
         path: dist/*.tar.gz
 
@@ -115,15 +115,15 @@ jobs:
             arch: aarch64
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v4
         name: Install Python
         with:
           python-version: '3.8'
 
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v1
+        uses: docker/setup-qemu-action@v2
         with:
           platforms: all
 
@@ -133,7 +133,7 @@ jobs:
           CIBW_BUILD: ${{ matrix.python }}
           CIBW_ARCHS: ${{ matrix.arch }}
 
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         with:
           path: ./wheelhouse/*.whl
 
@@ -143,7 +143,7 @@ jobs:
 
   macos:
     name: Build wheels on macOS
-    runs-on: macos-10.15
+    runs-on: macos-11
     # Don't run on OCIO forks
     if: |
       github.event_name != 'schedule' ||
@@ -186,9 +186,9 @@ jobs:
             arch: arm64
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v4
         name: Install Python
         with:
           python-version: '3.8'
@@ -199,7 +199,7 @@ jobs:
           CIBW_BUILD: ${{ matrix.python }}
           CIBW_ARCHS: ${{ matrix.arch }}
 
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         with:
           path: ./wheelhouse/*.whl
 
@@ -237,9 +237,9 @@ jobs:
             arch: AMD64
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v4
         name: Install Python
         with:
           python-version: '3.8'
@@ -250,7 +250,7 @@ jobs:
           CIBW_BUILD: ${{ matrix.python }}
           CIBW_ARCHS: ${{ matrix.arch }}
 
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         with:
           path: ./wheelhouse/*.whl
 
@@ -260,9 +260,9 @@ jobs:
     runs-on: ubuntu-latest
     if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
     steps:
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v4
 
-      - uses: actions/download-artifact@v2
+      - uses: actions/download-artifact@v3
         with:
           name: artifact
           path: dist

From 0221599023c5b47bb13575d8fa828bbb1236f419 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Achard?= <remiachard@gmail.com>
Date: Thu, 9 Mar 2023 03:07:11 +0000
Subject: [PATCH 48/81] Update CI workflows (#1770)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Update CI workflows

Signed-off-by: Rémi Achard <remiachard@gmail.com>

* Test building OpenEXR single threaded

Signed-off-by: Rémi Achard <remiachard@gmail.com>

* Add comment on disabling parallel build

Signed-off-by: Rémi Achard <remiachard@gmail.com>

* Update python_requires

Signed-off-by: Rémi Achard <remiachard@gmail.com>

* Remove comments around CI jobs

Signed-off-by: Rémi Achard <remiachard@gmail.com>

---------

Signed-off-by: Rémi Achard <remiachard@gmail.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 .github/workflows/analysis_workflow.yml       | 375 -----------------
 .github/workflows/ci_workflow.yml             | 208 ++-------
 .github/workflows/dependencies_latest.yml     | 388 +++++++++++++++++
 .github/workflows/platform_latest.yml         | 396 ++++++++++++++++++
 .github/workflows/wheel_workflow.yml          |   6 +-
 CMakeLists.txt                                |  14 +-
 pyproject.toml                                |   2 +-
 setup.cfg                                     |   4 +-
 share/cmake/modules/FindExtPackages.cmake     |   2 +-
 share/cmake/modules/FindOpenEXR.cmake         |   3 +-
 share/cmake/projects/Buildlcms2.cmake         |   5 +-
 share/cmake/projects/Buildpystring.cmake      |   5 +-
 share/cmake/utils/CompilerFlags.cmake         |  41 +-
 src/OpenColorIO/CMakeLists.txt                |  25 +-
 src/apps/ocioarchive/CMakeLists.txt           |   4 +-
 src/apps/ociobakelut/CMakeLists.txt           |   5 +-
 src/apps/ociocheck/CMakeLists.txt             |   6 +-
 src/apps/ociochecklut/CMakeLists.txt          |   6 +-
 src/apps/ocioconvert/CMakeLists.txt           |   3 +-
 src/apps/ociodisplay/CMakeLists.txt           |   7 +-
 src/apps/ociolutimage/CMakeLists.txt          |   4 +-
 src/apps/ociomakeclf/CMakeLists.txt           |   6 +-
 src/apps/ocioperf/CMakeLists.txt              |   4 +-
 src/apps/ociowrite/CMakeLists.txt             |   4 +-
 src/apputils/CMakeLists.txt                   |   3 +-
 src/bindings/python/CMakeLists.txt            |  26 +-
 src/libutils/imageioapphelpers/CMakeLists.txt |   3 +-
 src/libutils/oglapphelpers/CMakeLists.txt     |   7 +-
 src/libutils/oglapphelpers/msl.mm             |   2 +-
 tests/cmake-consumer/CMakeLists.txt.in        |  10 +-
 tests/cpu/CMakeLists.txt                      |  16 +-
 tests/cpu/Platform_tests.cpp                  |   4 +-
 tests/cpu/UnitTestUtils.cpp                   |   5 +-
 .../xmlutils/XMLReaderUtils_tests.cpp         |   7 +-
 tests/gpu/CMakeLists.txt                      |   4 +-
 tests/osl/CMakeLists.txt                      |   4 +-
 tests/testutils/CMakeLists.txt                |   5 +-
 tests/utils/CMakeLists.txt                    |   4 +-
 vendor/openfx/CMakeLists.txt                  |   7 +-
 39 files changed, 990 insertions(+), 640 deletions(-)
 create mode 100644 .github/workflows/dependencies_latest.yml
 create mode 100644 .github/workflows/platform_latest.yml

diff --git a/.github/workflows/analysis_workflow.yml b/.github/workflows/analysis_workflow.yml
index 6c9c32cf8c..56a243bcd0 100644
--- a/.github/workflows/analysis_workflow.yml
+++ b/.github/workflows/analysis_workflow.yml
@@ -22,381 +22,6 @@ on:
     - cron: "0 0 * * *"
 
 jobs:
-  # ---------------------------------------------------------------------------
-  # Linux latest ext packages
-  # ---------------------------------------------------------------------------
-
-  linux_latest:
-    name: 'Linux CentOS 7 VFX CY${{ matrix.vfx-cy }} latest 
-      <${{ matrix.compiler-desc }} 
-       cxx=${{ matrix.cxx-standard }}, 
-       docs=${{ matrix.build-docs }}>'
-    # Don't run on OCIO forks
-    if: github.repository == 'AcademySoftwareFoundation/OpenColorIO'
-    # GH-hosted VM. The build runs in CentOS 7 'container' defined below.
-    runs-on: ubuntu-latest
-    container:
-      # DockerHub: https://hub.docker.com/u/aswf
-      # Source: https://github.com/AcademySoftwareFoundation/aswf-docker
-      image: aswf/ci-base:${{ matrix.vfx-cy }}
-    strategy:
-      matrix:
-        build: [1, 2, 3, 4]
-        include:
-          # -------------------------------------------------------------------
-          # GCC
-          # -------------------------------------------------------------------
-          # VFX CY2022, C++17, docs, OpenFX
-          - build: 1
-            build-docs: 'ON'
-            build-openfx: 'ON'
-            cxx-standard: 17
-            cxx-compiler: g++
-            cc-compiler: gcc
-            compiler-desc: GCC 9.3.1
-            vfx-cy: 2022
-          # VFX CY2021, C++14
-          - build: 2
-            build-docs: 'OFF'
-            build-openfx: 'OFF'
-            cxx-standard: 14
-            cxx-compiler: g++
-            cc-compiler: gcc
-            compiler-desc: GCC 9.3.1
-            vfx-cy: 2021
-          # -------------------------------------------------------------------
-          # Clang
-          # -------------------------------------------------------------------
-          # VFX CY2022, C++17
-          - build: 3
-            build-docs: 'OFF'
-            build-openfx: 'OFF'
-            cxx-standard: 17
-            cxx-compiler: clang++
-            cc-compiler: clang
-            compiler-desc: Clang 9
-            vfx-cy: 2022
-          # VFX CY2021, C++14, docs, OpenFX
-          - build: 4
-            build-docs: 'ON'
-            build-openfx: 'ON'
-            cxx-standard: 14
-            cxx-compiler: clang++
-            cc-compiler: clang
-            compiler-desc: Clang 9
-            vfx-cy: 2021
-    env:
-      CXX: ${{ matrix.cxx-compiler }}
-      CC: ${{ matrix.cc-compiler }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-      - name: Install docs env
-        run: share/ci/scripts/linux/yum/install_docs_env.sh
-        if: matrix.build-docs == 'ON'
-      - name: Install tests env
-        run: share/ci/scripts/linux/yum/install_tests_env.sh
-      - name: Setup ext environment
-        run: |
-          EXT_PATH=/usr/local
-          echo "EXT_PATH=$EXT_PATH" >> $GITHUB_ENV
-      - name: Install indirect dependencies
-        run: |
-          share/ci/scripts/multi/install_pugixml.sh latest
-      - name: Install fixed ext package versions
-        # Minizip-ng depends on ZLIB. ZLIB must be installed first.
-        run: |
-          share/ci/scripts/multi/install_expat.sh 2.4.1 $EXT_PATH
-          share/ci/scripts/multi/install_lcms2.sh 2.2 $EXT_PATH
-          share/ci/scripts/multi/install_yaml-cpp.sh 0.7.0 $EXT_PATH
-          share/ci/scripts/multi/install_pystring.sh 1.1.3 $EXT_PATH
-          share/ci/scripts/multi/install_pybind11.sh 2.9.2 $EXT_PATH
-          share/ci/scripts/multi/install_zlib.sh 1.2.12 $EXT_PATH
-          share/ci/scripts/multi/install_minizip-ng.sh 3.0.6 $EXT_PATH
-      - name: Install latest ext package versions
-        run: |
-          share/ci/scripts/multi/install_imath.sh latest $EXT_PATH
-          share/ci/scripts/multi/install_openexr.sh latest $EXT_PATH
-          share/ci/scripts/multi/install_oiio.sh latest $EXT_PATH
-          share/ci/scripts/multi/install_osl.sh latest $EXT_PATH
-          share/ci/scripts/multi/install_openfx.sh latest $EXT_PATH
-      - name: Create build directories
-        run: |
-          mkdir _install
-          mkdir _build
-      - name: Configure
-        run: |
-          cmake ../. \
-                -DCMAKE_INSTALL_PREFIX=../_install \
-                -DCMAKE_BUILD_TYPE=Release \
-                -DCMAKE_CXX_STANDARD=${{ matrix.cxx-standard }} \
-                -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \
-                -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
-                -DOCIO_BUILD_GPU_TESTS=OFF \
-                -DOCIO_INSTALL_EXT_PACKAGES=NONE \
-                -DOCIO_WARNING_AS_ERROR=OFF \
-                -DPython_EXECUTABLE=$(which python) \
-                -DOCIO_USE_OIIO_CMAKE_CONFIG=ON
-        working-directory: _build
-      - name: Build
-        run: |
-          cmake --build . \
-                --target install \
-                --config Release \
-                -- -j$(nproc)
-        working-directory: _build
-      - name: Test
-        run: ctest -V -C Release
-        working-directory: _build
-      - name: Test CMake Consumer
-        run: |
-          cmake . \
-                -DCMAKE_PREFIX_PATH=../../../_install \
-                -DCMAKE_BUILD_TYPE=Release
-          cmake --build . \
-                --config Release
-          ./consumer
-        working-directory: _build/tests/cmake-consumer-dist
-
-  # ---------------------------------------------------------------------------
-  # MacOS latest ext packages
-  # ---------------------------------------------------------------------------
-
-  macos-latest:
-    name: 'macOS latest 
-      <AppleClang 12.0 
-       cxx=${{ matrix.cxx-standard }}, 
-       docs=${{ matrix.build-docs }}, 
-       python=${{ matrix.python-version }}>'
-    # Don't run on OCIO forks
-    if: github.repository == 'AcademySoftwareFoundation/OpenColorIO'
-    runs-on: macos-latest
-    strategy:
-      matrix:
-        build: [1, 2]
-        include:
-          # C++17
-          - build: 1
-            build-docs: 'ON'
-            build-openfx: 'ON'
-            cxx-standard: 17
-            python-version: 3.9
-          # C++14
-          - build: 2
-            build-docs: 'ON'
-            build-openfx: 'ON'
-            cxx-standard: 14
-            python-version: 3.9
-    steps:
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Checkout
-        uses: actions/checkout@v3
-      - name: Install docs env
-        run: share/ci/scripts/macos/install_docs_env.sh
-        if: matrix.build-docs == 'ON'
-      - name: Install tests env
-        run: share/ci/scripts/macos/install_tests_env.sh
-      - name: Setup ext environment
-        run: |
-          EXT_PATH=/usr/local
-          echo "EXT_PATH=$EXT_PATH" >> $GITHUB_ENV
-      - name: Install indirect dependencies
-        run: |
-          share/ci/scripts/macos/install_bison.sh latest
-          share/ci/scripts/macos/install_boost.sh latest
-          share/ci/scripts/multi/install_pugixml.sh latest $EXT_PATH
-      - name: Install fixed ext package versions
-        # Minizip-ng depends on ZLIB. ZLIB must be installed first.
-        run: |
-          share/ci/scripts/multi/install_expat.sh 2.4.1 $EXT_PATH
-          share/ci/scripts/multi/install_lcms2.sh 2.2 $EXT_PATH
-          share/ci/scripts/multi/install_yaml-cpp.sh 0.7.0 $EXT_PATH
-          share/ci/scripts/multi/install_pystring.sh 1.1.3 $EXT_PATH
-          share/ci/scripts/multi/install_pybind11.sh 2.9.2 $EXT_PATH
-          share/ci/scripts/multi/install_zlib.sh 1.2.12 $EXT_PATH
-          share/ci/scripts/multi/install_minizip-ng.sh 3.0.6 $EXT_PATH
-      - name: Install latest ext package versions
-        run: |
-          share/ci/scripts/multi/install_imath.sh latest $EXT_PATH
-          share/ci/scripts/multi/install_openexr.sh latest $EXT_PATH
-          share/ci/scripts/multi/install_oiio.sh latest $EXT_PATH
-          share/ci/scripts/multi/install_osl.sh latest $EXT_PATH
-          share/ci/scripts/multi/install_openfx.sh latest $EXT_PATH
-      - name: Create build directories
-        run: |
-          mkdir _install
-          mkdir _build
-      - name: Configure
-        run: |
-          cmake ../. \
-                -DCMAKE_INSTALL_PREFIX=../_install \
-                -DCMAKE_BUILD_TYPE=Release \
-                -DCMAKE_CXX_STANDARD=${{ matrix.cxx-standard }} \
-                -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \
-                -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
-                -DOCIO_BUILD_GPU_TESTS=OFF \
-                -DOCIO_INSTALL_EXT_PACKAGES=NONE \
-                -DOCIO_WARNING_AS_ERROR=OFF \
-                -DPython_EXECUTABLE=$(which python) \
-                -DOCIO_USE_OIIO_CMAKE_CONFIG=ON
-        working-directory: _build
-      - name: Build
-        run: |
-          cmake --build . \
-                --target install \
-                --config Release \
-                -- -j$(sysctl -n hw.ncpu)
-        working-directory: _build
-      - name: Test
-        run: ctest -V -C Release
-        working-directory: _build
-      - name: Test CMake Consumer
-        run: |
-          cmake . \
-                -DCMAKE_PREFIX_PATH=../../../_install \
-                -DCMAKE_BUILD_TYPE=Release
-          cmake --build . \
-                --config Release
-          ./consumer
-        working-directory: _build/tests/cmake-consumer-dist
-
-
-  # ---------------------------------------------------------------------------
-  # Windows latest ext packages
-  # ---------------------------------------------------------------------------
-
-  windows-latest:
-    name: 'Windows latest 
-      <MSVC 
-       cxx=${{ matrix.cxx-standard }}, 
-       docs=${{ matrix.build-docs }}, 
-       python=${{ matrix.python-version }}>'
-    # Don't run on OCIO forks
-    if: github.repository == 'AcademySoftwareFoundation/OpenColorIO'
-    runs-on: windows-latest
-    strategy:
-      matrix:
-        build: [1, 2]
-        include:
-          # C++17
-          - build: 1
-            build-docs: 'ON'
-            build-openfx: 'ON'
-            cxx-standard: 17
-            python-version: 3.9
-          # C++14
-          - build: 2
-            build-docs: 'ON'
-            build-openfx: 'ON'
-            cxx-standard: 14
-            python-version: 3.9
-    steps:
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Checkout
-        uses: actions/checkout@v3
-      - name: Install docs env
-        run: share/ci/scripts/windows/install_docs_env.sh
-        shell: bash
-        if: matrix.build-docs == 'ON'
-      - name: Install tests env
-        run: share/ci/scripts/windows/install_tests_env.sh
-        shell: bash
-      - name: Setup ext environment
-        run: |
-          EXT_PATH=$GITHUB_WORKSPACE/_ext
-          VCPKG_ROOT=$VCPKG_INSTALLATION_ROOT/installed/x64-windows
-          echo "EXT_PATH=$EXT_PATH" >> $GITHUB_ENV
-          echo "CMAKE_PREFIX_PATH=$VCPKG_ROOT;$EXT_PATH" >> $GITHUB_ENV
-          echo "$VCPKG_ROOT/bin" >> $GITHUB_PATH
-          echo "$EXT_PATH/bin" >> $GITHUB_PATH
-          mkdir $EXT_PATH
-        shell: bash
-      - name: Install indirect dependencies
-        run: |
-          vcpkg install zlib:x64-windows
-          vcpkg install tiff:x64-windows
-          vcpkg install boost-asio:x64-windows
-          vcpkg install boost-container:x64-windows
-          vcpkg install boost-filesystem:x64-windows
-          vcpkg install boost-math:x64-windows
-          vcpkg install boost-stacktrace:x64-windows
-          vcpkg install boost-system:x64-windows
-          vcpkg install boost-thread:x64-windows
-          share/ci/scripts/multi/install_pugixml.sh latest $EXT_PATH
-        shell: bash
-      - name: Install fixed ext package versions
-        # Minizip-ng depends on ZLIB. ZLIB must be installed first.
-        run: |
-          share/ci/scripts/multi/install_lcms2.sh 2.2 $EXT_PATH
-          share/ci/scripts/multi/install_yaml-cpp.sh 0.7.0 $EXT_PATH
-          share/ci/scripts/multi/install_pystring.sh 1.1.3 $EXT_PATH
-          share/ci/scripts/multi/install_pybind11.sh 2.9.2 $EXT_PATH
-          share/ci/scripts/multi/install_expat.sh 2.4.1 $EXT_PATH
-          share/ci/scripts/multi/install_zlib.sh 1.2.12 $EXT_PATH
-          share/ci/scripts/multi/install_minizip-ng.sh 3.0.6 $EXT_PATH
-        shell: bash
-      # OSL not installed due to LLVM compilation time.
-      - name: Install latest ext package versions
-        run: |
-          share/ci/scripts/multi/install_imath.sh latest $EXT_PATH
-          share/ci/scripts/multi/install_openexr.sh latest $EXT_PATH
-          share/ci/scripts/multi/install_oiio.sh latest $EXT_PATH
-          share/ci/scripts/multi/install_openfx.sh latest $EXT_PATH
-        shell: bash
-      - name: Create build directories
-        run: |
-          mkdir _install
-          mkdir _build
-        shell: bash
-      - name: Configure
-        run: |
-          cmake ../. \
-                -DCMAKE_INSTALL_PREFIX=../_install \
-                -DCMAKE_BUILD_TYPE=Release \
-                -DCMAKE_CXX_STANDARD=${{ matrix.cxx-standard }} \
-                -DCMAKE_GENERATOR_PLATFORM=x64 \
-                -DOCIO_BUILD_DOCS=OFF \
-                -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
-                -DOCIO_BUILD_GPU_TESTS=OFF \
-                -DOCIO_INSTALL_EXT_PACKAGES=NONE \
-                -DOCIO_WARNING_AS_ERROR=OFF \
-                -DPython_EXECUTABLE=$(which python) \
-                -DOCIO_BUILD_PYTHON=OFF \
-                -DOCIO_USE_OIIO_CMAKE_CONFIG=ON
-        shell: bash
-        working-directory: _build
-      - name: Build
-        run: |
-          cmake --build . \
-                --target install \
-                --config Release \
-                --parallel
-        shell: bash
-        working-directory: _build
-      - name: Test
-        run: |
-          ctest -V -C Release
-        shell: bash
-        working-directory: _build
-      - name: Test CMake Consumer
-        run: |
-          cmake . \
-                -DCMAKE_PREFIX_PATH=../../../_install \
-                -DCMAKE_BUILD_TYPE=Release
-          cmake --build . \
-                --config Release
-          export PATH=../../../_install/bin:$PATH
-          ./Release/consumer
-        shell: bash
-        working-directory: _build/tests/cmake-consumer-dist
-
-
   # ---------------------------------------------------------------------------
   # SonarCloud static analysis
   # ---------------------------------------------------------------------------
diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml
index ff87ed3c47..bf98506e1a 100644
--- a/.github/workflows/ci_workflow.yml
+++ b/.github/workflows/ci_workflow.yml
@@ -60,13 +60,12 @@ jobs:
       image: aswf/ci-ocio:${{ matrix.vfx-cy }}
     strategy:
       matrix:
-        build: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
+        build: [1, 2, 3, 4, 5, 6, 7, 8, 9]
         include:
           # -------------------------------------------------------------------
           # VFX CY2022 (Python 3.9)
           # -------------------------------------------------------------------
-          # C++17, Clang, Debug, OpenFX
-          - build: 13
+          - build: 9
             build-type: Debug
             build-shared: 'ON'
             build-docs: 'OFF'
@@ -76,36 +75,21 @@ jobs:
             cxx-standard: 17
             cxx-compiler: clang++
             cc-compiler: clang
-            compiler-desc: Clang 9
+            compiler-desc: Clang
             vfx-cy: 2022
-          # C++17, GCC, no SSE, OpenFX
-          - build: 12
+          - build: 8
             build-type: Release
             build-shared: 'ON'
-            build-docs: 'OFF'
+            build-docs: 'ON'
             build-openfx: 'ON'
             use-sse: 'OFF'
             use-oiio: 'OFF'
             cxx-standard: 17
             cxx-compiler: g++
             cc-compiler: gcc
-            compiler-desc: GCC 9.3.1
+            compiler-desc: GCC
             vfx-cy: 2022
-          # C++14, GCC, static, docs
-          - build: 11
-            build-type: Release
-            build-shared: 'OFF'
-            build-docs: 'ON'
-            build-openfx: 'OFF'
-            use-sse: 'ON'
-            use-oiio: 'OFF'
-            cxx-standard: 14
-            cxx-compiler: g++
-            cc-compiler: gcc
-            compiler-desc: GCC 9.3.1
-            vfx-cy: 2022
-          # C++11, GCC, static
-          - build: 10
+          - build: 7
             build-type: Release
             build-shared: 'OFF'
             build-docs: 'OFF'
@@ -115,13 +99,12 @@ jobs:
             cxx-standard: 11
             cxx-compiler: g++
             cc-compiler: gcc
-            compiler-desc: GCC 9.3.1
+            compiler-desc: GCC
             vfx-cy: 2022
           # -------------------------------------------------------------------
           # VFX CY2021 (Python 3.7)
           # -------------------------------------------------------------------
-          # C++17, Clang
-          - build: 9
+          - build: 6
             build-type: Release
             build-shared: 'ON'
             build-docs: 'OFF'
@@ -131,10 +114,9 @@ jobs:
             cxx-standard: 17
             cxx-compiler: clang++
             cc-compiler: clang
-            compiler-desc: Clang 9
+            compiler-desc: Clang
             vfx-cy: 2021
-          # C++14, Clang, static, no SSE, OpenFX
-          - build: 8
+          - build: 5
             build-type: Release
             build-shared: 'OFF'
             build-docs: 'OFF'
@@ -144,10 +126,9 @@ jobs:
             cxx-standard: 14
             cxx-compiler: clang++
             cc-compiler: clang
-            compiler-desc: Clang 9
+            compiler-desc: Clang
             vfx-cy: 2021
-          # C++11, GCC, Debug
-          - build: 7
+          - build: 4
             build-type: Debug
             build-shared: 'ON'
             build-docs: 'OFF'
@@ -157,13 +138,12 @@ jobs:
             cxx-standard: 11
             cxx-compiler: g++
             cc-compiler: gcc
-            compiler-desc: GCC 9.3.1
+            compiler-desc: GCC
             vfx-cy: 2021
           # -------------------------------------------------------------------
           # VFX CY2020 (Python 3.7)
           # -------------------------------------------------------------------
-          # C++14, Clang, OpenFX
-          - build: 6
+          - build: 3
             build-type: Release
             build-shared: 'ON'
             build-docs: 'OFF'
@@ -173,10 +153,9 @@ jobs:
             cxx-standard: 14
             cxx-compiler: clang++
             cc-compiler: clang
-            compiler-desc: Clang 7
+            compiler-desc: Clang
             vfx-cy: 2020
-          # C++14, GCC, static, Debug, OpenFX
-          - build: 5
+          - build: 2
             build-type: Debug
             build-shared: 'OFF'
             build-docs: 'OFF'
@@ -186,10 +165,9 @@ jobs:
             cxx-standard: 14
             cxx-compiler: g++
             cc-compiler: gcc
-            compiler-desc: GCC 6.3.1
+            compiler-desc: GCC
             vfx-cy: 2020
-          # C++11, GCC, no SSE
-          - build: 4
+          - build: 1
             build-type: Release
             build-shared: 'ON'
             build-docs: 'OFF'
@@ -199,63 +177,14 @@ jobs:
             cxx-standard: 11
             cxx-compiler: g++
             cc-compiler: gcc
-            compiler-desc: GCC 6.3.1
+            compiler-desc: GCC
             vfx-cy: 2020
-          # -------------------------------------------------------------------
-          # VFX CY2019 (Python 2.7)
-          # -------------------------------------------------------------------
-          # C++11, Clang, static
-          - build: 3
-            build-type: Release
-            build-shared: 'OFF'
-            # Doc build requires Python 3
-            build-docs: 'OFF'
-            build-openfx: 'OFF'
-            use-sse: 'ON'
-            use-oiio: 'ON'
-            cxx-standard: 11
-            cxx-compiler: clang++
-            cc-compiler: clang
-            compiler-desc: Clang 7
-            vfx-cy: 2019
-          # C++11, Clang, Debug, no SSE
-          - build: 2
-            build-type: Debug
-            build-shared: 'ON'
-            # Doc build requires Python 3
-            build-docs: 'OFF'
-            build-openfx: 'OFF'
-            use-sse: 'OFF'
-            use-oiio: 'OFF'
-            cxx-standard: 11
-            cxx-compiler: clang++
-            cc-compiler: clang
-            compiler-desc: Clang 7
-            vfx-cy: 2019
-          # C++11, GCC, OpenFX
-          - build: 1
-            build-type: Release
-            build-shared: 'ON'
-            # Doc build requires Python 3
-            build-docs: 'OFF'
-            build-openfx: 'ON'
-            use-sse: 'ON'
-            use-oiio: 'OFF'
-            cxx-standard: 11
-            cxx-compiler: g++
-            cc-compiler: gcc
-            compiler-desc: GCC 6.3.1
-            vfx-cy: 2019
     env:
       CXX: ${{ matrix.cxx-compiler }}
       CC: ${{ matrix.cc-compiler }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
-      # minizip-ng requires CMake 3.13+ but VFX2019 image ships with 3.12
-      - name: Upgrade VFX2019 CMake
-        run: pip install cmake==3.13.3
-        if: matrix.vfx-cy == '2019'
       - name: Install docs env
         run: share/ci/scripts/linux/yum/install_docs_env.sh
         if: matrix.build-docs == 'ON'
@@ -337,7 +266,7 @@ jobs:
 
   macos:
     name: 'macOS 11
-      <AppleClang 11.0 
+      <AppleClang 
        config=${{ matrix.build-type }}, 
        shared=${{ matrix.build-shared }}, 
        sse=${{ matrix.use-sse }}, 
@@ -352,10 +281,9 @@ jobs:
     runs-on: macos-11
     strategy:
       matrix:
-        build: [1, 2, 3, 4, 5, 6]
+        build: [1, 2, 3, 4]
         include:
-          # C++17, OpenFX
-          - build: 6
+          - build: 4
             build-type: Release
             build-shared: 'ON'
             build-docs: 'OFF'
@@ -363,19 +291,17 @@ jobs:
             use-sse: 'ON'
             use-oiio: 'ON'
             cxx-standard: 17
-            python-version: 3.9
-          # C++11, Python 3.7
-          - build: 5
+            python-version: '3.11'
+          - build: 3
             build-type: Release
             build-shared: 'ON'
             build-docs: 'ON'
             build-openfx: 'OFF'
-            use-sse: 'ON'
+            use-sse: 'OFF'
             use-oiio: 'OFF'
             cxx-standard: 11
-            python-version: 3.7
-          # Debug, OpenFX
-          - build: 4
+            python-version: '3.10'
+          - build: 2
             build-type: Debug
             build-shared: 'ON'
             build-docs: 'OFF'
@@ -383,38 +309,16 @@ jobs:
             use-sse: 'ON'
             use-oiio: 'OFF'
             cxx-standard: 11
-            python-version: 3.7
-          # C++14, OpenEXR, OpenFX
-          - build: 3
+            python-version: '3.9'
+          - build: 1
             build-type: Release
-            build-shared: 'ON'
+            build-shared: 'OFF'
             build-docs: 'OFF'
             build-openfx: 'ON'
             use-sse: 'ON'
             use-oiio: 'OFF'
             cxx-standard: 14
-            python-version: 3.7
-          # Static, no SSE
-          - build: 2
-            build-type: Release
-            build-shared: 'OFF'
-            build-docs: 'OFF'
-            build-openfx: 'OFF'
-            use-sse: 'OFF'
-            use-oiio: 'OFF'
-            cxx-standard: 11
-            python-version: 3.7
-          # Python 2.7
-          - build: 1
-            build-type: Release
-            build-shared: 'ON'
-            # Doc build requires Python 3
-            build-docs: 'OFF'
-            build-openfx: 'OFF'
-            use-sse: 'ON'
-            use-oiio: 'ON'
-            cxx-standard: 11
-            python-version: 2.7
+            python-version: '3.7'
     steps:
       - name: Setup Python
         uses: actions/setup-python@v4
@@ -505,7 +409,7 @@ jobs:
 
   windows:
     name: 'Windows 2019 
-      <MSVC 16.4 
+      <MSVC 
        config=${{ matrix.build-type }}, 
        shared=${{ matrix.build-shared }}, 
        sse=${{ matrix.use-sse }}, 
@@ -520,30 +424,27 @@ jobs:
     runs-on: windows-2019
     strategy:
       matrix:
-        build: [1, 2, 3, 4, 5, 6]
+        build: [1, 2, 3, 4]
         include:
-          # C++17, OpenFX
-          - build: 6
+          - build: 4
             build-type: Release
             build-shared: 'ON'
             build-docs: 'OFF'
             build-openfx: 'ON'
-            use-sse: 'ON'
+            use-sse: 'OFF'
             use-oiio: 'ON'
             cxx-standard: 17
-            python-version: 3.9
-          # C++11, Python 3.7
-          - build: 5
+            python-version: '3.11'
+          - build: 3
             build-type: Release
-            build-shared: 'ON'
+            build-shared: 'OFF'
             build-docs: 'OFF'
             build-openfx: 'OFF'
             use-sse: 'ON'
             use-oiio: 'OFF'
             cxx-standard: 11
-            python-version: 3.7
-          # Debug
-          - build: 4
+            python-version: '3.9'
+          - build: 2
             build-type: Debug
             build-shared: 'ON'
             build-docs: 'OFF'
@@ -551,9 +452,9 @@ jobs:
             use-sse: 'ON'
             use-oiio: 'OFF'
             cxx-standard: 11
-            python-version: 3.7
+            python-version: '3.8'
           # C++14, OpenEXR, OpenFX
-          - build: 3
+          - build: 1
             build-type: Release
             build-shared: 'ON'
             build-docs: 'OFF'
@@ -561,28 +462,7 @@ jobs:
             use-sse: 'ON'
             use-oiio: 'OFF'
             cxx-standard: 14
-            python-version: 3.7
-          # Static, no SSE, OpenFX
-          - build: 2
-            build-type: Release
-            build-shared: 'OFF'
-            build-docs: 'OFF'
-            build-openfx: 'ON'
-            use-sse: 'OFF'
-            use-oiio: 'OFF'
-            cxx-standard: 11
-            python-version: 3.7
-          # Python 2.7
-          - build: 1
-            build-type: Release
-            build-shared: 'ON'
-            # Doc build requires Python 3
-            build-docs: 'OFF'
-            build-openfx: 'OFF'
-            use-sse: 'ON'
-            use-oiio: 'ON'
-            cxx-standard: 11
-            python-version: 2.7
+            python-version: '3.7'
     steps:
       - name: Setup Python
         uses: actions/setup-python@v4
diff --git a/.github/workflows/dependencies_latest.yml b/.github/workflows/dependencies_latest.yml
new file mode 100644
index 0000000000..fe729a4fec
--- /dev/null
+++ b/.github/workflows/dependencies_latest.yml
@@ -0,0 +1,388 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# GitHub Actions workflow file
+# https://help.github.com/en/actions/reference/workflow-syntax-for-github-actions
+
+name: Dependencies latest
+
+on:
+  pull_request:
+    branches-ignore:
+      - RB-0.*
+      - RB-1.*
+      - gh-pages
+    tags-ignore:
+      - v0.*
+      - v1.*
+    paths:
+      - .github/workflows/dependencies_latest.yml
+  schedule:
+    # Nightly build
+    - cron: "0 0 * * *"
+
+jobs:
+  # ---------------------------------------------------------------------------
+  # Linux latest ext packages
+  # ---------------------------------------------------------------------------
+
+  linux_latest:
+    name: 'Linux CentOS 7 VFX CY${{ matrix.vfx-cy }} latest 
+      <${{ matrix.compiler-desc }} 
+       cxx=${{ matrix.cxx-standard }}, 
+       docs=${{ matrix.build-docs }}>'
+    # Don't run on OCIO forks
+    if: github.repository == 'AcademySoftwareFoundation/OpenColorIO'
+    # GH-hosted VM. The build runs in CentOS 7 'container' defined below.
+    runs-on: ubuntu-latest
+    container:
+      # DockerHub: https://hub.docker.com/u/aswf
+      # Source: https://github.com/AcademySoftwareFoundation/aswf-docker
+      image: aswf/ci-base:${{ matrix.vfx-cy }}
+    strategy:
+      matrix:
+        build: [1, 2, 3, 4]
+        include:
+          # -------------------------------------------------------------------
+          # GCC
+          # -------------------------------------------------------------------
+          - build: 1
+            build-docs: 'ON'
+            build-openfx: 'ON'
+            cxx-standard: 17
+            cxx-compiler: g++
+            cc-compiler: gcc
+            compiler-desc: GCC
+            vfx-cy: 2022
+          - build: 2
+            build-docs: 'OFF'
+            build-openfx: 'OFF'
+            cxx-standard: 14
+            cxx-compiler: g++
+            cc-compiler: gcc
+            compiler-desc: GCC
+            vfx-cy: 2021
+          # -------------------------------------------------------------------
+          # Clang
+          # -------------------------------------------------------------------
+          - build: 3
+            build-docs: 'OFF'
+            build-openfx: 'OFF'
+            cxx-standard: 17
+            cxx-compiler: clang++
+            cc-compiler: clang
+            compiler-desc: Clang
+            vfx-cy: 2022
+          - build: 4
+            build-docs: 'ON'
+            build-openfx: 'ON'
+            cxx-standard: 14
+            cxx-compiler: clang++
+            cc-compiler: clang
+            compiler-desc: Clang
+            vfx-cy: 2021
+    env:
+      CXX: ${{ matrix.cxx-compiler }}
+      CC: ${{ matrix.cc-compiler }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Install docs env
+        run: share/ci/scripts/linux/yum/install_docs_env.sh
+        if: matrix.build-docs == 'ON'
+      - name: Install tests env
+        run: share/ci/scripts/linux/yum/install_tests_env.sh
+      - name: Setup ext environment
+        run: |
+          EXT_PATH=/usr/local
+          echo "EXT_PATH=$EXT_PATH" >> $GITHUB_ENV
+      - name: Install indirect dependencies
+        run: |
+          share/ci/scripts/multi/install_pugixml.sh latest
+      - name: Install fixed ext package versions
+        # Minizip-ng depends on ZLIB. ZLIB must be installed first.
+        run: |
+          share/ci/scripts/multi/install_expat.sh 2.4.1 $EXT_PATH
+          share/ci/scripts/multi/install_lcms2.sh 2.2 $EXT_PATH
+          share/ci/scripts/multi/install_yaml-cpp.sh 0.7.0 $EXT_PATH
+          share/ci/scripts/multi/install_pystring.sh 1.1.3 $EXT_PATH
+          share/ci/scripts/multi/install_pybind11.sh 2.9.2 $EXT_PATH
+          share/ci/scripts/multi/install_zlib.sh 1.2.12 $EXT_PATH
+          share/ci/scripts/multi/install_minizip-ng.sh 3.0.6 $EXT_PATH
+      - name: Install latest ext package versions
+        run: |
+          share/ci/scripts/multi/install_imath.sh latest $EXT_PATH
+          share/ci/scripts/multi/install_openexr.sh latest $EXT_PATH
+          share/ci/scripts/multi/install_oiio.sh latest $EXT_PATH
+          share/ci/scripts/multi/install_osl.sh latest $EXT_PATH
+          share/ci/scripts/multi/install_openfx.sh latest $EXT_PATH
+      - name: Create build directories
+        run: |
+          mkdir _install
+          mkdir _build
+      - name: Configure
+        run: |
+          cmake ../. \
+                -DCMAKE_INSTALL_PREFIX=../_install \
+                -DCMAKE_BUILD_TYPE=Release \
+                -DCMAKE_CXX_STANDARD=${{ matrix.cxx-standard }} \
+                -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \
+                -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
+                -DOCIO_BUILD_GPU_TESTS=OFF \
+                -DOCIO_INSTALL_EXT_PACKAGES=NONE \
+                -DOCIO_WARNING_AS_ERROR=OFF \
+                -DPython_EXECUTABLE=$(which python) \
+                -DOCIO_USE_OIIO_CMAKE_CONFIG=ON
+        working-directory: _build
+      - name: Build
+        run: |
+          cmake --build . \
+                --target install \
+                --config Release \
+                -- -j$(nproc)
+        working-directory: _build
+      - name: Test
+        run: ctest -V -C Release
+        working-directory: _build
+      - name: Test CMake Consumer
+        run: |
+          cmake . \
+                -DCMAKE_PREFIX_PATH=../../../_install \
+                -DCMAKE_BUILD_TYPE=Release
+          cmake --build . \
+                --config Release
+          ./consumer
+        working-directory: _build/tests/cmake-consumer-dist
+
+  # ---------------------------------------------------------------------------
+  # MacOS latest ext packages
+  # ---------------------------------------------------------------------------
+
+  macos-latest:
+    name: 'macOS latest 
+      <AppleClang 
+       cxx=${{ matrix.cxx-standard }}, 
+       docs=${{ matrix.build-docs }}, 
+       python=${{ matrix.python-version }}>'
+    # Don't run on OCIO forks
+    if: github.repository == 'AcademySoftwareFoundation/OpenColorIO'
+    runs-on: macos-latest
+    strategy:
+      matrix:
+        build: [1, 2]
+        include:
+          - build: 1
+            build-docs: 'ON'
+            build-openfx: 'ON'
+            cxx-standard: 17
+            python-version: '3.11'
+          - build: 2
+            build-docs: 'ON'
+            build-openfx: 'ON'
+            cxx-standard: 14
+            python-version: '3.9'
+    steps:
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Install docs env
+        run: share/ci/scripts/macos/install_docs_env.sh
+        if: matrix.build-docs == 'ON'
+      - name: Install tests env
+        run: share/ci/scripts/macos/install_tests_env.sh
+      - name: Setup ext environment
+        run: |
+          EXT_PATH=/usr/local
+          echo "EXT_PATH=$EXT_PATH" >> $GITHUB_ENV
+      - name: Install indirect dependencies
+        run: |
+          share/ci/scripts/macos/install_bison.sh latest
+          share/ci/scripts/macos/install_boost.sh latest
+          share/ci/scripts/multi/install_pugixml.sh latest $EXT_PATH
+      - name: Install fixed ext package versions
+        # Minizip-ng depends on ZLIB. ZLIB must be installed first.
+        run: |
+          share/ci/scripts/multi/install_expat.sh 2.4.1 $EXT_PATH
+          share/ci/scripts/multi/install_lcms2.sh 2.2 $EXT_PATH
+          share/ci/scripts/multi/install_yaml-cpp.sh 0.7.0 $EXT_PATH
+          share/ci/scripts/multi/install_pystring.sh 1.1.3 $EXT_PATH
+          share/ci/scripts/multi/install_pybind11.sh 2.9.2 $EXT_PATH
+          share/ci/scripts/multi/install_zlib.sh 1.2.12 $EXT_PATH
+          share/ci/scripts/multi/install_minizip-ng.sh 3.0.6 $EXT_PATH
+      - name: Install latest ext package versions
+        run: |
+          share/ci/scripts/multi/install_imath.sh latest $EXT_PATH
+          share/ci/scripts/multi/install_openexr.sh latest $EXT_PATH
+          share/ci/scripts/multi/install_oiio.sh latest $EXT_PATH
+          share/ci/scripts/multi/install_osl.sh latest $EXT_PATH
+          share/ci/scripts/multi/install_openfx.sh latest $EXT_PATH
+      - name: Create build directories
+        run: |
+          mkdir _install
+          mkdir _build
+      - name: Configure
+        run: |
+          cmake ../. \
+                -DCMAKE_INSTALL_PREFIX=../_install \
+                -DCMAKE_BUILD_TYPE=Release \
+                -DCMAKE_CXX_STANDARD=${{ matrix.cxx-standard }} \
+                -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \
+                -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
+                -DOCIO_BUILD_GPU_TESTS=OFF \
+                -DOCIO_INSTALL_EXT_PACKAGES=NONE \
+                -DOCIO_WARNING_AS_ERROR=OFF \
+                -DPython_EXECUTABLE=$(which python) \
+                -DOCIO_USE_OIIO_CMAKE_CONFIG=ON
+        working-directory: _build
+      - name: Build
+        run: |
+          cmake --build . \
+                --target install \
+                --config Release \
+                -- -j$(sysctl -n hw.ncpu)
+        working-directory: _build
+      - name: Test
+        run: ctest -V -C Release
+        working-directory: _build
+      - name: Test CMake Consumer
+        run: |
+          cmake . \
+                -DCMAKE_PREFIX_PATH=../../../_install \
+                -DCMAKE_BUILD_TYPE=Release
+          cmake --build . \
+                --config Release
+          ./consumer
+        working-directory: _build/tests/cmake-consumer-dist
+
+  # ---------------------------------------------------------------------------
+  # Windows latest ext packages
+  # ---------------------------------------------------------------------------
+
+  windows-latest:
+    name: 'Windows latest 
+      <MSVC 
+       cxx=${{ matrix.cxx-standard }}, 
+       docs=${{ matrix.build-docs }}, 
+       python=${{ matrix.python-version }}>'
+    # Don't run on OCIO forks
+    if: github.repository == 'AcademySoftwareFoundation/OpenColorIO'
+    runs-on: windows-latest
+    strategy:
+      matrix:
+        build: [1, 2]
+        include:
+          - build: 1
+            build-docs: 'ON'
+            build-openfx: 'ON'
+            cxx-standard: 17
+            python-version: '3.11'
+          - build: 2
+            build-docs: 'ON'
+            build-openfx: 'ON'
+            cxx-standard: 14
+            python-version: '3.9'
+    steps:
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Install docs env
+        run: share/ci/scripts/windows/install_docs_env.sh
+        shell: bash
+        if: matrix.build-docs == 'ON'
+      - name: Install tests env
+        run: share/ci/scripts/windows/install_tests_env.sh
+        shell: bash
+      - name: Setup ext environment
+        run: |
+          EXT_PATH=$GITHUB_WORKSPACE/_ext
+          VCPKG_ROOT=$VCPKG_INSTALLATION_ROOT/installed/x64-windows
+          echo "EXT_PATH=$EXT_PATH" >> $GITHUB_ENV
+          echo "CMAKE_PREFIX_PATH=$VCPKG_ROOT;$EXT_PATH" >> $GITHUB_ENV
+          echo "$VCPKG_ROOT/bin" >> $GITHUB_PATH
+          echo "$EXT_PATH/bin" >> $GITHUB_PATH
+          mkdir $EXT_PATH
+        shell: bash
+      - name: Install indirect dependencies
+        run: |
+          vcpkg install zlib:x64-windows
+          vcpkg install tiff:x64-windows
+          vcpkg install boost-asio:x64-windows
+          vcpkg install boost-container:x64-windows
+          vcpkg install boost-filesystem:x64-windows
+          vcpkg install boost-math:x64-windows
+          vcpkg install boost-stacktrace:x64-windows
+          vcpkg install boost-system:x64-windows
+          vcpkg install boost-thread:x64-windows
+          share/ci/scripts/multi/install_pugixml.sh latest $EXT_PATH
+        shell: bash
+      - name: Install fixed ext package versions
+        # Minizip-ng depends on ZLIB. ZLIB must be installed first.
+        run: |
+          share/ci/scripts/multi/install_lcms2.sh 2.2 $EXT_PATH
+          share/ci/scripts/multi/install_yaml-cpp.sh 0.7.0 $EXT_PATH
+          share/ci/scripts/multi/install_pystring.sh 1.1.3 $EXT_PATH
+          share/ci/scripts/multi/install_pybind11.sh 2.9.2 $EXT_PATH
+          share/ci/scripts/multi/install_expat.sh 2.4.1 $EXT_PATH
+          share/ci/scripts/multi/install_zlib.sh 1.2.12 $EXT_PATH
+          share/ci/scripts/multi/install_minizip-ng.sh 3.0.6 $EXT_PATH
+        shell: bash
+      # OSL not installed due to LLVM compilation time.
+      - name: Install latest ext package versions
+        run: |
+          share/ci/scripts/multi/install_imath.sh latest $EXT_PATH
+          share/ci/scripts/multi/install_openexr.sh latest $EXT_PATH
+          share/ci/scripts/multi/install_oiio.sh latest $EXT_PATH
+          share/ci/scripts/multi/install_openfx.sh latest $EXT_PATH
+        shell: bash
+      - name: Create build directories
+        run: |
+          mkdir _install
+          mkdir _build
+        shell: bash
+      - name: Configure
+        run: |
+          cmake ../. \
+                -DCMAKE_INSTALL_PREFIX=../_install \
+                -DCMAKE_BUILD_TYPE=Release \
+                -DCMAKE_CXX_STANDARD=${{ matrix.cxx-standard }} \
+                -DCMAKE_GENERATOR_PLATFORM=x64 \
+                -DOCIO_BUILD_DOCS=OFF \
+                -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
+                -DOCIO_BUILD_GPU_TESTS=OFF \
+                -DOCIO_INSTALL_EXT_PACKAGES=NONE \
+                -DOCIO_WARNING_AS_ERROR=OFF \
+                -DPython_EXECUTABLE=$(which python) \
+                -DOCIO_BUILD_PYTHON=OFF \
+                -DOCIO_USE_OIIO_CMAKE_CONFIG=ON
+        shell: bash
+        working-directory: _build
+      - name: Build
+        run: |
+          cmake --build . \
+                --target install \
+                --config Release \
+                --parallel
+        shell: bash
+        working-directory: _build
+      - name: Test
+        run: |
+          ctest -V -C Release
+        shell: bash
+        working-directory: _build
+      - name: Test CMake Consumer
+        run: |
+          cmake . \
+                -DCMAKE_PREFIX_PATH=../../../_install \
+                -DCMAKE_BUILD_TYPE=Release
+          cmake --build . \
+                --config Release
+          export PATH=../../../_install/bin:$PATH
+          ./Release/consumer
+        shell: bash
+        working-directory: _build/tests/cmake-consumer-dist
diff --git a/.github/workflows/platform_latest.yml b/.github/workflows/platform_latest.yml
new file mode 100644
index 0000000000..9d4d04183b
--- /dev/null
+++ b/.github/workflows/platform_latest.yml
@@ -0,0 +1,396 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# GitHub Actions workflow file
+# https://help.github.com/en/actions/reference/workflow-syntax-for-github-actions
+
+name: Platform latest
+
+on:
+  pull_request:
+    branches-ignore:
+      - RB-0.*
+      - RB-1.*
+      - gh-pages
+    tags-ignore:
+      - v0.*
+      - v1.*
+    paths:
+      - .github/workflows/platform_latest.yml
+  schedule:
+    # Nightly build
+    - cron: "0 0 * * *"
+
+jobs:
+  # ---------------------------------------------------------------------------
+  # Linux latest ext packages
+  # ---------------------------------------------------------------------------
+
+  linux_latest:
+    name: 'Linux Ubuntu latest 
+      <${{ matrix.compiler-desc }} 
+       config=${{ matrix.build-type }}, 
+       shared=${{ matrix.build-shared }}, 
+       cxx=${{ matrix.cxx-standard }}>'
+    # Don't run on OCIO forks
+    if: github.repository == 'AcademySoftwareFoundation/OpenColorIO'
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        build: [1, 2, 3, 4]
+        include:
+          # -------------------------------------------------------------------
+          # GCC
+          # -------------------------------------------------------------------
+          - build: 1
+            build-python: ON
+            build-type: Release
+            build-shared: ON
+            cxx-standard: 20
+            cxx-compiler: g++
+            cc-compiler: gcc
+            compiler-desc: GCC
+            enable-sanitizer: OFF
+          - build: 2
+            build-python: OFF
+            build-type: Debug
+            build-shared: ON
+            cxx-standard: 20
+            cxx-compiler: g++
+            cc-compiler: gcc
+            compiler-desc: GCC
+            enable-sanitizer: ON
+          # -------------------------------------------------------------------
+          # Clang
+          # -------------------------------------------------------------------
+          - build: 3
+            build-python: ON
+            build-type: Release
+            build-shared: ON
+            cxx-standard: 20
+            cxx-compiler: clang++
+            cc-compiler: clang
+            compiler-desc: Clang
+            enable-sanitizer: OFF
+          - build: 4
+            build-python: OFF
+            build-type: Debug
+            build-shared: ON
+            cxx-standard: 20
+            cxx-compiler: clang++
+            cc-compiler: clang
+            compiler-desc: Clang
+            enable-sanitizer: ON
+    env:
+      CXX: ${{ matrix.cxx-compiler }}
+      CC: ${{ matrix.cc-compiler }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Install tests env
+        run: share/ci/scripts/linux/yum/install_tests_env.sh
+      - name: Create build directories
+        run: |
+          mkdir _install
+          mkdir _build
+      - name: Configure
+        run: |
+          cmake ../. \
+                -DCMAKE_INSTALL_PREFIX=../_install \
+                -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \
+                -DCMAKE_CXX_STANDARD=${{ matrix.cxx-standard }} \
+                -DBUILD_SHARED_LIBS=${{ matrix.build-shared }} \
+                -DOCIO_BUILD_DOCS=OFF \
+                -DOCIO_BUILD_OPENFX=ON \
+                -DOCIO_BUILD_GPU_TESTS=OFF \
+                -DOCIO_BUILD_PYTHON=${{ matrix.build-python}} \
+                -DOCIO_USE_SSE=ON \
+                -DOCIO_USE_OIIO_FOR_APPS=OFF \
+                -DOCIO_INSTALL_EXT_PACKAGES=ALL \
+                -DOCIO_WARNING_AS_ERROR=ON \
+                -DOCIO_ENABLE_SANITIZER=${{ matrix.enable-sanitizer }} \
+                -DPython_EXECUTABLE=$(which python)
+        working-directory: _build
+      - name: Build
+        run: |
+          cmake --build . \
+                --target install \
+                --config ${{ matrix.build-type }} \
+                -v \
+                -- -j$(nproc)
+          echo "ocio_build_path=$(pwd)" >> $GITHUB_ENV
+        working-directory: _build
+      - name: Test
+        run: ctest -V -C ${{ matrix.build-type }}
+        working-directory: _build
+      - name: Test CMake Consumer with shared OCIO
+        if: matrix.build-shared == 'ON'
+        run: |
+          cmake . \
+                -DCMAKE_PREFIX_PATH=../../../_install \
+                -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \
+                -DOCIO_ENABLE_SANITIZER=${{ matrix.enable-sanitizer }}
+          cmake --build . \
+                --config ${{ matrix.build-type }}
+          ./consumer
+        working-directory: _build/tests/cmake-consumer-dist
+      - name: Test CMake Consumer with static OCIO
+        if: matrix.build-shared == 'OFF'
+        # The yaml-cpp_VERSION is set below because Findyaml-cpp.cmake needs it but is unable to 
+        # extract it from the headers, like the other modules.
+        #
+        # Prefer the static version of each dependencies by using <pkg>_STATIC_LIBRARY. 
+        # Alternatively, this can be done by setting <pkg>_LIBRARY and <pkg>_INCLUDE_DIR to 
+        # the static version of the package.
+        run: |
+          cmake . \
+                -DCMAKE_PREFIX_PATH=../../../_install \
+                -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \
+                -Dexpat_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dexpat_STATIC_LIBRARY=ON \
+                -DImath_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -DImath_STATIC_LIBRARY=ON \
+                -Dpystring_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dyaml-cpp_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dyaml-cpp_STATIC_LIBRARY=ON \
+                -Dyaml-cpp_VERSION=0.7.0 \
+                -DZLIB_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -DZLIB_STATIC_LIBRARY=ON \
+                -Dminizip-ng_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dminizip-ng_STATIC_LIBRARY=ON
+          cmake --build . \
+                --config ${{ matrix.build-type }}
+          ./consumer
+        working-directory: _build/tests/cmake-consumer-dist
+
+  # ---------------------------------------------------------------------------
+  # MacOS latest ext packages
+  # ---------------------------------------------------------------------------
+
+  macos-latest:
+    name: 'macOS latest 
+      <AppleClang 
+       config=${{ matrix.build-type }}, 
+       shared=${{ matrix.build-shared }}, 
+       cxx=${{ matrix.cxx-standard }}, 
+       python=${{ matrix.python-version }}>'
+    # Don't run on OCIO forks
+    if: github.repository == 'AcademySoftwareFoundation/OpenColorIO'
+    runs-on: macos-latest
+    strategy:
+      matrix:
+        build: [1, 2]
+        include:
+          - build: 1
+            build-python: ON
+            build-type: Release
+            build-shared: ON
+            cxx-standard: 20
+            enable-sanitizer: OFF
+            python-version: '3.11'
+          - build: 2
+            build-python: OFF
+            build-type: Debug
+            build-shared: ON
+            cxx-standard: 20
+            enable-sanitizer: ON
+            python-version: '3.11'
+    steps:
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Install tests env
+        run: share/ci/scripts/macos/install_tests_env.sh
+      - name: Create build directories
+        run: |
+          mkdir _install
+          mkdir _build
+      - name: Configure
+        run: |
+          cmake ../. \
+                -DCMAKE_INSTALL_PREFIX=../_install \
+                -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \
+                -DCMAKE_CXX_STANDARD=${{ matrix.cxx-standard }} \
+                -DBUILD_SHARED_LIBS=${{ matrix.build-shared }} \
+                -DOCIO_BUILD_DOCS=OFF \
+                -DOCIO_BUILD_OPENFX=ON \
+                -DOCIO_BUILD_GPU_TESTS=OFF \
+                -DOCIO_BUILD_PYTHON=${{ matrix.build-python}} \
+                -DOCIO_USE_SSE=ON \
+                -DOCIO_USE_OIIO_FOR_APPS=OFF \
+                -DOCIO_INSTALL_EXT_PACKAGES=ALL \
+                -DOCIO_WARNING_AS_ERROR=ON \
+                -DOCIO_ENABLE_SANITIZER=${{ matrix.enable-sanitizer }} \
+                -DPython_EXECUTABLE=$(which python)
+        working-directory: _build
+      - name: Build
+        run: |
+          cmake --build . \
+                --target install \
+                --config ${{ matrix.build-type }} \
+                -- -j$(sysctl -n hw.ncpu)
+          echo "ocio_build_path=$(pwd)" >> $GITHUB_ENV
+        working-directory: _build
+      - name: Test
+        run: ctest -V -C ${{ matrix.build-type }}
+        working-directory: _build
+      - name: Test CMake Consumer with shared OCIO
+        if: matrix.build-shared == 'ON'
+        run: |
+          cmake . \
+                -DCMAKE_PREFIX_PATH=../../../_install \
+                -DCMAKE_BUILD_TYPE=${{ matrix.build-type }}
+          cmake --build . \
+                --config ${{ matrix.build-type }}
+          ./consumer
+        working-directory: _build/tests/cmake-consumer-dist
+      - name: Test CMake Consumer with static OCIO
+        if: matrix.build-shared == 'OFF'
+        # The yaml-cpp_VERSION is set below because Findyaml-cpp.cmake needs it but is unable to 
+        # extract it from the headers, like the other modules.
+        #
+        # Prefer the static version of each dependencies by using <pkg>_STATIC_LIBRARY. 
+        # Alternatively, this can be done by setting <pkg>_LIBRARY and <pkg>_INCLUDE_DIR to 
+        # the static version of the package.       
+        run: |
+          cmake . \
+                -DCMAKE_PREFIX_PATH=../../../_install \
+                -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \
+                -Dexpat_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dexpat_STATIC_LIBRARY=ON \
+                -DImath_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -DImath_STATIC_LIBRARY=ON \
+                -Dpystring_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dyaml-cpp_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dyaml-cpp_STATIC_LIBRARY=ON \
+                -Dyaml-cpp_VERSION=0.7.0 \
+                -DZLIB_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -DZLIB_STATIC_LIBRARY=ON \
+                -Dminizip-ng_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dminizip-ng_STATIC_LIBRARY=ON
+          cmake --build . \
+                --config ${{ matrix.build-type }}
+          ./consumer
+        working-directory: _build/tests/cmake-consumer-dist
+
+  # ---------------------------------------------------------------------------
+  # Windows latest ext packages
+  # ---------------------------------------------------------------------------
+
+  windows-latest:
+    name: 'Windows latest 
+      <MSVC 
+       config=${{ matrix.build-type }}, 
+       shared=${{ matrix.build-shared }}, 
+       cxx=${{ matrix.cxx-standard }}, 
+       python=${{ matrix.python-version }}>'
+    # Don't run on OCIO forks
+    if: github.repository == 'AcademySoftwareFoundation/OpenColorIO'
+    runs-on: windows-latest
+    strategy:
+      matrix:
+        build: [1, 2]
+        include:
+          - build: 1
+            build-python: ON
+            build-type: Release
+            build-shared: ON
+            cxx-standard: 20
+            python-version: '3.11'
+          - build: 2
+            build-python: ON
+            build-type: Debug
+            build-shared: ON
+            cxx-standard: 20
+            python-version: '3.11'
+    steps:
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Install tests env
+        run: share/ci/scripts/windows/install_tests_env.sh
+        shell: bash
+      - name: Create build directories
+        run: |
+          mkdir _install
+          mkdir _build
+        shell: bash
+      - name: Configure
+        run: |
+          cmake ../. \
+                -DCMAKE_INSTALL_PREFIX=../_install \
+                -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \
+                -DCMAKE_CXX_STANDARD=${{ matrix.cxx-standard }} \
+                -DCMAKE_GENERATOR_PLATFORM=x64 \
+                -DBUILD_SHARED_LIBS=${{ matrix.build-shared }} \
+                -DOCIO_BUILD_DOCS=OFF \
+                -DOCIO_BUILD_OPENFX=ON \
+                -DOCIO_BUILD_GPU_TESTS=OFF \
+                -DOCIO_BUILD_PYTHON=${{ matrix.build-python}} \
+                -DOCIO_USE_SSE=ON \
+                -DOCIO_USE_OIIO_FOR_APPS=OFF \
+                -DOCIO_INSTALL_EXT_PACKAGES=ALL \
+                -DOCIO_WARNING_AS_ERROR=ON \
+                -DPython_EXECUTABLE=$(which python)
+        shell: bash
+        working-directory: _build
+      - name: Build
+        run: |
+          cmake --build . \
+                --target install \
+                --config ${{ matrix.build-type }}
+          echo "ocio_build_path=$(pwd)" >> $GITHUB_ENV
+        shell: bash
+        working-directory: _build
+      - name: Test
+        run: ctest -V -C ${{ matrix.build-type }}
+        shell: bash
+        working-directory: _build
+      - name: Test CMake Consumer with shared OCIO
+        if: matrix.build-shared == 'ON'
+        run: |
+          cmake . \
+                -DCMAKE_PREFIX_PATH=../../../_install \
+                -DCMAKE_BUILD_TYPE=${{ matrix.build-type }}
+          cmake --build . \
+                --config ${{ matrix.build-type }}
+          export PATH=../../../_install/bin:$PATH
+          ./${{ matrix.build-type }}/consumer
+        shell: bash
+        working-directory: _build/tests/cmake-consumer-dist
+      - name: Test CMake Consumer with static OCIO
+        if: matrix.build-shared == 'OFF'
+        # The yaml-cpp_VERSION is set below because Findyaml-cpp.cmake needs it but is unable to 
+        # extract it from the headers, like the other modules.
+        #
+        # Prefer the static version of each dependencies by using <pkg>_STATIC_LIBRARY. 
+        # Alternatively, this can be done by setting <pkg>_LIBRARY and <pkg>_INCLUDE_DIR to 
+        # the static version of the package.
+        run: |
+          cmake . \
+                -DCMAKE_PREFIX_PATH=../../../_install \
+                -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \
+                -Dexpat_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dexpat_STATIC_LIBRARY=ON \
+                -DImath_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -DImath_STATIC_LIBRARY=ON \
+                -Dpystring_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dyaml-cpp_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dyaml-cpp_STATIC_LIBRARY=ON \
+                -Dyaml-cpp_VERSION=0.7.0 \
+                -DZLIB_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -DZLIB_STATIC_LIBRARY=ON \
+                -Dminizip-ng_ROOT=${{ env.ocio_build_path }}/ext/dist \
+                -Dminizip-ng_STATIC_LIBRARY=ON
+          cmake --build . \
+                --config ${{ matrix.build-type }}
+          export PATH=../../../_install/bin:$PATH
+          ./${{ matrix.build-type }}/consumer
+        shell: bash
+        working-directory: _build/tests/cmake-consumer-dist
diff --git a/.github/workflows/wheel_workflow.yml b/.github/workflows/wheel_workflow.yml
index 737a736019..187ecb330b 100644
--- a/.github/workflows/wheel_workflow.yml
+++ b/.github/workflows/wheel_workflow.yml
@@ -128,7 +128,7 @@ jobs:
           platforms: all
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.11.2
+        uses: pypa/cibuildwheel@v2.12.0
         env:
           CIBW_BUILD: ${{ matrix.python }}
           CIBW_ARCHS: ${{ matrix.arch }}
@@ -194,7 +194,7 @@ jobs:
           python-version: '3.8'
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.11.2
+        uses: pypa/cibuildwheel@v2.12.0
         env:
           CIBW_BUILD: ${{ matrix.python }}
           CIBW_ARCHS: ${{ matrix.arch }}
@@ -245,7 +245,7 @@ jobs:
           python-version: '3.8'
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.11.2
+        uses: pypa/cibuildwheel@v2.12.0
         env:
           CIBW_BUILD: ${{ matrix.python }}
           CIBW_ARCHS: ${{ matrix.arch }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3911a154ce..e927ed47db 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,7 @@
 ###############################################################################
 # CMake definition.
 
-cmake_minimum_required(VERSION 3.12)
+cmake_minimum_required(VERSION 3.13)
 
 set(CMAKE_MODULE_PATH
     ${CMAKE_MODULE_PATH}
@@ -137,13 +137,21 @@ set (OCIO_PYTHON_VERSION "" CACHE STRING
 
 option(OCIO_BUILD_JAVA "Specify whether to build java bindings" OFF)
 
-option(OCIO_WARNING_AS_ERROR "Set build error level for CI testing" OFF)
-
 if (WIN32)
     option(OCIO_USE_WINDOWS_UNICODE "Compile with Windows Unicode support" ON)
 endif()
 
 
+###############################################################################
+# Warnings / debugging settings
+
+option(OCIO_WARNING_AS_ERROR "Set build error level for CI testing" OFF)
+
+if (NOT WIN32)
+    option(OCIO_ENABLE_SANITIZER "Specify whether to enable compiler sanitizers (address)" OFF)
+endif()
+
+
 ###############################################################################
 # Optimization / internal linking preferences
 
diff --git a/pyproject.toml b/pyproject.toml
index 69672a6df7..ff83ac1a0d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
 requires = [
     "setuptools>=42",
     "wheel",
-    "cmake>=3.12",
+    "cmake>=3.13",
     "ninja; sys_platform != 'win32' and platform_machine != 'arm64'",
     # Documentation requirements
     "six",
diff --git a/setup.cfg b/setup.cfg
index 4dc522be8a..16f06e6d16 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -6,8 +6,6 @@ classifiers =
     Intended Audience :: Developers
     Topic :: Software Development :: Libraries :: Python Modules
     Programming Language :: C++
-    Programming Language :: Python :: 2
-    Programming Language :: Python :: 2.7
     Programming Language :: Python :: 3
     Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
@@ -22,7 +20,7 @@ license_files = LICENSE
 long_description = file: README.md, LICENSE
 long_description_content_type = text/markdown
 name = opencolorio
-python_requires = '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*'
+python_requires = '>=3.7'
 url = https://opencolorio.org/
 
 [options]
diff --git a/share/cmake/modules/FindExtPackages.cmake b/share/cmake/modules/FindExtPackages.cmake
index 5455a08ce6..25cfacb497 100644
--- a/share/cmake/modules/FindExtPackages.cmake
+++ b/share/cmake/modules/FindExtPackages.cmake
@@ -49,7 +49,7 @@ find_package(pystring 1.1.3 REQUIRED)
 
 # Imath (>=3.1)
 # https://github.com/AcademySoftwareFoundation/Imath
-set(_Imath_ExternalProject_VERSION "3.1.5")
+set(_Imath_ExternalProject_VERSION "3.1.6")
 find_package(Imath 3.0 REQUIRED)
 
 ###############################################################################
diff --git a/share/cmake/modules/FindOpenEXR.cmake b/share/cmake/modules/FindOpenEXR.cmake
index a8b8448fd6..426305dab4 100644
--- a/share/cmake/modules/FindOpenEXR.cmake
+++ b/share/cmake/modules/FindOpenEXR.cmake
@@ -203,7 +203,8 @@ if(NOT OpenEXR_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACK
                 ${CMAKE_COMMAND} --build .
                                  --config ${CMAKE_BUILD_TYPE}
                                  --target install
-                                 --parallel
+                                # Prevent some CI jobs to fail when building.
+                                #  --parallel
         )
 
         # Additional targets. ALIAS to UNKNOWN imported target is only possible
diff --git a/share/cmake/projects/Buildlcms2.cmake b/share/cmake/projects/Buildlcms2.cmake
index ba3c7d0958..7292eeef48 100644
--- a/share/cmake/projects/Buildlcms2.cmake
+++ b/share/cmake/projects/Buildlcms2.cmake
@@ -15,12 +15,13 @@ file(GLOB SOURCES "src/*.c" "src/*.h")
 add_library(${PROJECT_NAME} STATIC ${HEADERS} ${SOURCES})
 
 if(UNIX)
-    set(lcms2_C_FLAGS "${lcms2_C_FLAGS} -fPIC")
+    set(lcms2_C_FLAGS "${lcms2_C_FLAGS};-fPIC")
 endif()
 
 set_target_properties(${PROJECT_NAME} PROPERTIES
     LIBRARY_OUTPUT_NAME "${PROJECT_NAME}"
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} ${lcms2_C_FLAGS}"
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};${lcms2_C_FLAGS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
     PUBLIC_HEADER "${HEADERS}"
 )
 
diff --git a/share/cmake/projects/Buildpystring.cmake b/share/cmake/projects/Buildpystring.cmake
index e580aa08e3..85dbb48675 100644
--- a/share/cmake/projects/Buildpystring.cmake
+++ b/share/cmake/projects/Buildpystring.cmake
@@ -18,11 +18,12 @@ set(SOURCES
 add_library(${PROJECT_NAME} STATIC ${HEADERS} ${SOURCES})
 
 if(UNIX)
-    set(pystring_CXX_FLAGS "${pystring_CXX_FLAGS} -fPIC")
+    set(pystring_CXX_FLAGS "${pystring_CXX_FLAGS};-fPIC")
 endif()
 
 set_target_properties(${PROJECT_NAME} PROPERTIES 
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} ${pystring_CXX_FLAGS}"
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};${pystring_CXX_FLAGS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
     PUBLIC_HEADER "${HEADERS}"
 )
 
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index 9e56d55ae8..41d96838f7 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -5,72 +5,79 @@
 ###############################################################################
 # Define the global compilation and link flags.
 
-set(PLATFORM_COMPILE_FLAGS "")
+set(PLATFORM_COMPILE_OPTIONS "")
+set(PLATFORM_LINK_OPTIONS "")
 
 if(USE_MSVC)
 
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} /DUSE_MSVC")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/DUSE_MSVC")
 
     # /we4062 Enables warning in switch when an enumeration value is not explicitly handled.
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} /EHsc /DWIN32 /we4062")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/EHsc;/DWIN32;/we4062")
 
     if(${CMAKE_CXX_STANDARD} GREATER_EQUAL 17)
         # Inheriting from std::iterator is deprecated starting with C++17 and Yaml 0.6.3 does that.
-        set(PLATFORM_COMPILE_FLAGS 
-            "${PLATFORM_COMPILE_FLAGS} /D_SILENCE_CXX17_ITERATOR_BASE_CLASS_DEPRECATION_WARNING"
+        set(PLATFORM_COMPILE_OPTIONS
+            "${PLATFORM_COMPILE_OPTIONS};/D_SILENCE_CXX17_ITERATOR_BASE_CLASS_DEPRECATION_WARNING"
         )
     endif()
 
     # Explicitely specify the default warning level i.e. /W3.
     # Note: Do not use /Wall (i.e. /W4) which adds 'informational' warnings.
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} /W3")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/W3")
 
     # Do enable C4701 (Potentially uninitialized local variable 'name' used), which is level 4.
     # This is because strtoX-based from_chars leave the value variable unmodified.
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} /we4701")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/we4701")
 
     if(OCIO_WARNING_AS_ERROR)
-        set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} /WX")
+        set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/WX")
     endif()
 
 elseif(USE_CLANG)
 
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -DUSE_CLANG")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-DUSE_CLANG")
 
     # Use of 'register' specifier must be removed for C++17 support.
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -Wno-deprecated-register")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-Wno-deprecated-register")
 
 elseif(USE_GCC)
 
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -DUSE_GCC")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-DUSE_GCC")
 
 endif()
 
 
 if(USE_GCC OR USE_CLANG)
 
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -Wall")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-Wall")
 
     # Add more warning detection.
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -Wextra")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-Wextra")
 
     # -Wswitch-enum Enables warning in switch when an enumeration value is not explicitly handled.
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -Wswitch-enum")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-Wswitch-enum")
 
     if(OCIO_WARNING_AS_ERROR)
-        set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -Werror")
+        set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-Werror")
     endif()
 
     if(APPLE)
         # TODO: There are still some deprecated methods.
-        set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -Wno-deprecated-declarations")
+        set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-Wno-deprecated-declarations")
+    endif()
+
+    if(OCIO_ENABLE_SANITIZER)
+        set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-fno-omit-frame-pointer;-fsanitize=address")
+        set(PLATFORM_LINK_OPTIONS "${PLATFORM_LINK_OPTIONS};-fsanitize=address")
     endif()
 
 endif()
 
 # An advanced variable will not be displayed in any of the cmake GUIs
 
-mark_as_advanced(PLATFORM_COMPILE_FLAGS)
+mark_as_advanced(PLATFORM_COMPILE_OPTIONS)
+mark_as_advanced(PLATFORM_LINK_OPTIONS)
 
 
 ###############################################################################
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 1c4d774ddb..6c2693db2f 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -353,20 +353,13 @@ if(WIN32)
     endif()
 endif()
 
-set_target_properties(OpenColorIO PROPERTIES
-    OUTPUT_NAME   ${PROJECT_NAME}${OCIO_LIBNAME_SUFFIX}
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}"
-    VERSION       ${OpenColorIO_VERSION}
-    SOVERSION     ${SOVERSION}
-    PUBLIC_HEADER "${INSTALL_HEADERS}"
-)
+set(CUSTOM_LINK_FLAGS ${PLATFORM_LINK_OPTIONS})
 
 if(UNIX AND NOT APPLE)
     # Also hide all the symbols of dependent libraries to prevent clashes if
     # an app using this project is linked against other versions of our
     # dependencies.
-    set_property (TARGET OpenColorIO
-                  APPEND PROPERTY LINK_FLAGS "-Wl,--exclude-libs,ALL")
+    set(CUSTOM_LINK_FLAGS "${CUSTOM_LINK_FLAGS};-Wl,--exclude-libs,ALL")
 elseif(APPLE)
     if (expat_LIBRARY)
         get_filename_component(_expat_LIBDIR "${expat_LIBRARY}" DIRECTORY)
@@ -394,12 +387,20 @@ elseif(APPLE)
     endif()   
 
     if (_OCIO_LINK_FLAGS_LIST_)
-        list(JOIN _OCIO_LINK_FLAGS_LIST_ " " _OCIO_LINK_FLAGS_LIST_)
-        set_property (TARGET OpenColorIO
-                    APPEND PROPERTY LINK_FLAGS "${_OCIO_LINK_FLAGS_LIST_}")   
+        list(JOIN _OCIO_LINK_FLAGS_LIST_ ";" _OCIO_LINK_FLAGS_LIST_)
+        set(CUSTOM_LINK_FLAGS "${CUSTOM_LINK_FLAGS};${_OCIO_LINK_FLAGS_LIST_}")
     endif()              
 endif()
 
+set_target_properties(OpenColorIO PROPERTIES
+    OUTPUT_NAME   ${PROJECT_NAME}${OCIO_LIBNAME_SUFFIX}
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${CUSTOM_LINK_FLAGS}"
+    VERSION       ${OpenColorIO_VERSION}
+    SOVERSION     ${SOVERSION}
+    PUBLIC_HEADER "${INSTALL_HEADERS}"
+)
+
 if(MSVC AND BUILD_SHARED_LIBS)
     # Install the pdb file if any.
     install(FILES $<TARGET_PDB_FILE:${PROJECT_NAME}> DESTINATION ${CMAKE_INSTALL_BINDIR} OPTIONAL)
diff --git a/src/apps/ocioarchive/CMakeLists.txt b/src/apps/ocioarchive/CMakeLists.txt
index 6b868d1979..f39e428349 100644
--- a/src/apps/ocioarchive/CMakeLists.txt
+++ b/src/apps/ocioarchive/CMakeLists.txt
@@ -8,7 +8,9 @@ set(SOURCES
 add_executable(ocioarchive ${SOURCES})
 
 set_target_properties(ocioarchive PROPERTIES 
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
+)
 
 target_include_directories(ocioarchive
     PUBLIC
diff --git a/src/apps/ociobakelut/CMakeLists.txt b/src/apps/ociobakelut/CMakeLists.txt
index a50e87e48a..70cc43937e 100755
--- a/src/apps/ociobakelut/CMakeLists.txt
+++ b/src/apps/ociobakelut/CMakeLists.txt
@@ -9,7 +9,7 @@ set(SOURCES
 add_executable(ociobakelut ${SOURCES})
 
 if(MSVC)
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} /wd4996")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/wd4996")
 endif()
 
 # NOTE: Depending of the compiler version lcm2 2.2 does not compile with C++17 so revert to c++11
@@ -21,7 +21,8 @@ endif()
 
 set_target_properties(ociobakelut
     PROPERTIES 
-        COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}"
+        COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+        LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
         CXX_STANDARD ${APP_CXX_STANDARD}
 )
 
diff --git a/src/apps/ociocheck/CMakeLists.txt b/src/apps/ociocheck/CMakeLists.txt
index 0307849c17..cd6cf787e3 100755
--- a/src/apps/ociocheck/CMakeLists.txt
+++ b/src/apps/ociocheck/CMakeLists.txt
@@ -8,11 +8,13 @@ set(SOURCES
 add_executable(ociocheck ${SOURCES})
 
 if(MSVC)
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} /wd4996")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/wd4996")
 endif()
 
 set_target_properties(ociocheck PROPERTIES 
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
+)
 
 target_link_libraries(ociocheck 
     PRIVATE 
diff --git a/src/apps/ociochecklut/CMakeLists.txt b/src/apps/ociochecklut/CMakeLists.txt
index e676d3707a..59dc5daeac 100644
--- a/src/apps/ociochecklut/CMakeLists.txt
+++ b/src/apps/ociochecklut/CMakeLists.txt
@@ -15,11 +15,13 @@ set(SOURCES
 add_executable(ociochecklut ${SOURCES})
 
 if(MSVC)
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} /wd4996")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/wd4996")
 endif()
 
 set_target_properties(ociochecklut PROPERTIES 
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
+)
 
 target_link_libraries(ociochecklut 
     PRIVATE 
diff --git a/src/apps/ocioconvert/CMakeLists.txt b/src/apps/ocioconvert/CMakeLists.txt
index c8f117bcdd..5748450775 100755
--- a/src/apps/ocioconvert/CMakeLists.txt
+++ b/src/apps/ocioconvert/CMakeLists.txt
@@ -15,7 +15,8 @@ set(SOURCES
 add_executable(ocioconvert ${SOURCES})
 
 set_target_properties(ocioconvert PROPERTIES
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}"
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
 )
 
 if (USE_MSVC)
diff --git a/src/apps/ociodisplay/CMakeLists.txt b/src/apps/ociodisplay/CMakeLists.txt
index cb54ad9af9..8cd63d893c 100755
--- a/src/apps/ociodisplay/CMakeLists.txt
+++ b/src/apps/ociodisplay/CMakeLists.txt
@@ -10,14 +10,15 @@ set(SOURCES main.cpp)
 
 add_executable(ociodisplay ${SOURCES})
 
-set(CUSTOM_COMPILE_FLAGS ${PLATFORM_COMPILE_FLAGS})
+set(CUSTOM_COMPILE_FLAGS ${PLATFORM_COMPILE_OPTIONS})
 if(APPLE)
     # Mute the deprecated warning for some GLUT methods.
-    set(CUSTOM_COMPILE_FLAGS "${CUSTOM_COMPILE_FLAGS} -DGL_SILENCE_DEPRECATION")
+    set(CUSTOM_COMPILE_FLAGS "${CUSTOM_COMPILE_FLAGS};-DGL_SILENCE_DEPRECATION")
 endif()
 
 set_target_properties(ociodisplay PROPERTIES 
-    COMPILE_FLAGS "${CUSTOM_COMPILE_FLAGS}"
+    COMPILE_OPTIONS "${CUSTOM_COMPILE_FLAGS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
 )
 
 if (USE_MSVC)
diff --git a/src/apps/ociolutimage/CMakeLists.txt b/src/apps/ociolutimage/CMakeLists.txt
index eceeb62154..9ee234603e 100755
--- a/src/apps/ociolutimage/CMakeLists.txt
+++ b/src/apps/ociolutimage/CMakeLists.txt
@@ -8,7 +8,9 @@ set(SOURCES
 add_executable(ociolutimage ${SOURCES})
 
 set_target_properties(ociolutimage PROPERTIES
-	COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
+	COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+	LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
+)
 
 if (USE_MSVC)
 	# Temporary until fixed in OpenImageIO: Mute some warnings from OpenImageIO farmhash.h
diff --git a/src/apps/ociomakeclf/CMakeLists.txt b/src/apps/ociomakeclf/CMakeLists.txt
index e69036b053..0179d20558 100644
--- a/src/apps/ociomakeclf/CMakeLists.txt
+++ b/src/apps/ociomakeclf/CMakeLists.txt
@@ -8,11 +8,13 @@ set(SOURCES
 add_executable(ociomakeclf ${SOURCES})
 
 if(MSVC)
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} /wd4996")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/wd4996")
 endif()
 
 set_target_properties(ociomakeclf PROPERTIES 
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
+)
 
 target_link_libraries(ociomakeclf 
     PRIVATE 
diff --git a/src/apps/ocioperf/CMakeLists.txt b/src/apps/ocioperf/CMakeLists.txt
index a374f5d785..efe1f0555b 100644
--- a/src/apps/ocioperf/CMakeLists.txt
+++ b/src/apps/ocioperf/CMakeLists.txt
@@ -8,7 +8,9 @@ set(SOURCES
 add_executable(ocioperf ${SOURCES})
 
 set_target_properties(ocioperf PROPERTIES
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
+)
 
 target_link_libraries(ocioperf
     PRIVATE
diff --git a/src/apps/ociowrite/CMakeLists.txt b/src/apps/ociowrite/CMakeLists.txt
index 8c9c1729c9..b17bee62c2 100644
--- a/src/apps/ociowrite/CMakeLists.txt
+++ b/src/apps/ociowrite/CMakeLists.txt
@@ -8,7 +8,9 @@ set(SOURCES
 add_executable(ociowrite ${SOURCES})
 
 set_target_properties(ociowrite PROPERTIES 
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
+)
 
 target_link_libraries(ociowrite
     PRIVATE 
diff --git a/src/apputils/CMakeLists.txt b/src/apputils/CMakeLists.txt
index 6f6c3be31a..e57941aed1 100644
--- a/src/apputils/CMakeLists.txt
+++ b/src/apputils/CMakeLists.txt
@@ -21,5 +21,6 @@ target_link_libraries(apputils
 )
 
 set_target_properties(apputils PROPERTIES 
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}"
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
 )
diff --git a/src/bindings/python/CMakeLists.txt b/src/bindings/python/CMakeLists.txt
index f674139934..d3c79db98a 100644
--- a/src/bindings/python/CMakeLists.txt
+++ b/src/bindings/python/CMakeLists.txt
@@ -138,20 +138,28 @@ if(${CMAKE_CXX_STANDARD} GREATER_EQUAL 17)
 	set(APP_CXX_STANDARD 11)
 endif()
 
-set(CUSTOM_COMPILE_FLAGS ${PLATFORM_COMPILE_FLAGS})
+set(CUSTOM_COMPILE_FLAGS ${PLATFORM_COMPILE_OPTIONS})
+set(CUSTOM_LINK_FLAGS ${PLATFORM_LINK_OPTIONS})
 
 # The Python binding contains deprecated methods for backward compatibility reason,
 # so disable the warning.
 if(USE_GCC OR USE_CLANG)
-    set(CUSTOM_COMPILE_FLAGS "${CUSTOM_COMPILE_FLAGS} -Wno-deprecated-declarations")
+    set(CUSTOM_COMPILE_FLAGS "${CUSTOM_COMPILE_FLAGS};-Wno-deprecated-declarations")
 elseif(USE_MSVC)
-    set(CUSTOM_COMPILE_FLAGS "${CUSTOM_COMPILE_FLAGS} /wd4996")
+    set(CUSTOM_COMPILE_FLAGS "${CUSTOM_COMPILE_FLAGS};/wd4996")
+endif()
+
+# OSX demands that the linker resolve all symbols at build time
+# we pass this flag to allow dynamic linking
+if(APPLE)
+    set(CUSTOM_LINK_FLAGS "${CUSTOM_LINK_FLAGS};-undefined;dynamic_lookup")
 endif()
 
 set_target_properties(PyOpenColorIO
 	PROPERTIES
-		COMPILE_FLAGS ${CUSTOM_COMPILE_FLAGS}
-		CXX_STANDARD  ${APP_CXX_STANDARD}
+		COMPILE_OPTIONS "${CUSTOM_COMPILE_FLAGS}"
+		LINK_OPTIONS "${CUSTOM_LINK_FLAGS}"
+		CXX_STANDARD ${APP_CXX_STANDARD}
 )
 
 if(NOT BUILD_SHARED_LIBS)
@@ -182,14 +190,6 @@ if (UNIX AND NOT CMAKE_SKIP_RPATH)
     endif()
 endif()
 
-# OSX demands that the linker resolve all symbols at build time
-# we pass this flag to allow dynamic linking
-if(APPLE)
-	set_target_properties(PyOpenColorIO PROPERTIES
-		LINK_FLAGS "-undefined dynamic_lookup"
-	)
-endif()
-
 target_include_directories(PyOpenColorIO
 	PRIVATE
 		PyOpenColorIO
diff --git a/src/libutils/imageioapphelpers/CMakeLists.txt b/src/libutils/imageioapphelpers/CMakeLists.txt
index 3f2bcb1f86..6b32f2824e 100644
--- a/src/libutils/imageioapphelpers/CMakeLists.txt
+++ b/src/libutils/imageioapphelpers/CMakeLists.txt
@@ -17,7 +17,8 @@ if(NOT BUILD_SHARED_LIBS)
 endif()
 
 set_target_properties(imageioapphelpers PROPERTIES
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}"
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
 )
 
 target_compile_definitions(imageioapphelpers
diff --git a/src/libutils/oglapphelpers/CMakeLists.txt b/src/libutils/oglapphelpers/CMakeLists.txt
index d5eeb4d6bb..39a20ecb3a 100644
--- a/src/libutils/oglapphelpers/CMakeLists.txt
+++ b/src/libutils/oglapphelpers/CMakeLists.txt
@@ -42,14 +42,15 @@ if(NOT BUILD_SHARED_LIBS)
     )
 endif()
 
-set(CUSTOM_COMPILE_FLAGS ${PLATFORM_COMPILE_FLAGS})
+set(CUSTOM_COMPILE_FLAGS ${PLATFORM_COMPILE_OPTIONS})
 if(APPLE)
     # Mute the deprecated warning for some GLUT methods.
-    set(CUSTOM_COMPILE_FLAGS "${CUSTOM_COMPILE_FLAGS} -DGL_SILENCE_DEPRECATION")
+    set(CUSTOM_COMPILE_FLAGS "${CUSTOM_COMPILE_FLAGS};-DGL_SILENCE_DEPRECATION")
 endif()
 
 set_target_properties(oglapphelpers PROPERTIES 
-    COMPILE_FLAGS "${CUSTOM_COMPILE_FLAGS}"
+    COMPILE_OPTIONS "${CUSTOM_COMPILE_FLAGS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
 )
 
 target_include_directories(oglapphelpers
diff --git a/src/libutils/oglapphelpers/msl.mm b/src/libutils/oglapphelpers/msl.mm
index 2ddb545f0e..88ab273550 100644
--- a/src/libutils/oglapphelpers/msl.mm
+++ b/src/libutils/oglapphelpers/msl.mm
@@ -396,7 +396,7 @@ void RGB_to_RGBA(const float* lutValues, int valueCount, std::vector<float>& flo
                 const int dummyInt = 123456789;
                 const int* v = size == 0 ? &dummyInt : data.m_vectorInt.m_getVector();
                 size = size == 0 ? sizeof(int) : size * sizeof(int);
-                [renderCmdEncoder setFragmentBytes:v length:size * sizeof(int) atIndex:uniformId++];
+                [renderCmdEncoder setFragmentBytes:v length:size atIndex:uniformId++];
             }
             break;
                 
diff --git a/tests/cmake-consumer/CMakeLists.txt.in b/tests/cmake-consumer/CMakeLists.txt.in
index e7ba2a48d8..f965980bd3 100644
--- a/tests/cmake-consumer/CMakeLists.txt.in
+++ b/tests/cmake-consumer/CMakeLists.txt.in
@@ -3,7 +3,7 @@
 
 # Check the OCIO CMake config find module
 
-cmake_minimum_required(VERSION 3.12)
+cmake_minimum_required(VERSION 3.13)
 project(consumer LANGUAGES CXX)
 
 if(NOT CMAKE_BUILD_TYPE)
@@ -17,9 +17,15 @@ find_package(OpenColorIO CONFIG REQUIRED)
 
 add_executable(consumer consumer.cpp)
 
+if(OCIO_ENABLE_SANITIZER)
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-fno-omit-frame-pointer;-fsanitize=address")
+    set(PLATFORM_LINK_OPTIONS "${PLATFORM_LINK_OPTIONS};-fsanitize=address")
+endif()
+
 set_target_properties(consumer
     PROPERTIES
-        COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}"
+        COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+        LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
         CXX_STANDARD 11
         CXX_STANDARD_REQUIRED ON
         CXX_EXTENSIONS OFF
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index e78a86bede..431d570f4e 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -14,7 +14,6 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
     )
     target_link_libraries(${TEST_BINARY}
         PRIVATE
-            OpenColorIO
             expat::expat
             Imath::Imath
             pystring::pystring
@@ -39,6 +38,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
     if(PRIVATE_INCLUDES)
         target_include_directories(${TEST_BINARY}
             PRIVATE
+                "$<TARGET_PROPERTY:OpenColorIO,INCLUDE_DIRECTORIES>"
                 "${CMAKE_SOURCE_DIR}/tests/cpu"
                 "${CMAKE_BINARY_DIR}/generated_include"
         )
@@ -67,9 +67,21 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
         endif()
     endif(WIN32)
     set_target_properties(${TEST_BINARY} PROPERTIES
-        COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
+        COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+        LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
+    )
 
     add_test(NAME ${TEST_NAME} COMMAND ${TEST_BINARY})
+
+    if(OCIO_ENABLE_SANITIZER)
+        # Ignore odr-violation warning coming supposeddly from compiling OCIO
+        # sources within the test target as well as linking to the library.
+        # Provide better stack traces for malloc related leaks.
+        set_tests_properties(${TEST_NAME} PROPERTIES
+            ENVIRONMENT
+                "ASAN_OPTIONS=detect_odr_violation=0:fast_unwind_on_malloc=0"
+        )
+    endif()
 endfunction(add_ocio_test)
 
 # Eventually we will factor out each test into it's own executable
diff --git a/tests/cpu/Platform_tests.cpp b/tests/cpu/Platform_tests.cpp
index a610e02826..dc3caab795 100644
--- a/tests/cpu/Platform_tests.cpp
+++ b/tests/cpu/Platform_tests.cpp
@@ -84,7 +84,7 @@ OCIO_ADD_TEST(Platform, getenv)
     OCIO_CHECK_NE(GetEnvironmentVariable(TEXT("PATH"), NULL, 0), 0);
 
     // Create a variable and test that it's retrievable through the Windows API.
-    OCIO::Platform::Setenv(u8"MY_WINDOWS_DUMMY_ENV", u8"SomeValue");
+    OCIO::Platform::Setenv(U8("MY_WINDOWS_DUMMY_ENV"), U8("SomeValue"));
     uint32_t win_env_sz = GetEnvironmentVariable(TEXT("MY_WINDOWS_DUMMY_ENV"), NULL, 0);
     OCIO_CHECK_NE(win_env_sz, 0);
 
@@ -93,7 +93,7 @@ OCIO_ADD_TEST(Platform, getenv)
     win_env_value.pop_back(); // Remove null terminator that interferes with comparison
     OCIO_CHECK_ASSERT(win_env_value == TEXT("SomeValue"));
 
-    OCIO::Platform::Unsetenv(u8"MY_WINDOWS_DUMMY_ENV");
+    OCIO::Platform::Unsetenv(U8("MY_WINDOWS_DUMMY_ENV"));
     OCIO_CHECK_EQUAL(GetEnvironmentVariable(TEXT("MY_WINDOWS_DUMMY_ENV"), NULL, 0), 0);
     OCIO_CHECK_EQUAL(GetLastError(), ERROR_ENVVAR_NOT_FOUND);
 #endif
diff --git a/tests/cpu/UnitTestUtils.cpp b/tests/cpu/UnitTestUtils.cpp
index 9456e882d1..f298db0f3e 100644
--- a/tests/cpu/UnitTestUtils.cpp
+++ b/tests/cpu/UnitTestUtils.cpp
@@ -159,8 +159,9 @@ void removeDirectory(const char * directoryPath)
                 }   
             }   
         }
-    }   
+    }
     remove(directoryPath);
+    closedir(dir);
 }
 #endif
 
@@ -174,5 +175,3 @@ void RemoveTemporaryDirectory(const std::string & directoryPath)
 }
 
 } // namespace OCIO_NAMESPACE
-
-
diff --git a/tests/cpu/fileformats/xmlutils/XMLReaderUtils_tests.cpp b/tests/cpu/fileformats/xmlutils/XMLReaderUtils_tests.cpp
index 06c92ddebc..ad33e496ef 100644
--- a/tests/cpu/fileformats/xmlutils/XMLReaderUtils_tests.cpp
+++ b/tests/cpu/fileformats/xmlutils/XMLReaderUtils_tests.cpp
@@ -76,10 +76,10 @@ OCIO_ADD_TEST(XMLReaderHelper, get_numbers)
     // Same test without a null terminated string:
     // Copy the string into a buffer that will not be null terminated.
     // Add a delimiter at the end of the buffer.
-    char * buffer = new char[len+1];
-    std::memcpy(buffer, str, len * sizeof(char));
+    std::vector<char> buffer(len+1);
+    std::memcpy(buffer.data(), str, len * sizeof(char));
     buffer[len] = '\n';
-    OCIO_CHECK_NO_THROW(values = OCIO::GetNumbers<float>(buffer, len));
+    OCIO_CHECK_NO_THROW(values = OCIO::GetNumbers<float>(buffer.data(), len));
     OCIO_REQUIRE_EQUAL(values.size(), 4);
     OCIO_CHECK_EQUAL(values[0], 1.0f);
     OCIO_CHECK_EQUAL(values[1], 2.0f);
@@ -498,4 +498,3 @@ OCIO_ADD_TEST(XMLReaderHelper, find_sub_string)
         OCIO_CHECK_EQUAL(end, 0);
     }
 }
-
diff --git a/tests/gpu/CMakeLists.txt b/tests/gpu/CMakeLists.txt
index 051abe72eb..2245fbf1d5 100644
--- a/tests/gpu/CMakeLists.txt
+++ b/tests/gpu/CMakeLists.txt
@@ -34,7 +34,9 @@ if(OCIO_USE_SSE)
 endif(OCIO_USE_SSE)
 
 set_target_properties(test_gpu_exec PROPERTIES 
-	COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
+	COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+	LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
+)
 
 target_link_libraries(test_gpu_exec
 	PRIVATE
diff --git a/tests/osl/CMakeLists.txt b/tests/osl/CMakeLists.txt
index 6e2c107a89..3316720d47 100644
--- a/tests/osl/CMakeLists.txt
+++ b/tests/osl/CMakeLists.txt
@@ -26,7 +26,9 @@ if(OCIO_USE_SSE)
 endif(OCIO_USE_SSE)
 
 set_target_properties(test_osl_exec PROPERTIES 
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
+)
 
 # Because some OpenImageIO types are present in the OSL public API that dependency is mandatory;
 # however, the unit test framework itself does not have any OpenImageIO dependency.
diff --git a/tests/testutils/CMakeLists.txt b/tests/testutils/CMakeLists.txt
index ecf15498e9..91e9ca1989 100644
--- a/tests/testutils/CMakeLists.txt
+++ b/tests/testutils/CMakeLists.txt
@@ -8,7 +8,9 @@ set(SOURCES
 add_library(testutils STATIC ${SOURCES})
 
 set_target_properties(testutils PROPERTIES
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
+)
 
 target_include_directories(testutils
     PUBLIC
@@ -20,4 +22,3 @@ target_link_libraries(testutils
         apputils
         utils::strings
 )
-
diff --git a/tests/utils/CMakeLists.txt b/tests/utils/CMakeLists.txt
index 9fc1e6b474..242cad4f0d 100644
--- a/tests/utils/CMakeLists.txt
+++ b/tests/utils/CMakeLists.txt
@@ -17,6 +17,8 @@ target_link_libraries(test_utils_exec
 )
 
 set_target_properties(test_utils_exec PROPERTIES
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}")
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
+)
 
 add_test(NAME test_utils_exec COMMAND test_utils_exec)
diff --git a/vendor/openfx/CMakeLists.txt b/vendor/openfx/CMakeLists.txt
index 57e26253a1..d734cdf164 100644
--- a/vendor/openfx/CMakeLists.txt
+++ b/vendor/openfx/CMakeLists.txt
@@ -32,18 +32,19 @@ add_library(ofxplugin MODULE ${SOURCES} ${OFXS_SOURCES})
 
 # Disable known compiler warnings from OpenFX Support Library
 if(MSVC)
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} /wd4996 /wd4101")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/wd4996;/wd4101")
 else()
     # Some reported unused parameters in openfx are used when DEBUG macro is 
     # defined
-    set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -Wno-unused-parameter")
+    set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-Wno-unused-parameter")
 endif()
 
 set_target_properties(ofxplugin PROPERTIES
     PREFIX ""
     OUTPUT_NAME "OpenColorIO"
     SUFFIX ".ofx"
-    COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS}"
+    COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
+    LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"
 )
 
 target_include_directories(ofxplugin

From caa2fce7f98f35bc3db15bfa383932f642a16fed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?=
 <105517825+cedrik-fuoco-adsk@users.noreply.github.com>
Date: Thu, 9 Mar 2023 00:57:11 -0500
Subject: [PATCH 49/81] Adsk Contrib - Allow PyOpenColorIO module to load DLLs
 from Windows PATH environment variable (#1759)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Allow PyOpenColorIO module to load DLLs from Windows PATH environment variable with an opt-out option in case the user want the default behavior of Python 3.8+.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Fixing typos in comments

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

---------

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 src/bindings/python/CMakeLists.txt | 12 +++++++++---
 src/bindings/python/__init__.py    | 24 ++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 3 deletions(-)
 create mode 100644 src/bindings/python/__init__.py

diff --git a/src/bindings/python/CMakeLists.txt b/src/bindings/python/CMakeLists.txt
index d3c79db98a..8588700a0c 100644
--- a/src/bindings/python/CMakeLists.txt
+++ b/src/bindings/python/CMakeLists.txt
@@ -227,16 +227,22 @@ target_compile_definitions(PyOpenColorIO
 		PY_VERSION_PATCH=${Python_VERSION_PATCH}
 )
 
+# Set to site-package location.
 if(WIN32)
     set(_Python_VARIANT_PATH "${CMAKE_INSTALL_LIBDIR}/site-packages")
 else()
     set(_Python_VARIANT_PATH "${CMAKE_INSTALL_LIBDIR}/python${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}/site-packages")
 endif()
 
-# Create an internal global variable to access it in another scope but not publicly visible
-# using ccmake.
+# Create an internal global variable to access it in another scope but not publicly visible.
+# The site-package location is needed in setup_ocio.bat.in and setup_ocio.sh.in.
 set(PYTHON_VARIANT_PATH ${_Python_VARIANT_PATH} CACHE INTERNAL "")
 
+# Set to PyOpenColorIO site-package location.
+set(_PyOpenColorIO_SITE_PACKAGE_DIR "${PYTHON_VARIANT_PATH}/PyOpenColorIO")
+
 install(TARGETS PyOpenColorIO
-    LIBRARY DESTINATION ${_Python_VARIANT_PATH}
+    LIBRARY DESTINATION ${_PyOpenColorIO_SITE_PACKAGE_DIR}
 )
+
+install(FILES __init__.py DESTINATION ${_PyOpenColorIO_SITE_PACKAGE_DIR})
\ No newline at end of file
diff --git a/src/bindings/python/__init__.py b/src/bindings/python/__init__.py
new file mode 100644
index 0000000000..3ce17e051e
--- /dev/null
+++ b/src/bindings/python/__init__.py
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+import os, sys, platform
+
+#
+# Python 3.8+ has stopped loading DLLs from PATH environment variable on Windows.
+#
+# This code reproduce the old behavior (loading DLLs from PATH) by doing the following:
+# 1 - Tokenizing PATH
+# 2 - Checking that the directories exist and are not "."
+# 3 - Add them to the DLL load path.
+#
+# The behavior described above is opt-out which means that it is activated by default. 
+# A user can opt-out and use the default behavior of Python 3.8+ by setting OCIO_PYTHON_LOAD_DLLS_FROM_PATH 
+# environment variable to 0.
+#
+
+if sys.version_info >= (3, 8) and platform.system() == "Windows" and os.getenv("OCIO_PYTHON_LOAD_DLLS_FROM_PATH", "1") == "1":
+    for path in os.getenv("PATH", "").split(os.pathsep):
+        if os.path.exists(path) and path != ".":
+            os.add_dll_directory(path)
+
+from .PyOpenColorIO import *
\ No newline at end of file

From 39345e1d29f45a3a95bf55b43c2ad5d08d70823c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?=
 <105517825+cedrik-fuoco-adsk@users.noreply.github.com>
Date: Thu, 9 Mar 2023 01:59:12 -0500
Subject: [PATCH 50/81] Changing the build type check to be case-insensitive
 (#1765)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e927ed47db..9f3fa518f0 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -94,11 +94,26 @@ set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeR
 
 # Is that a valid build type?
 
-if(NOT "${CMAKE_BUILD_TYPE}" IN_LIST CMAKE_CONFIGURATION_TYPES)
+set(_build_type_valid_ false)
+foreach (_build_type_ ${CMAKE_CONFIGURATION_TYPES})
+    string(TOLOWER ${_build_type_} _lowercase_build_type_)
+    string(TOLOWER ${CMAKE_BUILD_TYPE} _lowercase_cmake_build_type_)
+
+    if (_lowercase_cmake_build_type_ STREQUAL _lowercase_build_type_)
+        # Build type is supported.
+        set(_build_type_valid_ true)
+        break()
+    endif()
+endforeach()
+unset(_lowercase_build_type_)
+unset(_lowercase_cmake_build_type_)
+
+if (NOT _build_type_valid_)
     string(REPLACE ";" ", " _CMAKE_CONFIGURATION_TYPES_STR "${CMAKE_CONFIGURATION_TYPES}")
     message(FATAL_ERROR 
             "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} is unsupported. Supported values are: ${_CMAKE_CONFIGURATION_TYPES_STR}.")
 endif()
+unset(_build_type_valid_)
 
 # Is that in debug mode?
 

From 1d7d99ec306eaa24edf3d979ad6ec0a9157e627a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?=
 <105517825+cedrik-fuoco-adsk@users.noreply.github.com>
Date: Wed, 22 Mar 2023 17:12:35 -0400
Subject: [PATCH 51/81] Adsk contrib - Add support for minimum and recommended
 versions for dependencies (#1777)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Implementation of ocio_find_package and ocio_install_package macro.
Implemention of minimum, maximum and recommended version.
Refactor the FindOpenImageIO to be inline with the rest of the custom find module (OCIO_USE_OIIO_CMAKE_CONFIG is no longer needed).
Added a new option called OCIO_VERBOSE. It allow the user to tell OCIO to display more information when searching and building the dependencies.
Splitted the find and install part into two files - Find<pkg>.cmake and Install<pkg>.cmake.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Removing a duplicate call to set_property in OpenImageIO find module.
Setting policy CMP0042 when building ZLIB since that project is using an old CMake version as the cmake_minimum_required and that version has no knowledge of the policy.
Fixing a potential issue in Installopenfx. Changing back the version to 1.4 as before.
Ignoring warning from OpenImageIO for imageioapphelpers. The same warning are ignored for ociodisplay, ocioconvert and ociolutimage.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Fixed an issue with FindOpenImageIO that was found while testing on macOS.
Renamed FindOpenShadingLanguage to FindOSL to match the project name used the OSL's CMakefile.
Cleanup the code for FindOSL and removed duplicate warning about needed C++14.

Updated the header comments for all Find and Install modules.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Fixing typo

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Renamed ocio_find_package to ocio_handle_dependency
Renamed ocio_install_package to ocio_install_dependency
The "Installing [...]" message from Install module is now under OCIO_VERBOSE variable. Added a message in ocio_install_dependency instead
Improve the colors usage in the logging

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Changed prefix for ocio_install_dependency since it was conflicting with the prefix of ocio_handle_dependency since they were the same.
Removed FindOpenEXR.cmake and FindOpenImageIO.cmake since they are not needed anymore (for differente reason).
Added PROMOTE_TARGET option for ocio_handle_dependency which promote the target to GLOBAL.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Mostly comments and documentations

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Small update of the Existing Install Hints section

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Changed RECOMMENDED_MIN_VERSION to RECOMMENDED_VERSION
Re-worded some of the comments

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Fixing issue with ocio_handle_dependency macro where it wasn't respecting OCIO_INSTALL_EXT_PACKAGES option correctly.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Adding more documentations and dropping support to look for static zlib since the user can update their cmake in order to do that.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Documentations

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Documentations

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* fixing typo

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Fix typo and fix issue when OCIO is installing ZLIB

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Tentative fix for Linux CI failure

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Ignoring specifics warnings on OpenImageIO target directly.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

* Removing OCIO_USE_OIIO_CMAKE_CONFIG as it is not needed anymore.

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>

---------

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                                |   8 +-
 docs/quick_start/installation.rst             |  69 +++--
 share/cmake/macros/VersionUtils.cmake         |  17 +-
 .../ocio_check_dependency_version.cmake       |  38 +++
 .../cmake/macros/ocio_handle_dependency.cmake | 234 ++++++++++++++++
 .../macros/ocio_install_dependency.cmake      |  46 +++
 share/cmake/modules/FindExtPackages.cmake     | 263 +++++++++++-------
 share/cmake/modules/FindImath.cmake           | 144 ++--------
 ...penShadingLanguage.cmake => FindOSL.cmake} | 103 +++----
 share/cmake/modules/FindOpenImageIO.cmake     | 174 ------------
 share/cmake/modules/FindSphinx.cmake          |  12 +-
 share/cmake/modules/Findexpat.cmake           | 150 ++--------
 share/cmake/modules/Findlcms2.cmake           | 125 +--------
 share/cmake/modules/Findminizip-ng.cmake      | 134 ++-------
 share/cmake/modules/Findopenfx.cmake          |  68 +----
 share/cmake/modules/Findpybind11.cmake        | 102 +------
 share/cmake/modules/Findpystring.cmake        | 120 ++------
 share/cmake/modules/Findyaml-cpp.cmake        | 151 ++--------
 .../cmake/modules/install/InstallImath.cmake  | 154 ++++++++++
 .../InstallOpenEXR.cmake}                     |  77 ++---
 .../modules/{ => install}/InstallZLIB.cmake   |  40 +--
 .../cmake/modules/install/Installexpat.cmake  | 161 +++++++++++
 .../cmake/modules/install/Installlcms2.cmake  | 142 ++++++++++
 .../modules/install/Installminizip-ng.cmake   | 154 ++++++++++
 .../cmake/modules/install/Installopenfx.cmake |  86 ++++++
 .../modules/install/Installpybind11.cmake     | 122 ++++++++
 .../modules/install/Installpystring.cmake     | 134 +++++++++
 .../modules/install/Installyaml-cpp.cmake     | 153 ++++++++++
 share/cmake/utils/CheckSupportGL.cmake        |  27 +-
 share/cmake/utils/Colors.cmake                |  16 ++
 share/cmake/utils/CompilerFlags.cmake         |   2 +
 src/cmake/Config.cmake.in                     |  51 +---
 32 files changed, 1904 insertions(+), 1373 deletions(-)
 create mode 100644 share/cmake/macros/ocio_check_dependency_version.cmake
 create mode 100644 share/cmake/macros/ocio_handle_dependency.cmake
 create mode 100644 share/cmake/macros/ocio_install_dependency.cmake
 rename share/cmake/modules/{FindOpenShadingLanguage.cmake => FindOSL.cmake} (62%)
 delete mode 100644 share/cmake/modules/FindOpenImageIO.cmake
 create mode 100644 share/cmake/modules/install/InstallImath.cmake
 rename share/cmake/modules/{FindOpenEXR.cmake => install/InstallOpenEXR.cmake} (75%)
 rename share/cmake/modules/{ => install}/InstallZLIB.cmake (78%)
 create mode 100644 share/cmake/modules/install/Installexpat.cmake
 create mode 100644 share/cmake/modules/install/Installlcms2.cmake
 create mode 100644 share/cmake/modules/install/Installminizip-ng.cmake
 create mode 100644 share/cmake/modules/install/Installopenfx.cmake
 create mode 100644 share/cmake/modules/install/Installpybind11.cmake
 create mode 100644 share/cmake/modules/install/Installpystring.cmake
 create mode 100644 share/cmake/modules/install/Installyaml-cpp.cmake
 create mode 100644 share/cmake/utils/Colors.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9f3fa518f0..932874f81c 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,6 +12,7 @@ set(CMAKE_MODULE_PATH
     ${CMAKE_SOURCE_DIR}/share/cmake/utils
     ${CMAKE_SOURCE_DIR}/share/cmake/macros
     ${CMAKE_SOURCE_DIR}/share/cmake/modules
+    ${CMAKE_SOURCE_DIR}/share/cmake/modules/install
 )
 
 set(CMAKE_WARN_DEPRECATED ON)
@@ -156,6 +157,9 @@ if (WIN32)
     option(OCIO_USE_WINDOWS_UNICODE "Compile with Windows Unicode support" ON)
 endif()
 
+###############################################################################
+# Other preferences
+option(OCIO_VERBOSE "Display more information when searching or installing dependencies" OFF)
 
 ###############################################################################
 # Warnings / debugging settings
@@ -171,13 +175,13 @@ endif()
 # Optimization / internal linking preferences
 
 option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON)
-option(OCIO_USE_OIIO_CMAKE_CONFIG "Specify whether to look for OIIO using the generated CMake Config script instead of the custom FindOpenImageIO.cmake script" OFF)
 option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." OFF)
 
 
 ###############################################################################
 # GPU configuration
-
+message(STATUS "")
+message(STATUS "Checking for GPU configuration...")
 include(CheckSupportGL)
 
 
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index 600fa89e12..788ae7e16b 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -187,10 +187,26 @@ Three ``OCIO_INSTALL_EXT_PACKAGES`` options are available::
 Existing Install Hints
 ++++++++++++++++++++++
 
-When using libraries already on your system, the CMake variable 
-``-D <Package Name>_ROOT=<Path>`` may be used to specify the path to the include and 
-library root directory rather than have CMake try to find it.  The package names used 
-by OCIO are as follows (note that these are case-sensitive):
+If the library is not installed in a typical location where CMake will find it, 
+you may specify the location using one of the following methods:
+
+- Set ``-D<package_name>_DIR`` to point to the directory containing the CMake configuration file for the package.
+
+- Set ``-D<package_name>_ROOT`` to point to the directory containing the lib and include directories.
+
+- Set ``-D<package_name>_LIBRARY`` and ``-D<package_name>_INCLUDE_DIR`` to point to the lib and include directories.
+
+Not all packages support all of the above options. Please refer the 
+OCIO CMake `find modules <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/share/cmake/modules>`_  for the package that you are having trouble with to see the options it supports.
+
+Usually CMake will use the dynamic library rather than static, if both are present. In this case, 
+you may set <package_name>_STATIC_LIBRARY to ON to request use of the static one. If only the 
+static library is present (such as when OCIO builds the dependency), then the option is not needed.
+The following packages support this option:
+``expat``, ``yaml-cpp``, ``Imath``, ``lcms2``, and ``minizip-ng``. Using CMake 3.24+, it is
+possible to prefer the static version of ``ZLIB`` with ``-DZLIB_USE_STATIC_LIBS=ON``.
+
+The package names used by OCIO are as follows (note that these are case-sensitive):
 
 Required:
 
@@ -214,21 +230,6 @@ Optional:
 - ``GLUT``
 - ``Python``
 
-There are scenarios in which some of the dependencies may not be compiled into an 
-OCIO dynamic library.  This is more likely when OCIO does not download the packages
-itself.  In these cases, it may be helpful to additionally specify the CMake variable
-``-D <Package Name>_STATIC_LIBRARY=ON``. The following package names support this hint:
-``expat``, ``yaml-cpp``, ``Imath``, ``lcms2``, ``ZLIB``, and ``minizip-ng``.
-
-Rather than using ``_ROOT``, and possibly ``_STATIC_LIBRARY``, you may instead use
-``-D <Package Name>_LIBRARY=<Path>`` and ``-D <Package Name>_INCLUDE_DIR=<Path>``.
-In this case, the library path will control whether a static or dynamic library is used.
-It may also be used to handle situations where the library and/or include files are not
-in the typical location relative to the root directory.
-
-The OCIO `CMake find modules <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/share/cmake/modules>`_ 
-may be consulted for more detail on the handling of a given package and the CMake
-variables it uses.
 
 Please note that if you provide your own ``minizip-ng``, rather than having OCIO's CMake
 download and build it, you will likely need to set its CMake variables the same way
@@ -384,7 +385,7 @@ Windows
 While build environments may vary between users, the recommended way to build OCIO from source on 
 Windows 7 or newer is to use the scripts provided in the Windows 
 `share <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/share/dev/windows>`_ 
-section of the OCIO repository. There are two scripts currently available. 
+section of the OCIO repository. There are two scripts currently available.
 
 The first script is called 
 `ocio_deps.bat <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/share/dev/windows/ocio_deps.bat>`_ 
@@ -396,14 +397,18 @@ and it provides some automation to install the most difficult dependencies. Thos
 - Glew
 - Python dependencies for documentation
 
-Run this command to execute the ocio_deps.bat script::
+Run this command to execute the ocio_deps.bat script:
 
+.. code-block:: bash
+    
     ocio_deps.bat --vcpkg <path to current vcpkg installation or where it should be installed>
 
 The second script is called 
 `ocio.bat <https://github.com/AcademySoftwareFoundation/OpenColorIO/tree/main/share/dev/windows/ocio.bat>`_ 
 and it provide a way to configure and build OCIO from source. Moreover, this script executes the 
-install step of ``cmake`` as well as the unit tests. The main use case is the following::
+install step of ``cmake`` as well as the unit tests. The main use case is the following:
+
+.. code-block:: bash
 
     ocio.bat --b <path to build folder> --i <path to install folder> 
     --vcpkg <path to vcpkg installation> --ocio <path to ocio repository> --type Release
@@ -422,14 +427,26 @@ Quick environment configuration
 
 The quickest way to set the required :ref:`environment-setup` is to
 source the ``share/ocio/setup_ocio.sh`` script installed with OCIO.
-On Windows, use the corresponding setup_ocio.bat file.
+On Windows, use the corresponding setup_ocio.bat file. See OCIO's install directory under 
+share/ocio.
+
+For a temporary configuration of your terminal, you can run the following script:
+
+.. code-block:: bash
+
+   # Windows - Execute setup_ocio.bat
+   [... path to OCIO install directory]/share/ocio/setup_ocio.bat
+   # Unix - Execute setup_ocio.sh
+   [... path to OCIO install directory]\share\ocio\setup_ocio.sh
 
-For a simple single-user setup, add the following to ``~/.bashrc``
+For a more permanent option, add the following to ``~/.bashrc``
 (assuming you are using bash, and the example install directory of
-``/software/ocio``)::
+``/software/ocio``):
 
-    source /software/ocio/share/ocio/setup_ocio.sh
+.. code-block:: bash
 
+   source /software/ocio/share/ocio/setup_ocio.sh
+    
 The only environment variable you must configure manually is
 :envvar:`OCIO`, which points to the configuration file you wish to
 use. For prebuilt config files, see the
diff --git a/share/cmake/macros/VersionUtils.cmake b/share/cmake/macros/VersionUtils.cmake
index 9349c5da56..0283774e4c 100644
--- a/share/cmake/macros/VersionUtils.cmake
+++ b/share/cmake/macros/VersionUtils.cmake
@@ -6,8 +6,19 @@
 
 macro(split_version_string version_var output_prefix)
     string(REPLACE "." ";" _version_var_list ${version_var})
-    list(GET _version_var_list 0 ${output_prefix}_VERSION_MAJOR)
-    list(GET _version_var_list 1 ${output_prefix}_VERSION_MINOR)
-    list(GET _version_var_list 2 ${output_prefix}_VERSION_PATCH)
+    list(LENGTH _version_var_list _version_var_list_length)
+
+    if (_version_var_list_length GREATER_EQUAL 1)
+        list(GET _version_var_list 0 ${output_prefix}_VERSION_MAJOR)
+    endif()
+
+    if (_version_var_list_length GREATER_EQUAL 2)
+        list(GET _version_var_list 1 ${output_prefix}_VERSION_MINOR)
+    endif()
+
+    if (_version_var_list_length GREATER_EQUAL 3)
+        list(GET _version_var_list 2 ${output_prefix}_VERSION_PATCH)
+    endif()
+
     unset(_version_var_list)
 endmacro()
diff --git a/share/cmake/macros/ocio_check_dependency_version.cmake b/share/cmake/macros/ocio_check_dependency_version.cmake
new file mode 100644
index 0000000000..2c2b741192
--- /dev/null
+++ b/share/cmake/macros/ocio_check_dependency_version.cmake
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+###################################################################################################
+# ocio_check_dependency_version try to find the specified dependency and validate the version.
+#
+# Note that a function is used here to scoped-in any variables set by find_package. We do not want
+# those variables to be propagated to the caller of the function.
+#
+# Argument:
+#   dep_name is the name of the dependency (package). Please note that dep_name is case sensitive.
+#
+###################################################################################################
+
+function (ocio_check_dependency_version dep_name output)
+    cmake_parse_arguments(
+        # prefix - Must be different than the one used in ocio_handle_dependency.cmake.
+        ocio_cdv
+        # options
+        ""
+        # one value keywords
+        "MIN_VERSION"
+        # multi value keywords
+        ""
+        # args
+        ${ARGN})
+        
+    if (dep_name)
+        find_package(${dep_name} ${ocio_cdv_UNPARSED_ARGUMENTS})
+        if (ocio_cdv_MIN_VERSION AND ${dep_name}_VERSION)
+            if (${${dep_name}_VERSION} VERSION_GREATER_EQUAL ocio_cdv_MIN_VERSION)
+                set(${output} TRUE)
+            else()
+                set(${output} FALSE)
+            endif()
+        endif()
+    endif()
+endfunction()
\ No newline at end of file
diff --git a/share/cmake/macros/ocio_handle_dependency.cmake b/share/cmake/macros/ocio_handle_dependency.cmake
new file mode 100644
index 0000000000..f7bb593561
--- /dev/null
+++ b/share/cmake/macros/ocio_handle_dependency.cmake
@@ -0,0 +1,234 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+include(ocio_install_dependency)
+
+###################################################################################################
+# ocio_print_versions_error is a wrapper for messages when the dependency is not found.
+#
+# Arguments:
+#   dep_name is the name of the dependency (package). Please note that dep_name is case sensitive.
+#   message_color is the color of the message.
+#
+# Note that this macro is used in ocio_handle_dependency and should be there only.
+###################################################################################################
+macro (ocio_print_versions_error dep_name message_color)
+    if(NOT ocio_dep_MIN_VERSION AND NOT ocio_dep_MAX_VERSION AND NOT ocio_dep_RECOMMENDED_VERSION)
+        message(STATUS "${message_color}Could NOT find ${dep_name} (no version specified)${ColorReset}")
+    elseif(NOT ocio_dep_MIN_VERSION AND NOT ocio_dep_RECOMMENDED_VERSION)
+        message(STATUS "${message_color}Could NOT find ${dep_name} (maximum version: \"${ocio_dep_MAX_VERSION)\"${ColorReset}")
+    elseif(NOT ocio_dep_MIN_VERSION AND NOT ocio_dep_MAX_VERSION)
+        message(STATUS "${message_color}Could NOT find ${dep_name} (recommended version: \"${ocio_dep_RECOMMENDED_VERSION}\")\"${ColorReset}")
+    elseif(NOT ocio_dep_RECOMMENDED_VERSION AND NOT ocio_dep_MAX_VERSION)
+        message(STATUS "${message_color}Could NOT find ${dep_name} (minimum version: \"${ocio_dep_MIN_VERSION})\"${ColorReset}")
+    elseif(NOT ocio_dep_MIN_VERSION)
+        message(STATUS "${message_color}Could NOT find ${dep_name} (recommended version: \"${ocio_dep_RECOMMENDED_VERSION}\", maximum version: \"${ocio_dep_MAX_VERSION}\")${ColorReset}")
+    elseif(NOT ocio_dep_RECOMMENDED_VERSION)
+        message(STATUS "${message_color}Could NOT find ${dep_name} (minimum version: \"${ocio_dep_MIN_VERSION}\",  maximum version: \"${ocio_dep_MAX_VERSION}\")${ColorReset}")
+    elseif(NOT ocio_dep_MAX_VERSION)
+        message(STATUS "${message_color}Could NOT find ${dep_name} (minimum version: \"${ocio_dep_MIN_VERSION}\", recommended version: \"${ocio_dep_RECOMMENDED_VERSION}\")${ColorReset}")
+    else()
+        message(STATUS "${message_color}Could NOT find ${dep_name} (minimum version: \"${ocio_dep_MIN_VERSION}\", recommended version: \"${ocio_dep_RECOMMENDED_VERSION}\", max: \"${ocio_dep_MAX_VERSION}\")${ColorReset}")
+    endif()
+endmacro()
+
+# Using a macro because OCIO wants the scope to be the same as the caller scope.
+# Find modules will set variables via the PARENT_SCOPE option and OCIO needs those 
+# variables in the caller scope.
+
+###################################################################################################
+# ocio_handle_dependency is a wrapper for find_package with extra options (features).
+#
+# Argument:
+#   dep_name is the name of the dependency (package). Please note that dep_name is case sensitive.
+#
+# Options (no value): 
+#   REQUIRED                        - Whether the dependency is required or not. Fails if missing.
+#   PREFER_CONFIG                   - Call find_package in CONFIG mode internally. 
+#                                     Note that it tries to find <dep_name>Config.cmake or 
+#                                     <dep_name>-Config.cmake directly.
+#   ALLOW_INSTALL                   - Try to install the dependency if not found. Note that a 
+#                                     corresponding Install<dep_name>.cmake file must exist.
+#   VERBOSE                         - Enable extra logging.
+#
+# Options (one value):
+#   MIN_VERSION                     - Minimum version for the dependency.
+#   MAX_VERSION                     - Maximum version for the dependency.
+#   RECOMMENDED_VERSION         - Recommended version for the dependency.
+#   RECOMMENDED_VERSION_REASON  - Reason for the recommended version.
+#
+# Options (multiple values):
+#   VERSION_VARS                    - List of version variables. Default to <dep_name>_VERSION.
+#   COMPONENTS                      - List of components to find. Just like find_package option.
+#   PROMOTE_TARGET                  - List of targets that needs to be globally visible by setting 
+#                                     the targets property IMPORTED_GLOBAL to true. Note that the
+#                                     targets must be created by the dependency.
+#
+# This is a macro because OCIO wants the scope to be the same as the caller scope.
+# Find modules will set variables via the PARENT_SCOPE option and OCIO needs those 
+# variables in the caller scope.
+#
+###################################################################################################
+
+macro (ocio_handle_dependency dep_name)
+    cmake_parse_arguments(
+        # prefix
+        ocio_dep
+        # options
+        "REQUIRED;PREFER_CONFIG;ALLOW_INSTALL;VERBOSE"
+        # one value keywords
+        "MIN_VERSION;MAX_VERSION;RECOMMENDED_VERSION;RECOMMENDED_VERSION_REASON;PROMOTE_TARGET"
+        # multi value keywords
+        "VERSION_VARS;COMPONENTS"
+        # args
+        ${ARGN})
+
+    set(ocio_dep_FORCE_INSTALLATION OFF)
+    if(ocio_dep_ALLOW_INSTALL AND OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
+        set(ocio_dep_FORCE_INSTALLATION ON)
+    endif()
+    
+    if(NOT ocio_dep_FORCE_INSTALLATION)
+        set(ocio_dep_QUIET_string "")
+        # Do not set to QUIET when OCIO_VERBOSE is ON.
+        if(NOT ocio_dep_VERBOSE AND NOT OCIO_VERBOSE)
+            set(${dep_name}_FIND_QUIETLY true)
+            set(ocio_dep_QUIET_string "QUIET")
+        endif()
+
+        set(ocio_dep_CONFIG_string "")
+        if(ocio_dep_PREFER_CONFIG)
+            set(ocio_dep_CONFIG_string "CONFIG")
+        endif()
+
+        set(ocio_dep_COMPONENTS_string "")
+        if(ocio_dep_COMPONENTS)
+            set(ocio_dep_COMPONENTS_string "COMPONENTS")
+        endif()
+
+        if (${dep_name}_FOUND)
+            # Nothing to do. Already found.
+        else()
+            # Try to find the recommended, or higher, version.
+            # Note that the recommended version should always be specified.
+            find_package(${dep_name}
+                        ${ocio_dep_RECOMMENDED_VERSION}
+                        ${ocio_dep_CONFIG_string}
+                        ${ocio_dep_COMPONENTS_string} 
+                        ${ocio_dep_COMPONENTS}
+                        ${ocio_dep_QUIET_string}
+                        ${ocio_dep_UNPARSED_ARGUMENTS})
+
+            if (NOT ${dep_name}_FOUND AND ocio_dep_PREFER_CONFIG)
+                # Try find_package in module mode instead of config mode.
+                find_package(${dep_name}
+                    ${ocio_dep_RECOMMENDED_VERSION}
+                    ${ocio_dep_COMPONENTS_string} 
+                    ${ocio_dep_COMPONENTS}
+                    ${ocio_dep_QUIET_string}
+                    ${ocio_dep_UNPARSED_ARGUMENTS})
+            endif()
+            
+            if(NOT ${dep_name}_FOUND AND ocio_dep_MIN_VERSION
+            AND NOT ocio_dep_MIN_VERSION VERSION_EQUAL ocio_dep_MAX_VERSION
+            AND NOT ocio_dep_MIN_VERSION VERSION_EQUAL ocio_dep_RECOMMENDED_VERSION)
+
+                # if the recommended, or higher, version is not found, try to find dependency with 
+                # the minimum version.
+                find_package(${dep_name} 
+                            ${ocio_dep_MIN_VERSION} 
+                            ${ocio_dep_CONFIG_string}
+                            ${ocio_dep_COMPONENTS_string} 
+                            ${ocio_dep_COMPONENTS}
+                            ${ocio_dep_QUIET_string}
+                            ${ocio_dep_UNPARSED_ARGUMENTS})
+                if (NOT ${dep_name}_FOUND AND ocio_dep_PREFER_CONFIG)
+                    # Try find_package in module mode instead of config mode.
+                    find_package(${dep_name} 
+                        ${ocio_dep_MIN_VERSION}
+                        ${ocio_dep_COMPONENTS_string} 
+                        ${ocio_dep_COMPONENTS}
+                        ${ocio_dep_QUIET_string}
+                        ${ocio_dep_UNPARSED_ARGUMENTS})
+                endif()
+            endif()
+        endif()
+        
+        # Check which VERSION_VARS was set by find_package.
+        set(_VERSION_VAR "${dep_name}_VERSION")
+        foreach (_vervar ${ocio_dep_VERSION_VARS})
+            if(${_vervar})
+                set(_VERSION_VAR ${_vervar})
+                break()
+            endif()
+        endforeach()
+
+        if(_VERSION_VAR)
+            set(ocio_dep_VERSION ${${_VERSION_VAR}})
+        endif()
+
+        # Expecting that the minimum and recommended version are always provided.
+        # Make sure that the version is within the valid range.
+        if(${dep_name}_FOUND)
+            if (ocio_dep_VERSION)
+                # Make sure that the version found is not greater than the maximum version.
+                if(DEFINED ocio_dep_MAX_VERSION)
+                    if(ocio_dep_VERSION VERSION_GREATER ocio_dep_MAX_VERSION)
+                        # Display it as an error, but do not abort right now.
+                        message(SEND_ERROR "${ColorError}Found ${dep_name} ${ocio_dep_VERSION}, but it is over the maximum version \"${ocio_dep_MAX_VERSION}\" ${ColorReset}")
+                        set(_${dep_name}_found_displayed true)
+                    endif()
+                endif()
+                
+                if(DEFINED ocio_dep_RECOMMENDED_VERSION)
+                    if (ocio_dep_VERSION VERSION_LESS ocio_dep_RECOMMENDED_VERSION)
+                        message(STATUS "${ColorSuccess}Found ${dep_name} (version \"${ocio_dep_VERSION}\") (recommended version: \"${ocio_dep_RECOMMENDED_VERSION}\")${ColorReset}")
+                        if (ocio_dep_RECOMMENDED_VERSION_REASON)
+                            message(STATUS "   Reason: ${ocio_dep_RECOMMENDED_VERSION_REASON}")
+                        endif()
+                        set(_${dep_name}_found_displayed true)
+                    endif()
+                endif()
+                
+                if(NOT _${dep_name}_found_displayed)
+                    message(STATUS "${ColorSuccess}Found ${dep_name} (version \"${ocio_dep_VERSION}\")${ColorReset}")
+                endif()
+            else()
+                message(STATUS "${ColorSuccess}Found ${dep_name} (no version information)${ColorReset}")
+            endif()
+        else()
+            if(ocio_dep_REQUIRED AND NOT ocio_dep_ALLOW_INSTALL)
+                set(message_color "${ColorError}")
+            else()
+                set(message_color "${ColorReset}")
+            endif()
+
+            ocio_print_versions_error(${dep_name} ${message_color})
+
+            if(${dep_name}_ROOT)
+                message(STATUS "${message_color}   ${dep_name}_ROOT was: ${${dep_name}_ROOT} ${ColorReset}")
+            elseif($ENV{${dep_name}_ROOT})
+                message(STATUS "${message_color}   ENV ${dep_name}_ROOT was: ${${dep_name}_ROOT} ${ColorReset}")
+            endif()
+
+            if(ocio_dep_ALLOW_INSTALL)
+                ocio_install_dependency(${dep_name} VERSION ${ocio_dep_RECOMMENDED_VERSION})
+            endif()
+
+            if(ocio_dep_REQUIRED)
+                if(NOT ${dep_name}_FOUND AND NOT ocio_dep_VERSION)
+                    message(SEND_ERROR "${ColorError}${dep_name} is required, will abort at the end.${ColorReset}")
+                endif()
+            endif()
+        endif()
+    elseif(ocio_dep_FORCE_INSTALLATION)
+        # Skip the search and install dependency right away.
+        ocio_install_dependency(${dep_name} VERSION ${ocio_dep_RECOMMENDED_VERSION})
+    endif()
+
+    if(${dep_name}_FOUND AND ocio_dep_PROMOTE_TARGET)
+        foreach (_target_to_be_promoted_ ${ocio_dep_PROMOTE_TARGET})
+            set_target_properties(${_target_to_be_promoted_} PROPERTIES IMPORTED_GLOBAL TRUE)
+        endforeach()
+    endif()
+endmacro()
\ No newline at end of file
diff --git a/share/cmake/macros/ocio_install_dependency.cmake b/share/cmake/macros/ocio_install_dependency.cmake
new file mode 100644
index 0000000000..f5b6a4e893
--- /dev/null
+++ b/share/cmake/macros/ocio_install_dependency.cmake
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+###################################################################################################
+# ocio_install_dependency installs a dependency by calling the corresponding Install module.
+# e.g. Install<dep_name>.cmake
+#
+# Argument:
+#   dep_name is the name of the dependency (package). Please note that dep_name is case sensitive.
+#
+# Options (one value):
+#   VERSION                     - Version to install.
+###################################################################################################
+macro (ocio_install_dependency dep_name)
+    cmake_parse_arguments(
+        # prefix - Must be different than the one used in ocio_handle_dependency.cmake.
+        ocio_id
+        # options
+        ""
+        # one value keywords
+        "VERSION;PROMOTE_TARGET"
+        # multi value keywords
+        ""
+        # args
+        ${ARGN})
+
+
+    if(NOT ${dep_name}_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+        set(OCIO_${dep_name}_RECOMMENDED_VERSION ${ocio_id_VERSION})
+        include(Install${dep_name})
+        set(_${dep_name}_built_by_ocio TRUE)
+    endif()
+
+    if(${dep_name}_FOUND)
+        if(${dep_name}_FOUND AND ocio_id_PROMOTE_TARGET)
+            foreach (_target_to_be_promoted_ ${ocio_id_PROMOTE_TARGET})
+                set_target_properties(${_target_to_be_promoted_} PROPERTIES IMPORTED_GLOBAL TRUE)
+            endforeach()
+        endif()
+
+        if (${_${dep_name}_built_by_ocio})
+           message(STATUS "${ColorSuccess}Installed ${dep_name} (version \"${ocio_id_VERSION}\")${ColorReset}")
+        endif()
+    endif()
+    unset(_${dep_name}_built_by_ocio)
+endmacro()
\ No newline at end of file
diff --git a/share/cmake/modules/FindExtPackages.cmake b/share/cmake/modules/FindExtPackages.cmake
index 25cfacb497..870b039bfc 100644
--- a/share/cmake/modules/FindExtPackages.cmake
+++ b/share/cmake/modules/FindExtPackages.cmake
@@ -6,6 +6,9 @@
 # by the OCIO_INSTALL_EXT_PACKAGES option.
 #
 
+include(Colors)
+include(ocio_handle_dependency)
+
 ###############################################################################
 ### Global package options ###
 
@@ -32,116 +35,95 @@ if (APPLE)
     set(CMAKE_FIND_APPBUNDLE LAST)
 endif()
 
+
+message(STATUS "")
+message(STATUS "Missing a dependency? Try the following possibilities:")
+message(STATUS "If the package provides CMake's configuration file, use -D<pkg>_DIR=<path to folder>.")
+message(STATUS "If it doesn't provide it, try -D<pkg>_ROOT=<path to folder with lib and includes>.")
+message(STATUS "Alternatively, try -D<pkg>_LIBRARY=<path to lib file> and/or -D<pkg>_INCLUDE_DIR=<path to folder>.")
+message(STATUS "")
+message(STATUS "Please refer to the find module under share/cmake/modules for extra information.")
+
 ###############################################################################
-### Packages and versions ###
+##
+## Required dependencies
+##
+###############################################################################
+message(STATUS "")
+message(STATUS "Checking for mandatory dependencies...")
 
 # expat
 # https://github.com/libexpat/libexpat
-find_package(expat 2.4.1 REQUIRED)
+ocio_handle_dependency(  expat REQUIRED ALLOW_INSTALL
+                         MIN_VERSION 2.4.1
+                         RECOMMENDED_VERSION 2.5.0
+                         RECOMMENDED_VERSION_REASON "CVE fixes and fix issue with symbol leakage when built as a static library")
 
 # yaml-cpp
 # https://github.com/jbeder/yaml-cpp
-find_package(yaml-cpp 0.7.0 REQUIRED)
+ocio_handle_dependency(  yaml-cpp REQUIRED ALLOW_INSTALL
+                         MIN_VERSION 0.6.3
+                         RECOMMENDED_VERSION 0.7.0
+                         RECOMMENDED_VERSION_REASON "Latest version tested with OCIO")
 
 # pystring
 # https://github.com/imageworks/pystring
-find_package(pystring 1.1.3 REQUIRED)
+ocio_handle_dependency(  pystring REQUIRED ALLOW_INSTALL
+                         MIN_VERSION 1.1.3
+                         RECOMMENDED_VERSION 1.1.3
+                         RECOMMENDED_VERSION_REASON "Latest version tested with OCIO")
 
 # Imath (>=3.1)
 # https://github.com/AcademySoftwareFoundation/Imath
-set(_Imath_ExternalProject_VERSION "3.1.6")
-find_package(Imath 3.0 REQUIRED)
+ocio_handle_dependency(  Imath REQUIRED ALLOW_INSTALL
+                         MIN_VERSION 3.1.1
+                         RECOMMENDED_VERSION 3.1.6
+                         RECOMMENDED_VERSION_REASON "Latest version tested with OCIO")
 
 ###############################################################################
-### ZLIB (https://github.com/madler/zlib)
-###
-### The following variables can be set:
-### ZLIB_ROOT               Location of ZLIB library file and includes folder.
-###                         Alternatively, ZLIB_LIBRARY and ZLIB_INCLUDE_DIR can be used.
-###
-### ZLIB_LIBRARY            Location of ZLIB library file.
-### ZLIB_INCLUDE_DIR        Location of ZLIB includes folder.
-###
-### ZLIB_VERSION            ZLIB Version (CMake 3.26+)
-### ZLIB_VERSION_STRING     ZLIB Version (CMake < 3.26)
-###
+# ZLIB (https://github.com/madler/zlib)
+#
+# The following variables can be set:
+# ZLIB_ROOT               Location of ZLIB library file and includes folder.
+#                         Alternatively, ZLIB_LIBRARY and ZLIB_INCLUDE_DIR can be used.
+#
+# ZLIB_LIBRARY            Location of ZLIB library file.
+# ZLIB_INCLUDE_DIR        Location of ZLIB includes folder.
+#
+# ZLIB_VERSION            ZLIB Version (CMake 3.26+)
+# ZLIB_VERSION_STRING     ZLIB Version (CMake < 3.26)
+#
+#
+# ZLIB_USE_STATIC_LIBS    Set to ON if static library is prefered (CMake 3.24+)
+#
 ###############################################################################
 # ZLIB 1.2.13 is used since it fixes a critical vulnerability.
 # See https://nvd.nist.gov/vuln/detail/CVE-2022-37434
 # See https://github.com/madler/zlib/releases/tag/v1.2.13
-set(_ZLIB_FIND_VERSION "1.2.13")
-set(_ZLIB_ExternalProject_VERSION ${_ZLIB_FIND_VERSION})
-
-if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
-    # ZLIB_USE_STATIC_LIBS is supported only from CMake 3.24+.
-    if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24.0") 
-        if (ZLIB_STATIC_LIBRARY)
-            set(ZLIB_USE_STATIC_LIBS "${ZLIB_STATIC_LIBRARY}")
-        endif()
-    else() # For CMake < 3.24 since ZLIB_USE_STATIC_LIBS is not available.
-        if(NOT ZLIB_LIBRARY)
-            if(DEFINED CMAKE_FIND_LIBRARY_PREFIXES)
-                set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES "${CMAKE_FIND_LIBRARY_PREFIXES}")
-            else()
-                set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES)
-            endif()
-
-            if(DEFINED CMAKE_FIND_LIBRARY_SUFFIXES)
-                set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_FIND_LIBRARY_SUFFIXES}")
-            else()
-                set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES)
-            endif()
-
-            # Prefix/suffix for windows.
-            if(WIN32)
-                list(APPEND CMAKE_FIND_LIBRARY_PREFIXES "" "lib")
-                list(APPEND CMAKE_FIND_LIBRARY_SUFFIXES ".dll.a")
-            endif()
-
-            # Check if static lib is preferred.
-            if(ZLIB_STATIC_LIBRARY OR ZLIB_USE_STATIC_LIBS)
-                if(WIN32)
-                    set(CMAKE_FIND_LIBRARY_SUFFIXES .lib .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
-                else()
-                    set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
-                endif()
-            endif()
-        endif()
-    endif()
-
-    set(_ZLIB_REQUIRED REQUIRED)
-    # Override REQUIRED if package can be installed
-    if(OCIO_INSTALL_EXT_PACKAGES STREQUAL MISSING)
-        set(_ZLIB_REQUIRED "")
-    endif()
-
-    find_package(ZLIB ${_ZLIB_FIND_VERSION} ${_ZLIB_REQUIRED})
-
-    # Restore the original find library ordering
-    if(DEFINED _ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES)
-        set(CMAKE_FIND_LIBRARY_SUFFIXES "${_ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}")
-    else()
-        set(CMAKE_FIND_LIBRARY_SUFFIXES)
-    endif()
+ocio_handle_dependency(  ZLIB REQUIRED ALLOW_INSTALL
+                         MIN_VERSION 1.2.10
+                         RECOMMENDED_VERSION 1.2.13
+                         RECOMMENDED_VERSION_REASON "CVE fixes"
+                         VERSION_VARS ZLIB_VERSION_STRING ZLIB_VERSION )
 
-    if(DEFINED _ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES)
-        set(CMAKE_FIND_LIBRARY_PREFIXES "${_ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES}")
-    else()
-        set(CMAKE_FIND_LIBRARY_PREFIXES)
-    endif()
-endif()
-
-if(NOT ZLIB_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
-    include(InstallZLIB)
-endif()
 ###############################################################################
 
 # minizip-ng
 # https://github.com/zlib-ng/minizip-ng
-find_package(minizip-ng 3.0.7 REQUIRED)
+ocio_handle_dependency(  minizip-ng REQUIRED ALLOW_INSTALL
+                         MIN_VERSION 3.0.6
+                         RECOMMENDED_VERSION 3.0.7
+                         RECOMMENDED_VERSION_REASON "Latest version tested with OCIO")
 
-if(OCIO_BUILD_APPS)
+###############################################################################
+##
+## Optional dependencies
+##
+###############################################################################
+message(STATUS "")
+message(STATUS "Checking for optional dependencies...")
 
+if(OCIO_BUILD_APPS)
     # NOTE: Depending of the compiler version lcms2 2.2 does not compile with 
     # C++17 so, if you change the lcms2 version update the code to compile 
     # lcms2 and dependencies with C++17 or higher i.e. remove the cap of C++ 
@@ -149,13 +131,19 @@ if(OCIO_BUILD_APPS)
 
     # lcms2
     # https://github.com/mm2/Little-CMS
-    find_package(lcms2 2.2 REQUIRED)
+    ocio_handle_dependency(  lcms2 REQUIRED ALLOW_INSTALL
+                             MIN_VERSION 2.2
+                             RECOMMENDED_VERSION 2.2
+                             RECOMMENDED_VERSION_REASON "Latest version tested with OCIO")
 endif()
 
 if(OCIO_BUILD_OPENFX)
     # openfx
     # https://github.com/ofxa/openfx
-    find_package(openfx 1.4 REQUIRED)
+    ocio_handle_dependency(  openfx REQUIRED ALLOW_INSTALL
+                             MIN_VERSION 1.4
+                             RECOMMENDED_VERSION 1.4
+                             RECOMMENDED_VERSION_REASON "Latest version tested with OCIO")
 endif()
 
 if (OCIO_PYTHON_VERSION AND NOT OCIO_BUILD_PYTHON)
@@ -177,44 +165,110 @@ if(OCIO_BUILD_PYTHON OR OCIO_BUILD_DOCS)
         list(APPEND _Python_COMPONENTS Development)
     endif()
 
+
     # Python
-    find_package(Python ${OCIO_PYTHON_VERSION} REQUIRED
-                 COMPONENTS ${_Python_COMPONENTS})
+    ocio_handle_dependency(  Python REQUIRED
+                             COMPONENTS ${_Python_COMPONENTS}
+                             MIN_VERSION ${OCIO_PYTHON_VERSION}
+                             RECOMMENDED_VERSION ${OCIO_PYTHON_VERSION}
+                             RECOMMENDED_VERSION_REASON "Latest version tested with OCIO")
 
     if(OCIO_BUILD_PYTHON)
         # pybind11
         # https://github.com/pybind/pybind11
         # pybind11 2.9 fixes issues with MS Visual Studio 2022 (Debug).
-        find_package(pybind11 2.9.2 REQUIRED)
+        ocio_handle_dependency(  pybind11 REQUIRED ALLOW_INSTALL
+                                 MIN_VERSION 2.9.2
+                                 RECOMMENDED_VERSION 2.9.2
+                                 RECOMMENDED_VERSION_REASON "Pybind 2.10.0+ does not work with Python 2.7 anymore")
     endif()
 endif()
 
+# Set OpenEXR Minimum version as a variable since it it used at multiple places.
+set(OpenEXR_MININUM_VERSION "3.0.5")
 if((OCIO_BUILD_APPS AND OCIO_USE_OIIO_FOR_APPS) OR OCIO_BUILD_TESTS)
     # OpenImageIO is required for OSL unit test and optional for apps.
 
     # OpenImageIO
     # https://github.com/OpenImageIO/oiio
     set(OIIO_VERSION "2.2.14")
-
-    if(OCIO_USE_OIIO_CMAKE_CONFIG)
-        # TODO: Try when OIIO 2.4 is released (https://github.com/OpenImageIO/oiio/pull/3322).
-        # set(OPENIMAGEIO_CONFIG_DO_NOT_FIND_IMATH 1)
-        find_package(OpenImageIO ${OIIO_VERSION} CONFIG)
+    set(OIIO_RECOMMENDED_VERSION "2.4")
+
+    # Supported from OIIO 2.4+. Setting this for lower versions doesn't affect anything.
+    set(OPENIMAGEIO_CONFIG_DO_NOT_FIND_IMATH 1)
+
+    include(ocio_check_dependency_version)
+    # Since OpenImageIO will try to find OpenEXR through its OpenImageIOConfig.cmake file,
+    # let's try to find OpenEXR first and if the version is too old, OCIO will not try to find
+    # OpenImageIO.
+    ocio_check_dependency_version(  OpenEXR "is_OpenEXR_VERSION_valid" 
+                                    MIN_VERSION ${OpenEXR_MININUM_VERSION} 
+                                    CONFIG)
+
+    # Do not try to find OpenImageIO if the version of OpenEXR is too old.                                    
+    if (is_OpenEXR_VERSION_valid)
+        ###############################################################################
+        # OpenImageIO (https://github.com/OpenImageIO/oiio)
+        #
+        # Variables defined by OpenImageIO CMake's configuration files:
+        #   OpenImageIO_FOUND          - Indicate whether the library was found or not
+        #   OpenImageIO_LIB_DIR        - Library's directory
+        #   OpenImageIO_INCLUDE_DIR    - Location of the header files
+        #   OpenImageIO_VERSION        - Library's version
+        #
+        # Imported targets defined by this module, if found:
+        #   OpenImageIO::OpenImageIO
+        #   OpenImageIO::OpenImageIO_Util
+        #
+        ###############################################################################
+        # Calling find_package in CONFIG mode using PREFER_CONFIG option as OIIO support
+        # config file since 2.1+ and OCIO minimum version is over that.
+        ocio_handle_dependency(  OpenImageIO PREFER_CONFIG
+                                 MIN_VERSION ${OIIO_VERSION}
+                                 RECOMMENDED_VERSION ${OIIO_RECOMMENDED_VERSION}
+                                 PROMOTE_TARGET OpenImageIO::OpenImageIO)
     else()
-        find_package(OpenImageIO ${OIIO_VERSION})
+        message(WARNING "Skipping OpenImageIO because the OpenEXR found by OpenImageIO is too old (under ${OpenEXR_MININUM_VERSION})")
     endif()
 endif()
 
 if(OCIO_BUILD_APPS)
 
     if(OCIO_USE_OIIO_FOR_APPS AND OpenImageIO_FOUND AND TARGET OpenImageIO::OpenImageIO)
+        if (USE_MSVC AND OCIO_IMAGE_BACKEND STREQUAL "OpenImageIO")
+            # Temporary until fixed in OpenImageIO: Mute some warnings from OpenImageIO farmhash.h
+            # C4267 (level 3)	    'var' : conversion from 'size_t' to 'type', possible loss of data
+            # C4244	(level 3 & 4)	'conversion' conversion from 'type1' to 'type2', possible loss of data
+            target_compile_options(OpenImageIO::OpenImageIO PRIVATE /wd4267 /wd4244)
+        endif()
+        
         add_library(OpenColorIO::ImageIOBackend ALIAS OpenImageIO::OpenImageIO)
         set(OCIO_IMAGE_BACKEND OpenImageIO)
     else()
-        # OpenEXR
-        # https://github.com/AcademySoftwareFoundation/openexr
-        set(_OpenEXR_ExternalProject_VERSION "3.1.5")
-        find_package(OpenEXR 3.0)
+        ###############################################################################
+        # OpenEXR (https://github.com/AcademySoftwareFoundation/openexr)
+        #
+        # Variables defined by OpenEXR CMake's configuration files:
+        #   OpenEXR_FOUND          - Indicate whether the library was found or not
+        #   OpenEXR_VERSION        - Library's version
+        #
+        # Imported targets defined by this module, if found:
+        #   OpenEXR::Iex
+        #   OpenEXR::IexConfig
+        #   OpenEXR::IlmThread
+        #   OpenEXR::IlmThreadConfig
+        #   OpenEXR::OpenEXR
+        #   OpenEXR::OpenEXRConfig
+        #   OpenEXR::OpenEXRCore
+        #   OpenEXR::OpenEXRUtil
+        #
+        ###############################################################################
+        # Calling find_package in CONFIG mode using PREFER_CONFIG option.
+        ocio_handle_dependency(  OpenEXR PREFER_CONFIG ALLOW_INSTALL
+                                 MIN_VERSION ${OpenEXR_MININUM_VERSION}
+                                 RECOMMENDED_VERSION 3.1.5
+                                 RECOMMENDED_VERSION_REASON "Latest version tested with OCIO"
+                                 PROMOTE_TARGET OpenEXR::OpenEXR)
 
         if(OpenEXR_FOUND AND TARGET OpenEXR::OpenEXR)
             add_library(OpenColorIO::ImageIOBackend ALIAS OpenEXR::OpenEXR)
@@ -236,15 +290,18 @@ if(OCIO_BUILD_TESTS)
         if(TARGET Imath::Imath)
             # OpenShadingLanguage
             # https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
-            find_package(OpenShadingLanguage 1.11)
+            ocio_handle_dependency(  OSL
+                                     MIN_VERSION 1.11
+                                     RECOMMENDED_VERSION 1.11
+                                     RECOMMENDED_VERSION_REASON "Latest version tested with OCIO")
             if(NOT OSL_FOUND)
-                message(WARNING "Could NOT find OpenShadingLanguage. Skipping build of the OSL unit tests.")
+                message(WARNING "Skipping build of the OpenShadingLanguage unit tests (OSL missing)")
             endif()
         else()
-            message(WARNING "Could NOT find Imath. Skipping build of the OSL unit tests.")
+            message(WARNING "Skipping build of the OpenShadingLanguage unit tests (Imath missing)")
         endif()
     else()
-        message(WARNING "Could NOT find OpenImageIO. Skipping build of the OSL unit tests.")
+        message(WARNING "Skipping build of the OpenShadingLanguage unit tests (OpenImageIO missing)")
     endif()
 endif()
 
diff --git a/share/cmake/modules/FindImath.cmake b/share/cmake/modules/FindImath.cmake
index 30f5b4e465..6126b934f7 100644
--- a/share/cmake/modules/FindImath.cmake
+++ b/share/cmake/modules/FindImath.cmake
@@ -1,25 +1,27 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 #
-# Locate or install Imath
+# Locate Imath
 #
 # Variables defined by this module:
-#   Imath_FOUND - If FALSE, do not try to link to ilmbase
-#   Imath_LIBRARY - Imath library to link to
-#   Imath_INCLUDE_DIR - Where to find ImathConfig.h
-#   Imath_VERSION - The version of the library
+#   Imath_FOUND          - Indicate whether the library was found or not
+#   Imath_LIBRARY        - Path to the library file
+#   Imath_INCLUDE_DIR    - Location of the header files
+#   Imath_VERSION        - Library's version
 #
-# Targets defined by this module:
-#   Imath::Imath - IMPORTED target, if found
+# Global targets defined by this module:
+#   Imath::Imath
 #
-# By default, the dynamic libraries of Imath will be found. To find the 
-# static ones instead, you must set the Imath_STATIC_LIBRARY variable to 
-# TRUE before calling find_package(Imath ...).
+# Usually CMake will use the dynamic library rather than static, if both are present. 
+# In this case, you may set Imath_STATIC_LIBRARY to ON to request use of the static one. 
+# If only the static library is present (such as when OCIO builds the dependency), then the option 
+# is not needed.
 #
-# If Imath is not installed in a standard path, you can use the 
-# Imath_ROOT variable to tell CMake where to find it. If it is not found 
-# and OCIO_INSTALL_EXT_PACKAGES is set to MISSING or ALL, Imath will be 
-# downloaded, built, and statically-linked into libOpenColorIO at build time.
+# If the library is not installed in a typical location where CMake will find it, you may specify 
+# the location using one of the following methods:
+# -- Set -DImath_DIR to point to the directory containing the CMake configuration file for the package.
+# -- Set -DImath_ROOT to point to the directory containing the lib and include directories.
+# -- Set -DImath_LIBRARY and -DImath_INCLUDE_DIR to point to the lib and include directories.
 #
 
 ###############################################################################
@@ -131,122 +133,12 @@ endif()
 ###############################################################################
 ### Create target
 
-if (NOT TARGET Imath::Imath)
+if (Imath_FOUND AND NOT TARGET Imath::Imath)
     add_library(Imath::Imath UNKNOWN IMPORTED GLOBAL)
     add_library(Imath::ImathConfig INTERFACE IMPORTED GLOBAL)
     set(_Imath_TARGET_CREATE TRUE)
 endif()
 
-###############################################################################
-### Install package from source ###
-
-if(NOT Imath_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
-    include(ExternalProject)
-    include(GNUInstallDirs)
-
-    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
-    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
-
-    # Set find_package standard args
-    set(Imath_FOUND TRUE)
-    if(_Imath_ExternalProject_VERSION)
-        set(Imath_VERSION ${_Imath_ExternalProject_VERSION})
-    else()
-        set(Imath_VERSION ${Imath_FIND_VERSION})
-    endif()
-    set(Imath_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}")
-
-    # Set the expected library name
-    if(BUILD_TYPE_DEBUG)
-        set(_Imath_LIB_SUFFIX "_d")
-    endif()
-
-    include(VersionUtils)
-    split_version_string(${Imath_VERSION} _Imath_ExternalProject)
-
-    set(_Imath_LIB_VER "${_Imath_ExternalProject_VERSION_MAJOR}_${_Imath_ExternalProject_VERSION_MINOR}")
-
-    set(Imath_LIBRARY
-        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}Imath-${_Imath_LIB_VER}${_Imath_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
-
-    if(_Imath_TARGET_CREATE)
-        if(MSVC)
-            set(Imath_CXX_FLAGS "${Imath_CXX_FLAGS} /EHsc")
-        endif()
-
-        string(STRIP "${Imath_CXX_FLAGS}" Imath_CXX_FLAGS)
- 
-        set(Imath_CMAKE_ARGS
-            ${Imath_CMAKE_ARGS}
-            -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
-            -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
-            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-            -DCMAKE_CXX_FLAGS=${Imath_CXX_FLAGS}
-            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
-            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
-            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
-            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
-            -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
-            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
-            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
-            -DBUILD_SHARED_LIBS=OFF
-            -DBUILD_TESTING=OFF
-            -DPYTHON=OFF
-            -DDOCS=OFF
-            -DIMATH_HALF_USE_LOOKUP_TABLE=OFF
-        )
-
-        if(CMAKE_TOOLCHAIN_FILE)
-            set(Imath_CMAKE_ARGS
-                ${Imath_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
-        endif()
-
-        if(APPLE)
-            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
-
-            set(Imath_CMAKE_ARGS
-                ${Imath_CMAKE_ARGS}
-                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
-                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
-            )
-        endif()
-
-        if (ANDROID)
-            set(Imath_CMAKE_ARGS
-                ${Imath_CMAKE_ARGS}
-                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
-                -DANDROID_ABI=${ANDROID_ABI}
-                -DANDROID_STL=${ANDROID_STL})
-        endif()
-
-        # Hack to let imported target be built from ExternalProject_Add
-        file(MAKE_DIRECTORY ${Imath_INCLUDE_DIR})
-
-        ExternalProject_Add(imath_install
-            GIT_REPOSITORY "https://github.com/AcademySoftwareFoundation/Imath.git"
-            GIT_TAG "v${Imath_VERSION}"
-            GIT_CONFIG advice.detachedHead=false
-            GIT_SHALLOW TRUE
-            PREFIX "${_EXT_BUILD_ROOT}/Imath"
-            BUILD_BYPRODUCTS ${Imath_LIBRARY}
-            CMAKE_ARGS ${Imath_CMAKE_ARGS}
-            EXCLUDE_FROM_ALL TRUE
-            BUILD_COMMAND ""
-            INSTALL_COMMAND
-                ${CMAKE_COMMAND} --build .
-                                 --config ${CMAKE_BUILD_TYPE}
-                                 --target install
-                                 --parallel
-        )
-
-        add_dependencies(Imath::Imath imath_install)
-
-        message(STATUS "Installing Imath: ${Imath_LIBRARY} (version \"${Imath_VERSION}\")")
-    endif()
-endif()
-
 ###############################################################################
 ### Configure target ###
 
@@ -262,4 +154,4 @@ if(_Imath_TARGET_CREATE)
     )
 
     mark_as_advanced(Imath_INCLUDE_DIR Imath_LIBRARY Imath_VERSION)
-endif()
+endif()
\ No newline at end of file
diff --git a/share/cmake/modules/FindOpenShadingLanguage.cmake b/share/cmake/modules/FindOSL.cmake
similarity index 62%
rename from share/cmake/modules/FindOpenShadingLanguage.cmake
rename to share/cmake/modules/FindOSL.cmake
index f73ffdf9b9..5cefe410bc 100644
--- a/share/cmake/modules/FindOpenShadingLanguage.cmake
+++ b/share/cmake/modules/FindOSL.cmake
@@ -1,51 +1,54 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 #
-# Locate or install OpenShadingLanguage (OSL)
+# Locate OpenShadingLanguage (OSL)
 #
-# If the library is not installed in a standard path, you can use the OSL_ROOT 
-# variable to tell CMake where to find it. 
+# Variables defined by this module:
+#   OSL_FOUND                          - Indicate whether the library was found or not
+#   OSL_LIB_DIR                        - Location of the libary files
+#   OSL_INCLUDE_DIR                    - Location of the header files
+#   OSL_VERSION                        - Library's version
+#   OSL_SHADERS_INCLUDE_DIR            - Location of the shader's header files
+#   OSL_SHADERS_DIR                    - Used for OSL unit tests
+#   
+#   These variables are set only when OSL_ROOT is provided:
+#   oslcomp_LIBRARY                    - Path to the library file
+#   oslexec_LIBRARY                    - Path to the library file
+#
+# Global targets defined by this module:
+#   osl::osl
+#
+# Usually CMake will use the dynamic library rather than static, if both are present. 
+#
+# If the library is not installed in a typical location where CMake will find it, you may specify 
+# the location using one of the following methods:
+# -- Set -DOpenShadingLanguage_DIR to point to the directory containing the CMake configuration file for the package.
+# -- Set -DOpenShadingLanguage_ROOT to point to the directory containing the lib and include directories.
 #
-
-
-if(NOT TARGET osl::osl)
-    add_library(osl::osl INTERFACE IMPORTED GLOBAL)
-    set(OSL_FOUND OFF)
-endif()
 
 ###############################################################################
 ### Try to find package ###
 
 if(NOT DEFINED OSL_ROOT)
-
-    find_package(OSL ${OpenShadingLanguage_FIND_VERSION} CONFIG)
-    
-    set(OpenShadingLanguage_VERSION ${OSL_VERSION})
-
-    # TODO: No variable to have the share directory?
+    find_package(OSL ${OSL_FIND_VERSION} CONFIG QUIET)
 
     set(OSL_SHADERS_INCLUDE_DIR ${OSL_INCLUDE_DIR}/../share)
-
     # Variable used by the OSL unit tests. 
     set(OSL_SHADERS_DIR ${OSL_SHADERS_INCLUDE_DIR}/OSL/shaders)
 
     include (FindPackageHandleStandardArgs)
-    find_package_handle_standard_args (OpenShadingLanguage
-        FOUND_VAR     OpenShadingLanguage_FOUND
-        REQUIRED_VARS OSL_INCLUDE_DIR OSL_LIB_DIR OpenShadingLanguage_VERSION
-        VERSION_VAR   OpenShadingLanguage_VERSION
+    find_package_handle_standard_args (OSL
+        REQUIRED_VARS 
+            OSL_INCLUDE_DIR 
+            OSL_LIB_DIR 
+        VERSION_VAR   
+            OSL_VERSION
     )
-
-    set(OSL_FOUND ${OpenShadingLanguage_FOUND})
-
 else()
-
     set(OSL_INCLUDE_DIR ${OSL_ROOT}/include)
-
     set(OSL_VERSION_HEADER "${OSL_INCLUDE_DIR}/OSL/oslversion.h")
 
     if(EXISTS "${OSL_VERSION_HEADER}")
-
         # Try to figure out version number
         file (STRINGS "${OSL_VERSION_HEADER}" TMP REGEX "^#define OSL_LIBRARY_VERSION_MAJOR .*$")
         string (REGEX MATCHALL "[0-9]+" OSL_VERSION_MAJOR ${TMP})
@@ -54,12 +57,14 @@ else()
         file (STRINGS "${OSL_VERSION_HEADER}" TMP REGEX "^#define OSL_LIBRARY_VERSION_PATCH .*$")
         string (REGEX MATCHALL "[0-9]+" OSL_VERSION_PATCH ${TMP})
         file (STRINGS "${OSL_VERSION_HEADER}" TMP REGEX "^#define OSL_LIBRARY_VERSION_TWEAK .*$")
+
         if (TMP)
             string (REGEX MATCHALL "[0-9]+" OSL_VERSION_TWEAK ${TMP})
         else ()
             set (OSL_VERSION_TWEAK 0)
         endif ()
-        set (OpenShadingLanguage_VERSION "${OSL_VERSION_MAJOR}.${OSL_VERSION_MINOR}.${OSL_VERSION_PATCH}.${OSL_VERSION_TWEAK}")
+
+        set (OSL_VERSION "${OSL_VERSION_MAJOR}.${OSL_VERSION_MINOR}.${OSL_VERSION_PATCH}.${OSL_VERSION_TWEAK}")
 
         # Find the oslcomp library.
         find_library(oslcomp_LIBRARY
@@ -92,58 +97,41 @@ else()
         )
 
         set(OSL_SHADERS_INCLUDE_DIR ${OSL_ROOT}/share)
-
         # Variable used by the OSL unit tests. 
         set(OSL_SHADERS_DIR ${OSL_SHADERS_INCLUDE_DIR}/OSL/shaders)
-
-        if(EXISTS "${OSL_SHADERS_DIR}")
-
-            # Variable used by the OSL unit tests.
-            set(OSL_FOUND ON)
-
-        endif()
-
     endif()
 
     include (FindPackageHandleStandardArgs)
-    find_package_handle_standard_args (OpenShadingLanguage
-        FOUND_VAR     OpenShadingLanguage_FOUND
-        REQUIRED_VARS OSL_INCLUDE_DIR oslcomp_LIBRARY oslexec_LIBRARY OpenShadingLanguage_VERSION
-        VERSION_VAR   OpenShadingLanguage_VERSION
+    find_package_handle_standard_args (OSL
+        REQUIRED_VARS 
+            OSL_INCLUDE_DIR
+            OSL_SHADERS_DIR
+            oslcomp_LIBRARY 
+            oslexec_LIBRARY 
+        VERSION_VAR   
+            OSL_VERSION
     )
-
-    set(OSL_FOUND ${OpenShadingLanguage_FOUND})
-
 endif()
 
 ###############################################################################
-### Check the C++ version ###
-
-# TODO: Which version starts to impose C++14?
+### Create target
 
-if(${CMAKE_CXX_STANDARD} LESS_EQUAL 11)
-    set(OSL_FOUND OFF)
-    message(WARNING "Need C++14 or higher to compile OpenShadingLanguage. Skipping build of the OSL unit tests")
+if(NOT TARGET osl::osl)
+    add_library(osl::osl INTERFACE IMPORTED GLOBAL)
 endif()
 
 ###############################################################################
 ### Configure target ###
 
 if(OSL_FOUND)
-
-    if (NOT OSL_FIND_QUIETLY)
-        message(STATUS "OpenShadingLanguage includes    = ${OSL_INCLUDE_DIR}")
-        message(STATUS "OpenShadingLanguage shaders     = ${OSL_SHADERS_DIR}")
-        message(STATUS "OpenShadingLanguage library dir = ${OSL_LIB_DIR}")
-    endif ()
-
     list(APPEND LIB_INCLUDE_DIRS ${OSL_INCLUDE_DIR})
     list(APPEND LIB_INCLUDE_DIRS ${OSL_SHADERS_INCLUDE_DIR})
 
     target_include_directories(osl::osl INTERFACE "${LIB_INCLUDE_DIRS}")
     target_link_libraries(osl::osl INTERFACE OSL::oslcomp OSL::oslexec)
 
-    if (${OpenShadingLanguage_VERSION} VERSION_GREATER_EQUAL "1.12" AND ${CMAKE_CXX_STANDARD} LESS_EQUAL 11)
+    # C++14 is required for OSL 1.12+
+    if (${OSL_VERSION} VERSION_GREATER_EQUAL "1.12" AND ${CMAKE_CXX_STANDARD} LESS_EQUAL 11)
         set(OSL_FOUND OFF)
         message(WARNING "Need C++14 or higher to compile OpenShadingLanguage. Skipping build the OSL unit tests")
     endif()
@@ -152,5 +140,4 @@ if(OSL_FOUND)
         oslcomp_LIBRARY oslcomp_FOUND
         oslexec_LIBRARY oslexec_FOUND
     )
-
 endif()
diff --git a/share/cmake/modules/FindOpenImageIO.cmake b/share/cmake/modules/FindOpenImageIO.cmake
deleted file mode 100644
index c8f68abd63..0000000000
--- a/share/cmake/modules/FindOpenImageIO.cmake
+++ /dev/null
@@ -1,174 +0,0 @@
-###########################################################################
-# OpenImageIO   https://www.openimageio.org
-# Copyright 2008-present Contributors to the OpenImageIO project.
-# SPDX-License-Identifier: BSD-3-Clause
-# https://github.com/OpenImageIO/oiio/blob/master/LICENSE.md
-#
-# For an up-to-date version of this file, see:
-#   https://github.com/OpenImageIO/oiio/blob/master/src/cmake/Modules/FindOpenImageIO.cmake
-#
-###########################################################################
-#
-# CMake module to find OpenImageIO
-#
-# This module will set
-#   OpenImageIO_FOUND          True, if found
-#   OPENIMAGEIO_INCLUDES       directory where headers are found
-#   OPENIMAGEIO_LIBRARIES      libraries for OIIO
-#   OPENIMAGEIO_LIBRARY_DIRS   library dirs for OIIO
-#   OPENIMAGEIO_VERSION        Version ("major.minor.patch.tweak")
-#   OPENIMAGEIO_VERSION_MAJOR  Version major number
-#   OPENIMAGEIO_VERSION_MINOR  Version minor number
-#   OPENIMAGEIO_VERSION_PATCH  Version minor patch
-#   OPENIMAGEIO_VERSION_TWEAK  Version minor tweak
-#
-# Imported targets:
-#   OpenImageIO::OpenImageIO   The libOpenImageIO library.
-#   OpenImageIO::oiiotool      The oiiotool executable.
-#
-# Special inputs:
-#   OpenImageIO_ROOT - if using CMake >= 3.12, will automatically search
-#                          this area for OIIO components.
-#   OPENIMAGEIO_ROOT_DIR - custom "prefix" location of OIIO installation
-#                          (expecting bin, lib, include subdirectories)
-#                          This is deprecated, but will work for a while.
-#   OpenImageIO_FIND_QUIETLY - if set, print minimal console output
-#   OIIO_LIBNAME_SUFFIX - if set, optional nonstandard library suffix
-#
-###########################################################################
-#
-# NOTE: This file is deprecated.
-#
-# In OIIO 2.1+, we generate OpenImageIOConfig.cmake files that are now the
-# preferred way for downstream projecs to find an installed OIIO. There
-# should be no need to copy this FindOpenImageIO.cmake file into downstream
-# projects, *unless* they need to work with a range of OIIO vesions that
-# may include <2.1, which would lack the generated config files.
-#
-###########################################################################
-
-
-# If 'OPENIMAGE_HOME' not set, use the env variable of that name if available
-if (NOT OPENIMAGEIO_ROOT_DIR AND NOT $ENV{OPENIMAGEIO_ROOT_DIR} STREQUAL "")
-    set (OPENIMAGEIO_ROOT_DIR $ENV{OPENIMAGEIO_ROOT_DIR})
-endif ()
-
-find_library (OPENIMAGEIO_LIBRARY
-    NAMES
-        OpenImageIO${OIIO_LIBNAME_SUFFIX}
-    HINTS
-        ${OPENIMAGEIO_ROOT_DIR}
-    PATH_SUFFIXES
-        lib64
-        lib
-        OpenImageIO/lib
-)
-find_library (OPENIMAGEIO_UTIL_LIBRARY
-    NAMES
-        OpenImageIO_Util${OIIO_LIBNAME_SUFFIX}
-    HINTS
-        ${OPENIMAGEIO_ROOT_DIR}
-    PATH_SUFFIXES
-        lib64
-        lib
-        OpenImageIO/lib
-)
-find_path (OPENIMAGEIO_INCLUDE_DIR
-    NAMES
-        OpenImageIO/imageio.h
-    HINTS
-        ${OPENIMAGEIO_ROOT_DIR}
-    PATH_SUFFIXES
-        OpenImageIO/include
-)
-find_program (OIIOTOOL_BIN
-    NAMES
-        oiiotool
-    HINTS
-        ${OPENIMAGEIO_ROOT_DIR}
-    PATH_SUFFIXES
-        OpenImageIO/bin
-)
-
-# Try to figure out version number
-set (OIIO_VERSION_HEADER "${OPENIMAGEIO_INCLUDE_DIR}/OpenImageIO/oiioversion.h")
-if (EXISTS "${OIIO_VERSION_HEADER}")
-    file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_MAJOR .*$")
-    string (REGEX MATCHALL "[0-9]+" OPENIMAGEIO_VERSION_MAJOR ${TMP})
-    file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_MINOR .*$")
-    string (REGEX MATCHALL "[0-9]+" OPENIMAGEIO_VERSION_MINOR ${TMP})
-    file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_PATCH .*$")
-    string (REGEX MATCHALL "[0-9]+" OPENIMAGEIO_VERSION_PATCH ${TMP})
-    file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_TWEAK .*$")
-    if (TMP)
-        string (REGEX MATCHALL "[0-9]+" OPENIMAGEIO_VERSION_TWEAK ${TMP})
-    else ()
-        set (OPENIMAGEIO_VERSION_TWEAK 0)
-    endif ()
-    set (OPENIMAGEIO_VERSION "${OPENIMAGEIO_VERSION_MAJOR}.${OPENIMAGEIO_VERSION_MINOR}.${OPENIMAGEIO_VERSION_PATCH}.${OPENIMAGEIO_VERSION_TWEAK}")
-endif ()
-
-set (OIIO_IMATH_HEADER "${OPENIMAGEIO_INCLUDE_DIR}/OpenImageIO/Imath.h")
-if (EXISTS "${OIIO_IMATH_HEADER}")
-    file (STRINGS "${OIIO_IMATH_HEADER}" TMP REGEX "^#define OIIO_USING_IMATH .*$")
-    string (REGEX MATCHALL "[0-9]" OIIO_IMATH_VERSION ${TMP})
-    if (OIIO_IMATH_VERSION LESS 3)
-        message(STATUS "Skipping OpenImageIO built against OpenEXR 2, please use version 3 or greater.")
-        return ()
-    endif ()
-endif ()
-
-include (FindPackageHandleStandardArgs)
-find_package_handle_standard_args (
-    OpenImageIO
-    FOUND_VAR
-        OpenImageIO_FOUND
-    REQUIRED_VARS
-        OPENIMAGEIO_INCLUDE_DIR
-        OPENIMAGEIO_LIBRARY
-        OPENIMAGEIO_VERSION
-    VERSION_VAR
-        OPENIMAGEIO_VERSION
-)
-
-set (OPENIMAGEIO_FOUND ${OpenImageIO_FOUND})  # Old name
-
-if (OpenImageIO_FOUND)
-    set (OPENIMAGEIO_INCLUDES ${OPENIMAGEIO_INCLUDE_DIR})
-    set (OPENIMAGEIO_LIBRARIES ${OPENIMAGEIO_LIBRARY})
-    get_filename_component (OPENIMAGEIO_LIBRARY_DIRS "${OPENIMAGEIO_LIBRARY}" DIRECTORY)
-    if (NOT OpenImageIO_FIND_QUIETLY)
-        message ( STATUS "OpenImageIO includes     = ${OPENIMAGEIO_INCLUDE_DIR}" )
-        message ( STATUS "OpenImageIO libraries    = ${OPENIMAGEIO_LIBRARIES}" )
-        message ( STATUS "OpenImageIO library_dirs = ${OPENIMAGEIO_LIBRARY_DIRS}" )
-    endif ()
-
-    if (NOT TARGET OpenImageIO::OpenImageIO)
-        add_library(OpenImageIO::OpenImageIO UNKNOWN IMPORTED)
-        set_target_properties(OpenImageIO::OpenImageIO PROPERTIES
-            INTERFACE_INCLUDE_DIRECTORIES "${OPENIMAGEIO_INCLUDES}")
-
-        set_property(TARGET OpenImageIO::OpenImageIO APPEND PROPERTY
-            IMPORTED_LOCATION "${OPENIMAGEIO_LIBRARIES}")
-    endif ()
-
-    # Starting with OIIO v2.3, some utility classes are now only declared in OpenImageIO_Util
-    # (and not in both libraries like in older versions).
-    if (${OPENIMAGEIO_VERSION} VERSION_GREATER_EQUAL "2.3" AND NOT TARGET OpenImageIO::OpenImageIO_Util)
-        add_library(OpenImageIO::OpenImageIO_Util UNKNOWN IMPORTED)
-        set_target_properties(OpenImageIO::OpenImageIO_Util PROPERTIES
-            IMPORTED_LOCATION "${OPENIMAGEIO_UTIL_LIBRARY}")
-        target_link_libraries(OpenImageIO::OpenImageIO INTERFACE OpenImageIO::OpenImageIO_Util)
-    endif ()
-
-    # Starting with OIIO v2.3, OIIO needs to compile at least in C++14.
-    if (${OPENIMAGEIO_VERSION} VERSION_GREATER_EQUAL "2.3" AND ${CMAKE_CXX_STANDARD} LESS_EQUAL 11)
-        set(OpenImageIO_FOUND OFF)
-        message(WARNING "Need C++14 or higher to compile with OpenImageIO ${OPENIMAGEIO_VERSION}.")
-    endif ()
-endif ()
-
-mark_as_advanced (
-    OPENIMAGEIO_INCLUDE_DIR
-    OPENIMAGEIO_LIBRARY
-)
diff --git a/share/cmake/modules/FindSphinx.cmake b/share/cmake/modules/FindSphinx.cmake
index adf1080988..9278c57d9a 100644
--- a/share/cmake/modules/FindSphinx.cmake
+++ b/share/cmake/modules/FindSphinx.cmake
@@ -4,14 +4,14 @@
 # Locate Sphinx (Python documentation generator)
 #
 # Variables defined by this module:
-#   Sphinx_FOUND
-#   Sphinx_EXECUTABLE (CACHE)
+#   Sphinx_FOUND             - Indicate whether the executable was found or not
+#   Sphinx_EXECUTABLE        - Path to the executable file
 #
-# Usage:
-#   find_package(Sphinx)
 #
-# If Sphinx is not installed in a standard path, add it to the Sphinx_ROOT 
-# variable to tell CMake where to find it.
+# If the executable is not installed in a typical location where CMake will find it, you may specify 
+# the location using one of the following methods:
+# -- Set -DSphinx_ROOT to point to the directory containing the executable.
+# -- Set -DSphinx_EXECUTABLE to point to executable file.
 #
 
 find_package(Python QUIET COMPONENTS Interpreter)
diff --git a/share/cmake/modules/Findexpat.cmake b/share/cmake/modules/Findexpat.cmake
index 8e23cbba0b..1e715bb0cf 100644
--- a/share/cmake/modules/Findexpat.cmake
+++ b/share/cmake/modules/Findexpat.cmake
@@ -1,25 +1,27 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 #
-# Locate or install expat
+# Locate expat
 #
 # Variables defined by this module:
-#   expat_FOUND - If FALSE, do not try to link to expat
-#   expat_LIBRARY - expat library to link to
-#   expat_INCLUDE_DIR - Where to find expat.h
-#   expat_VERSION - The version of the library
+#   expat_FOUND          - Indicate whether the library was found or not
+#   expat_LIBRARY        - Path to the library file
+#   expat_INCLUDE_DIR    - Location of the header files
+#   expat_VERSION        - Library's version
 #
-# Targets defined by this module:
-#   expat::expat - IMPORTED target, if found
+# Global targets defined by this module:
+#   expat::expat
 #
-# By default, the dynamic libraries of expat will be found. To find the static 
-# ones instead, you must set the expat_STATIC_LIBRARY variable to TRUE 
-# before calling find_package(expat ...).
+# Usually CMake will use the dynamic library rather than static, if both are present. 
+# In this case, you may set expat_STATIC_LIBRARY to ON to request use of the static one. 
+# If only the static library is present (such as when OCIO builds the dependency), then the option 
+# is not needed.
 #
-# If expat is not installed in a standard path, you can use the expat_ROOT 
-# variable to tell CMake where to find it. If it is not found and 
-# OCIO_INSTALL_EXT_PACKAGES is set to MISSING or ALL, expat will be downloaded, 
-# built, and statically-linked into libOpenColorIO at build time.
+# If the library is not installed in a typical location where CMake will find it, you may specify 
+# the location using one of the following methods:
+# -- Set -Dexpat_DIR to point to the directory containing the CMake configuration file for the package.
+# -- Set -Dexpat_ROOT to point to the directory containing the lib and include directories.
+# -- Set -Dexpat_LIBRARY and -Dexpat_INCLUDE_DIR to point to the lib and include directories.
 #
 
 ###############################################################################
@@ -96,7 +98,6 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
                 expat/include
         )
 
-
         # Expat uses prefix "lib" on all platform by default.
         # Library name doesn't change in debug.
 
@@ -200,126 +201,11 @@ endif()
 ###############################################################################
 ### Create target
 
-if(NOT TARGET expat::expat)
+if(expat_FOUND AND NOT TARGET expat::expat)
     add_library(expat::expat UNKNOWN IMPORTED GLOBAL)
     set(_expat_TARGET_CREATE TRUE)
 endif()
 
-###############################################################################
-### Install package from source ###
-
-if(NOT expat_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
-    include(ExternalProject)
-    include(GNUInstallDirs)
-
-    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
-    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
-
-    # Set find_package standard args
-    set(expat_FOUND TRUE)
-    set(expat_VERSION ${expat_FIND_VERSION})
-    set(expat_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}")
-
-    # Set the expected library name
-    if(WIN32)
-        if(BUILD_TYPE_DEBUG)
-            set(_expat_LIB_SUFFIX "d")
-        endif()
-        # Static Linking, Multi-threaded Dll naming (>=2.2.8):
-        #   https://github.com/libexpat/libexpat/blob/R_2_2_8/expat/win32/README.txt
-        set(_expat_LIB_SUFFIX "${_expat_LIB_SUFFIX}MD")
-    endif()
-
-    # Expat use a hardcoded lib prefix instead of CMAKE_STATIC_LIBRARY_PREFIX
-    # https://github.com/libexpat/libexpat/blob/R_2_4_1/expat/CMakeLists.txt#L374
-    set(_expat_LIB_PREFIX "lib")
-
-    set(expat_LIBRARY
-        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${_expat_LIB_PREFIX}expat${_expat_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
-
-    if(_expat_TARGET_CREATE)
-        if(MSVC)
-            set(EXPAT_C_FLAGS "${EXPAT_C_FLAGS} /EHsc")
-            set(EXPAT_CXX_FLAGS "${EXPAT_CXX_FLAGS} /EHsc")
-        endif()
-
-        string(STRIP "${EXPAT_C_FLAGS}" EXPAT_C_FLAGS)
-        string(STRIP "${EXPAT_CXX_FLAGS}" EXPAT_CXX_FLAGS)
-
-        set(EXPAT_CMAKE_ARGS
-            ${EXPAT_CMAKE_ARGS}
-            -DCMAKE_POLICY_DEFAULT_CMP0063=NEW
-            -DCMAKE_C_VISIBILITY_PRESET=${CMAKE_C_VISIBILITY_PRESET}
-            -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
-            -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
-            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-            -DCMAKE_C_FLAGS=${EXPAT_C_FLAGS}
-            -DCMAKE_CXX_FLAGS=${EXPAT_CXX_FLAGS}
-            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
-            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
-            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
-            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
-            -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
-            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
-            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
-            -DEXPAT_BUILD_DOCS=OFF
-            -DEXPAT_BUILD_EXAMPLES=OFF
-            -DEXPAT_BUILD_TESTS=OFF
-            -DEXPAT_BUILD_TOOLS=OFF
-            -DEXPAT_SHARED_LIBS=OFF
-        )
-
-        if(CMAKE_TOOLCHAIN_FILE)
-            set(EXPAT_CMAKE_ARGS
-                ${EXPAT_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
-        endif()
-
-        if(APPLE)
-            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
-
-            set(EXPAT_CMAKE_ARGS
-                ${EXPAT_CMAKE_ARGS}
-                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
-                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
-            )
-        endif()
-
-        if (ANDROID)
-            set(EXPAT_CMAKE_ARGS
-                ${EXPAT_CMAKE_ARGS}
-                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
-                -DANDROID_ABI=${ANDROID_ABI}
-                -DANDROID_STL=${ANDROID_STL})
-        endif()
-
-        # Hack to let imported target be built from ExternalProject_Add
-        file(MAKE_DIRECTORY ${expat_INCLUDE_DIR})
-
-        ExternalProject_Add(expat_install
-            GIT_REPOSITORY "https://github.com/libexpat/libexpat.git"
-            GIT_TAG "R_${expat_FIND_VERSION_MAJOR}_${expat_FIND_VERSION_MINOR}_${expat_FIND_VERSION_PATCH}"
-            GIT_CONFIG advice.detachedHead=false
-            GIT_SHALLOW TRUE
-            PREFIX "${_EXT_BUILD_ROOT}/libexpat"
-            BUILD_BYPRODUCTS ${expat_LIBRARY}
-            SOURCE_SUBDIR expat
-            CMAKE_ARGS ${EXPAT_CMAKE_ARGS}
-            EXCLUDE_FROM_ALL TRUE
-            BUILD_COMMAND ""
-            INSTALL_COMMAND
-                ${CMAKE_COMMAND} --build .
-                                 --config ${CMAKE_BUILD_TYPE}
-                                 --target install
-                                 --parallel
-        )
-
-        add_dependencies(expat::expat expat_install)
-        message(STATUS "Installing expat: ${expat_LIBRARY} (version \"${expat_VERSION}\")")
-    endif()
-endif()
-
 ###############################################################################
 ### Configure target ###
 
@@ -330,4 +216,4 @@ if(_expat_TARGET_CREATE)
     )
 
     mark_as_advanced(expat_INCLUDE_DIR expat_LIBRARY expat_VERSION)
-endif()
+endif()
\ No newline at end of file
diff --git a/share/cmake/modules/Findlcms2.cmake b/share/cmake/modules/Findlcms2.cmake
index f35fcf14f4..8183376330 100644
--- a/share/cmake/modules/Findlcms2.cmake
+++ b/share/cmake/modules/Findlcms2.cmake
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 #
-# Locate or install lcms2
+# Locate lcms2
 #
 # Variables defined by this module:
 #   lcms2_FOUND - If FALSE, do not try to link to lcms
@@ -9,24 +9,20 @@
 #   lcms2_INCLUDE_DIR - Where to find lcms2.h
 #   lcms2_VERSION - The version of the library
 #
-# Targets defined by this module:
+# Global targets defined by this module:
 #   lcms2::lcms2 - IMPORTED target, if found
 #
-# By default, the dynamic libraries of lcms2 will be found. To find the static 
-# ones instead, you must set the lcms2_STATIC_LIBRARY variable to TRUE 
-# before calling find_package(lcms2 ...).
+# Usually CMake will use the dynamic library rather than static, if both are present. 
+# In this case, you may set lcms2_STATIC_LIBRARY to ON to request use of the static one. 
+# If only the static library is present (such as when OCIO builds the dependency), then the option 
+# is not needed.
 #
-# If lcms2 is not installed in a standard path, you can use the lcms2_ROOT 
-# variable to tell CMake where to find it. If it is not found and 
-# OCIO_INSTALL_EXT_PACKAGES is set to MISSING or ALL, lcms2 will be 
-# downloaded, built, and statically-linked into libOpenColorIO at build time.
+# If the library is not installed in a typical location where CMake will find it, you may specify 
+# the location using one of the following methods:
+# -- Set -Dlcms2_ROOT to point to the directory containing the lib and include directories.
+# -- Set -Dlcms2_LIBRARY and -Dlcms2_INCLUDE_DIR to point to the lib and include directories.
 #
 
-if(NOT TARGET lcms2::lcms2)
-    add_library(lcms2::lcms2 UNKNOWN IMPORTED GLOBAL)
-    set(_lcms2_TARGET_CREATE TRUE)
-endif()
-
 ###############################################################################
 ### Try to find package ###
 
@@ -95,104 +91,11 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
 endif()
 
 ###############################################################################
-### Install package from source ###
-
-if(NOT lcms2_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
-    include(ExternalProject)
-
-    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
-    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
-
-    # Set find_package standard args
-    set(lcms2_FOUND TRUE)
-    set(lcms2_VERSION ${lcms2_FIND_VERSION})
-    set(lcms2_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}/lcms2")
-    set(lcms2_LIBRARY 
-        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}lcms2${CMAKE_STATIC_LIBRARY_SUFFIX}")
-
-    if(_lcms2_TARGET_CREATE)
-        if(UNIX)
-            set(lcms2_C_FLAGS "${lcms2_C_FLAGS} -fvisibility=hidden -fPIC")
-        endif()
-
-        if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
-            set(lcms2_C_FLAGS "${lcms2_C_FLAGS} -Wno-aggressive-loop-optimizations")
-        endif()
-
-        string(STRIP "${lcms2_C_FLAGS}" lcms2_C_FLAGS)
-
-        # NOTE: Depending of the compiler version lcm2 2.2 does not compile with C++17 so revert 
-        # to C++11 because the library is only used by a cmd line tool.
-
-        set(lcms2_CXX_STANDARD ${CMAKE_CXX_STANDARD})
-        if(${CMAKE_CXX_STANDARD} GREATER_EQUAL 17)
-            set(lcms2_CXX_STANDARD 11)
-        endif()
-
-        set(lcms2_CMAKE_ARGS
-            ${lcms2_CMAKE_ARGS}
-            -DCMAKE_C_FLAGS=${lcms2_C_FLAGS}
-            -DCMAKE_CXX_STANDARD=${lcms2_CXX_STANDARD}
-            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
-            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
-            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
-            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
-            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
-            -DBUILD_SHARED_LIBS=OFF
-        )
-
-        if(CMAKE_TOOLCHAIN_FILE)
-            set(lcms2_CMAKE_ARGS
-                ${lcms2_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
-        endif()
-
-        if(APPLE)
-            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
-
-            set(lcms2_CMAKE_ARGS
-                ${lcms2_CMAKE_ARGS}
-                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
-                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
-            )
-        endif()
-
-        if (ANDROID)
-            set(lcms2_CMAKE_ARGS
-                ${lcms2_CMAKE_ARGS}
-                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
-                -DANDROID_ABI=${ANDROID_ABI}
-                -DANDROID_STL=${ANDROID_STL})
-        endif()
-
-        # Hack to let imported target be built from ExternalProject_Add
-        file(MAKE_DIRECTORY ${lcms2_INCLUDE_DIR})
-
-        ExternalProject_Add(lcms2_install
-            GIT_REPOSITORY "https://github.com/mm2/Little-CMS.git"
-            GIT_TAG "lcms${lcms2_VERSION}"
-            GIT_CONFIG advice.detachedHead=false
-            GIT_SHALLOW TRUE
-            PREFIX "${_EXT_BUILD_ROOT}/Little-CMS"
-            BUILD_BYPRODUCTS ${lcms2_LIBRARY}
-            CMAKE_ARGS ${lcms2_CMAKE_ARGS}
-            EXCLUDE_FROM_ALL TRUE
-            PATCH_COMMAND
-                ${CMAKE_COMMAND} -E copy
-                "${CMAKE_SOURCE_DIR}/share/cmake/projects/Buildlcms2.cmake"
-                "CMakeLists.txt"
-            BUILD_COMMAND ""
-            INSTALL_COMMAND
-                ${CMAKE_COMMAND} --build .
-                                 --config ${CMAKE_BUILD_TYPE}
-                                 --target install
-                                 --parallel
-        )
+### Configure target ###
 
-        add_dependencies(lcms2::lcms2 lcms2_install)
-        message(STATUS "Installing lcms2: ${lcms2_LIBRARY} (version \"${lcms2_VERSION}\")")
-    endif()
+if(lcms2_FOUND AND NOT TARGET lcms2::lcms2)
+    add_library(lcms2::lcms2 UNKNOWN IMPORTED GLOBAL)
+    set(_lcms2_TARGET_CREATE TRUE)
 endif()
 
 ###############################################################################
diff --git a/share/cmake/modules/Findminizip-ng.cmake b/share/cmake/modules/Findminizip-ng.cmake
index b961d1c9ac..4d66093839 100644
--- a/share/cmake/modules/Findminizip-ng.cmake
+++ b/share/cmake/modules/Findminizip-ng.cmake
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 #
-# Locate or install minizip-ng
+# Locate minizip-ng
 #
 # Variables defined by this module:
 #   minizip-ng_FOUND        - If FALSE, do not try to link to minizip-ng
@@ -9,13 +9,20 @@
 #   minizip-ng_INCLUDE_DIR  - Where to find mz.h and other headers
 #   minizip-ng_VERSION      - The version of the library
 #
-# Targets defined by this module:
+# Global targets defined by this module:
 #   MINIZIP::minizip-ng - IMPORTED target, if found
 #
-# If minizip-ng is not installed in a standard path, you can use the minizip-ng_ROOT 
-# variable to tell CMake where to find it. If it is not found and 
-# OCIO_INSTALL_EXT_PACKAGES is set to MISSING or ALL, minizip-ng will be downloaded, 
-# built, and statically-linked into libOpenColorIO at build time.
+# Usually CMake will use the dynamic library rather than static, if both are present. 
+# In this case, you may set minizip-ng_STATIC_LIBRARY to ON to request use of the static one. 
+# If only the static library is present (such as when OCIO builds the dependency), then the option 
+# is not needed.
+#
+# If the library is not installed in a typical location where CMake will find it, you may specify 
+# the location using one of the following methods:
+# -- Set -Dminizip-ng_DIR to point to the directory containing the CMake configuration file for the package.
+# -- Set -Dminizip-ng_ROOT to point to the directory containing the lib and include directories.
+# -- Set -Dminizip-ng_LIBRARY and -Dminizip-ng_INCLUDE_DIR to point to the lib and include directories.
+#
 #
 # For external builds of minizip-ng, please note that the same build options should be used.
 # Using more options, such as enabling other compression methods, will provoke linking issue
@@ -23,6 +30,7 @@
 #
 # e.g. Setting MZ_BZIP2=ON will cause linking issue since OCIO will not be linked against BZIP2.
 #
+
 ###############################################################################
 ### Try to find package ###
 
@@ -40,7 +48,7 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
         # Search for minizip-ng-config.cmake
         find_package(minizip-ng ${minizip-ng_FIND_VERSION} CONFIG QUIET)
     endif()
-
+    
     if (minizip-ng_FOUND)
         get_target_property(minizip-ng_INCLUDE_DIR MINIZIP::minizip-ng INTERFACE_INCLUDE_DIRECTORIES)
         get_target_property(minizip-ng_LIBRARY MINIZIP::minizip-ng LOCATION)
@@ -176,120 +184,12 @@ endif()
 
 ###############################################################################
 ### Create target
-if(NOT TARGET MINIZIP::minizip-ng)
+
+if(minizip-ng_FOUND AND NOT TARGET MINIZIP::minizip-ng)
     add_library(MINIZIP::minizip-ng UNKNOWN IMPORTED GLOBAL)
     set(_minizip-ng_TARGET_CREATE TRUE)
 endif()
 
-###############################################################################
-### Install package from source ###
-
-if(NOT minizip-ng_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
-    include(ExternalProject)
-    include(GNUInstallDirs)
-
-    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
-    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
-
-    # Set find_package standard args
-    set(minizip-ng_FOUND TRUE)
-    set(minizip-ng_VERSION ${minizip-ng_FIND_VERSION})
-
-    set(minizip-ng_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}/minizip-ng")
-
-    # Minizip-ng use a hardcoded lib prefix instead of CMAKE_STATIC_LIBRARY_PREFIX
-    set(_minizip-ng_LIB_PREFIX "lib")
-
-    set(minizip-ng_LIBRARY
-        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${_minizip-ng_LIB_PREFIX}minizip-ng${_minizip-ng_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
-
-    if(_minizip-ng_TARGET_CREATE)
-        set(minizip-ng_CMAKE_ARGS
-            ${minizip-ng_CMAKE_ARGS}
-            -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
-            -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
-            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
-            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
-            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
-            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
-            -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
-            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-            # Since the other modules create a subfolder for the includes by default and since
-            # minizip-ng does not, a suffix is added to CMAKE_INSTALL_INCLUDEDIR in order to
-            # install the headers under a subdirectory named "minizip-ng".
-            # Note that this does not affect external builds for minizip-ng.
-            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}/minizip-ng
-            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
-            -DBUILD_SHARED_LIBS=OFF
-            -DMZ_OPENSSL=OFF
-            -DMZ_LIBBSD=OFF
-            -DMZ_BUILD_TESTS=OFF
-            -DMZ_COMPAT=OFF
-            -DMZ_BZIP2=OFF
-            -DMZ_LZMA=OFF
-            -DMZ_LIBCOMP=OFF
-            -DMZ_ZSTD=OFF
-            -DMZ_PKCRYPT=OFF
-            -DMZ_WZAES=OFF
-            -DMZ_SIGNING=OFF
-            -DMZ_ZLIB=ON
-            -DMZ_ICONV=OFF
-            -DMZ_FETCH_LIBS=OFF
-            -DMZ_FORCE_FETCH_LIBS=OFF
-            -DZLIB_LIBRARY=${ZLIB_LIBRARIES}
-            -DZLIB_INCLUDE_DIR=${ZLIB_INCLUDE_DIRS}
-        )
-
-        if(CMAKE_TOOLCHAIN_FILE)
-            set(minizip-ng_CMAKE_ARGS
-                ${minizip-ng_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
-        endif()
-
-        if(APPLE)
-            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
-
-            set(minizip-ng_CMAKE_ARGS
-                ${minizip-ng_CMAKE_ARGS}
-                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
-                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
-            )
-        endif()
-
-        if (ANDROID)
-            set(minizip-ng_CMAKE_ARGS
-                ${minizip-ng_CMAKE_ARGS}
-                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
-                -DANDROID_ABI=${ANDROID_ABI}
-                -DANDROID_STL=${ANDROID_STL})
-        endif()
-    endif()
-
-    # Hack to let imported target be built from ExternalProject_Add
-    file(MAKE_DIRECTORY ${minizip-ng_INCLUDE_DIR})
-
-    ExternalProject_Add(minizip-ng_install
-        GIT_REPOSITORY "https://github.com/zlib-ng/minizip-ng.git"
-        GIT_TAG "${minizip-ng_VERSION}"
-        GIT_CONFIG advice.detachedHead=false
-        GIT_SHALLOW TRUE
-        PREFIX "${_EXT_BUILD_ROOT}/libminizip-ng"
-        BUILD_BYPRODUCTS ${minizip-ng_LIBRARY}
-        CMAKE_ARGS ${minizip-ng_CMAKE_ARGS}
-        EXCLUDE_FROM_ALL TRUE
-        BUILD_COMMAND ""
-        INSTALL_COMMAND
-            ${CMAKE_COMMAND} --build .
-                            --config ${CMAKE_BUILD_TYPE}
-                            --target install
-                            --parallel
-    )
-
-    add_dependencies(MINIZIP::minizip-ng minizip-ng_install)
-    message(STATUS "Installing minizip-ng: ${minizip-ng_LIBRARY} (version \"${minizip-ng_VERSION}\")")
-endif()
-
 ###############################################################################
 ### Configure target ###
 
diff --git a/share/cmake/modules/Findopenfx.cmake b/share/cmake/modules/Findopenfx.cmake
index 8b0eefb344..40dfac7072 100644
--- a/share/cmake/modules/Findopenfx.cmake
+++ b/share/cmake/modules/Findopenfx.cmake
@@ -1,26 +1,22 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 #
-# Locate or install openfx
+# Locate openfx
 #
 # Variables defined by this module:
-#   openfx_FOUND - If FALSE, do not try to include openfx
-#   openfx_INCLUDE_DIR - Where to find ofxCore.h
-#   openfx_VERSION - The version of the library
+#   openfx_FOUND          - Indicate whether the library was found or not
+#   openfx_INCLUDE_DIR    - Location of the header files
+#   openfx_VERSION        - Library's version
 #
-# Targets defined by this module:
-#   openfx::module - IMPORTED target, if found
+# Global targets defined by this module:
+#   openfx::module
 #
-# If openfx is not installed in a standard path, you can use the 
-# openfx_ROOT variable to tell CMake where to find it. If it is not found 
-# and OCIO_INSTALL_EXT_PACKAGES is set to MISSING or ALL, openfx will be 
-# downloaded at build time.
+# Usually CMake will use the dynamic library rather than static, if both are present. 
+#
+# If the library is not installed in a typical location where CMake will find it, you may specify 
+# the location using the following method:
+# -- Set -Dopenfx_ROOT to point to the directory containing the lib and include directories.
 #
-
-if(NOT TARGET openfx::module)
-    add_library(openfx::module INTERFACE IMPORTED GLOBAL)
-    set(_openfx_TARGET_CREATE TRUE)
-endif()
 
 ###############################################################################
 ### Try to find package ###
@@ -56,45 +52,11 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
 endif()
 
 ###############################################################################
-### Install package from source ###
-
-if(NOT openfx_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
-    include(ExternalProject)
-    include(GNUInstallDirs)
-
-    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
-    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
-    set(_openfx_INSTALL_DIR "${_EXT_BUILD_ROOT}/openfx/src/openfx_install")
+### Create target
 
-    # Set find_package standard args
-    set(openfx_FOUND TRUE)
-    set(openfx_VERSION ${openfx_FIND_VERSION})
-    set(openfx_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}/openfx")
-
-    if(_openfx_TARGET_CREATE)
-        # Hack to let imported target be built from ExternalProject_Add
-        file(MAKE_DIRECTORY ${openfx_INCLUDE_DIR})
-
-        ExternalProject_Add(openfx_install
-            GIT_REPOSITORY "https://github.com/ofxa/openfx.git"
-            GIT_TAG "OFX_Release_${openfx_FIND_VERSION_MAJOR}_${openfx_FIND_VERSION_MINOR}_TAG"
-            GIT_CONFIG advice.detachedHead=false
-            GIT_SHALLOW TRUE
-            PREFIX "${_EXT_BUILD_ROOT}/openfx"
-            BUILD_BYPRODUCTS ${openfx_INCLUDE_DIR}
-            CONFIGURE_COMMAND ""
-            BUILD_COMMAND
-                ${CMAKE_COMMAND} -E copy_directory
-                "${_EXT_BUILD_ROOT}/openfx/src/openfx_install/include"
-                "${openfx_INCLUDE_DIR}"
-            INSTALL_COMMAND ""
-            CMAKE_ARGS ${openfx_CMAKE_ARGS}
-            EXCLUDE_FROM_ALL TRUE
-        )
-
-        add_dependencies(openfx::module openfx_install)
-        message(STATUS "Installing openfx: ${openfx_INCLUDE_DIR} (version \"${openfx_VERSION}\")")
-    endif()
+if(openfx_FOUND AND NOT TARGET openfx::module)
+    add_library(openfx::module INTERFACE IMPORTED GLOBAL)
+    set(_openfx_TARGET_CREATE TRUE)
 endif()
 
 ###############################################################################
diff --git a/share/cmake/modules/Findpybind11.cmake b/share/cmake/modules/Findpybind11.cmake
index e6c2f35a86..ccbb53e656 100644
--- a/share/cmake/modules/Findpybind11.cmake
+++ b/share/cmake/modules/Findpybind11.cmake
@@ -1,20 +1,23 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 #
-# Locate or install pybind11
+# Locate pybind11
 #
 # Variables defined by this module:
 #   pybind11_FOUND - If FALSE, do not try to link to pybind11
 #   pybind11_INCLUDE_DIR - Where to find pybind11.h
 #   pybind11_VERSION - The version of the library
 #
-# Targets defined by this module:
-#   pybind11::module - IMPORTED target, if found
+# Global targets defined by this module:
+#   pybind11::module
 #
-# If pybind11 is not installed in a standard path, you can use the 
-# pybind11_ROOT variable to tell CMake where to find it. If it is not found 
-# and OCIO_INSTALL_EXT_PACKAGES is set to MISSING or ALL, pybind11 will be 
-# downloaded at build time.
+# Usually CMake will use the dynamic library rather than static, if both are present. 
+#
+# If the library is not installed in a typical location where CMake will find it, you may specify 
+# the location using one of the following methods:
+# -- Set -Dpybind11_DIR to point to the directory containing the CMake configuration file for the package.
+# -- Set -Dpybind11_ROOT to point to the directory containing the lib and include directories.
+# -- Set -Dpybind11_LIBRARY and -Dpybind11_INCLUDE_DIR to point to the lib and include directories.
 #
 
 ###############################################################################
@@ -126,94 +129,11 @@ endif()
 ###############################################################################
 ### Create target ###
 
-if(NOT TARGET pybind11::module)
+if(pybind11_FOUND AND NOT TARGET pybind11::module)
     add_library(pybind11::module INTERFACE IMPORTED GLOBAL)
     set(_pybind11_TARGET_CREATE TRUE)
 endif()
 
-###############################################################################
-### Install package from source ###
-
-if(NOT pybind11_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
-    include(ExternalProject)
-    include(GNUInstallDirs)
-
-    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
-    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
-
-    # Set find_package standard args
-    set(pybind11_FOUND TRUE)
-    set(pybind11_VERSION ${pybind11_FIND_VERSION})
-    set(pybind11_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}")
-
-    if(_pybind11_TARGET_CREATE)
-        # Hack to let imported target be built from ExternalProject_Add
-        file(MAKE_DIRECTORY ${pybind11_INCLUDE_DIR})
-
-        set(pybind11_CMAKE_ARGS
-            ${pybind11_CMAKE_ARGS}
-            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
-            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
-            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
-            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
-            -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
-            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
-            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
-            # Using FindPython mode (PYBIND11_FINDPYTHON=ON) doesn't seem to
-            # work when building on docker manylinux images where Development
-            # component is not available but is hardcoded in pybind11 script.
-            -DPYTHON_EXECUTABLE=${Python_EXECUTABLE}
-            -DPYBIND11_INSTALL=ON
-            -DPYBIND11_TEST=OFF
-        )
-
-        if(CMAKE_TOOLCHAIN_FILE)
-            set(pybind11_CMAKE_ARGS
-                ${pybind11_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
-        endif()
-
-        if(APPLE)
-            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
-
-            set(pybind11_CMAKE_ARGS
-                ${pybind11_CMAKE_ARGS}
-                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
-                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
-            )
-        endif()
-
-        if (ANDROID)
-            set(pybind11_CMAKE_ARGS
-                ${pybind11_CMAKE_ARGS}
-                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
-                -DANDROID_ABI=${ANDROID_ABI}
-                -DANDROID_STL=${ANDROID_STL})
-        endif()
-
-        ExternalProject_Add(pybind11_install
-            GIT_REPOSITORY "https://github.com/pybind/pybind11.git"
-            GIT_TAG "v${pybind11_FIND_VERSION}"
-            GIT_CONFIG advice.detachedHead=false
-            GIT_SHALLOW TRUE
-            PREFIX "${_EXT_BUILD_ROOT}/pybind11"
-            BUILD_BYPRODUCTS ${pybind11_INCLUDE_DIR}
-            CMAKE_ARGS ${pybind11_CMAKE_ARGS}
-            EXCLUDE_FROM_ALL TRUE
-            BUILD_COMMAND ""
-            INSTALL_COMMAND
-                ${CMAKE_COMMAND} --build .
-                                 --config ${CMAKE_BUILD_TYPE}
-                                 --target install
-                                 --parallel
-        )
-
-        add_dependencies(pybind11::module pybind11_install)
-        message(STATUS "Installing pybind11: ${pybind11_INCLUDE_DIR} (version \"${pybind11_VERSION}\")")
-    endif()
-endif()
-
 ###############################################################################
 ### Configure target ###
 
diff --git a/share/cmake/modules/Findpystring.cmake b/share/cmake/modules/Findpystring.cmake
index bbd5e1fd27..c466115476 100644
--- a/share/cmake/modules/Findpystring.cmake
+++ b/share/cmake/modules/Findpystring.cmake
@@ -1,26 +1,25 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 #
-# Locate or install pystring
+# Locate pystring
 #
 # Variables defined by this module:
-#   pystring_FOUND - If FALSE, do not try to link to pystring
-#   pystring_LIBRARY - Where to find pystring
-#   pystring_INCLUDE_DIR - Where to find pystring.h
+#   pystring_FOUND          - Indicate whether the library was found or not
+#   pystring_LIBRARY        - Path to the library file
+#   pystring_INCLUDE_DIR    - Location of the header files
+#   pystring_VERSION        - Library's version
 #
-# Targets defined by this module:
+# Global targets defined by this module:
 #   pystring::pystring - IMPORTED target, if found
 #
-# If pystring is not installed in a standard path, you can use the 
-# pystring_ROOT variable to tell CMake where to find it. If it is not found 
-# and OCIO_INSTALL_EXT_PACKAGES is set to MISSING or ALL, pystring will be 
-# downloaded, built, and statically-linked into libOpenColorIO at build time.
+# Usually CMake will use the dynamic library rather than static, if both are present. 
+#
+# If the library is not installed in a typical location where CMake will find it, you may specify 
+# the location using one of the following methods:
+# -- Set -Dpystring_ROOT to point to the directory containing the lib and include directories.
+# -- Set -Dpystring_LIBRARY and -Dpystring_INCLUDE_DIR to point to the lib and include directories.
 #
 
-if(NOT TARGET pystring::pystring)
-    add_library(pystring::pystring UNKNOWN IMPORTED GLOBAL)
-    set(_pystring_TARGET_CREATE TRUE)
-endif()
 
 ###############################################################################
 ### Try to find package ###
@@ -64,96 +63,11 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
 endif()
 
 ###############################################################################
-### Install package from source ###
-
-if(NOT pystring_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
-    include(ExternalProject)
-
-    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
-    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
-
-    # Set find_package standard args
-    set(pystring_FOUND TRUE)
-    set(pystring_VERSION ${pystring_FIND_VERSION})
-    set(pystring_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}")
-
-    set(pystring_LIBRARY 
-        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}pystring${CMAKE_STATIC_LIBRARY_SUFFIX}")
-
-    if(_pystring_TARGET_CREATE)
-        if(MSVC)
-            set(pystring_CXX_FLAGS "${pystring_CXX_FLAGS} /EHsc")
-        endif()
-
-        string(STRIP "${pystring_CXX_FLAGS}" pystring_CXX_FLAGS)
-
-        set(pystring_CMAKE_ARGS
-            ${pystring_CMAKE_ARGS}
-            -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
-            -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
-            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-            -DCMAKE_CXX_FLAGS=${pystring_CXX_FLAGS}
-            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
-            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
-            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
-            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
-            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
-            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
-        )
-
-        if(CMAKE_TOOLCHAIN_FILE)
-            set(pystring_CMAKE_ARGS
-                ${pystring_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
-        endif()
-
-        if(APPLE)
-            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
-
-            set(pystring_CMAKE_ARGS
-                ${pystring_CMAKE_ARGS}
-                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
-                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
-            )
-        endif()
-
-
-        if (ANDROID)
-            set(pystring_CMAKE_ARGS
-                ${pystring_CMAKE_ARGS}
-                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
-                -DANDROID_ABI=${ANDROID_ABI}
-                -DANDROID_STL=${ANDROID_STL})
-        endif()
-
-        # Hack to let imported target be built from ExternalProject_Add
-        file(MAKE_DIRECTORY ${pystring_INCLUDE_DIR})
-
-        ExternalProject_Add(pystring_install
-            GIT_REPOSITORY "https://github.com/imageworks/pystring.git"
-            GIT_TAG "v${pystring_FIND_VERSION}"
-            GIT_CONFIG advice.detachedHead=false
-            GIT_SHALLOW TRUE
-            PREFIX "${_EXT_BUILD_ROOT}/pystring"
-            BUILD_BYPRODUCTS ${pystring_LIBRARY}
-            CMAKE_ARGS ${pystring_CMAKE_ARGS}
-            EXCLUDE_FROM_ALL TRUE
-            PATCH_COMMAND
-                ${CMAKE_COMMAND} -E copy
-                "${CMAKE_SOURCE_DIR}/share/cmake/projects/Buildpystring.cmake"
-                "CMakeLists.txt"
-            BUILD_COMMAND ""
-            INSTALL_COMMAND
-                ${CMAKE_COMMAND} --build .
-                                 --config ${CMAKE_BUILD_TYPE}
-                                 --target install
-                                 --parallel
-        )
+### Configure target ###
 
-        add_dependencies(pystring::pystring pystring_install)
-        message(STATUS "Installing pystring: ${pystring_LIBRARY} (version \"${pystring_VERSION}\")")
-    endif()
+if(pystring_FOUND AND NOT TARGET pystring::pystring)
+    add_library(pystring::pystring UNKNOWN IMPORTED GLOBAL)
+    set(_pystring_TARGET_CREATE TRUE)
 endif()
 
 ###############################################################################
@@ -166,4 +80,4 @@ if(_pystring_TARGET_CREATE)
     )
 
     mark_as_advanced(pystring_INCLUDE_DIR pystring_LIBRARY pystring_VERSION)
-endif()
+endif()
\ No newline at end of file
diff --git a/share/cmake/modules/Findyaml-cpp.cmake b/share/cmake/modules/Findyaml-cpp.cmake
index 59558ce6a9..55e25e2c84 100644
--- a/share/cmake/modules/Findyaml-cpp.cmake
+++ b/share/cmake/modules/Findyaml-cpp.cmake
@@ -1,25 +1,27 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 #
-# Locate or install yaml-cpp
+# Locate yaml-cpp
 #
 # Variables defined by this module:
-#   yaml-cpp_FOUND - If FALSE, do not try to link to yamlcpp
-#   yaml-cpp_LIBRARY - yaml-cpp library to link to
-#   yaml-cpp_INCLUDE_DIR - Where to find yaml.h
-#   yaml-cpp_VERSION - The version of the library
+#   yaml-cpp_FOUND          - Indicate whether the library was found or not
+#   yaml-cpp_LIBRARY        - Path to the library file
+#   yaml-cpp_INCLUDE_DIR    - Location of the header files
+#   yaml-cpp_VERSION        - Library's version
 #
-# Targets defined by this module:
-#   yaml-cpp - IMPORTED target, if found
+# Global targets defined by this module:
+#   yaml-cpp
 #
-# By default, the dynamic libraries of yaml-cpp will be found. To find the 
-# static ones instead, you must set the yaml-cpp_STATIC_LIBRARY variable to 
-# TRUE before calling find_package(yaml-cpp ...).
+# Usually CMake will use the dynamic library rather than static, if both are present. 
+# In this case, you may set yaml-cpp_STATIC_LIBRARY to ON to request use of the static one. 
+# If only the static library is present (such as when OCIO builds the dependency), then the option 
+# is not needed.
 #
-# If yaml-cpp is not installed in a standard path, you can use the 
-# yaml-cpp_ROOT variable to tell CMake where to find it. If it is not found 
-# and OCIO_INSTALL_EXT_PACKAGES is set to MISSING or ALL, yaml-cpp will be 
-# downloaded, built, and statically-linked into libOpenColorIO at build time.
+# If the library is not installed in a typical location where CMake will find it, you may specify 
+# the location using one of the following methods:
+# -- Set -Dyaml-cpp_DIR to point to the directory containing the CMake configuration file for the package.
+# -- Set -Dyaml-cpp_ROOT to point to the directory containing the lib and include directories.
+# -- Set -Dyaml-cpp_LIBRARY and -Dyaml-cpp_INCLUDE_DIR to point to the lib and include directories.
 #
 
 ###############################################################################
@@ -128,128 +130,13 @@ if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
 endif()
 
 ###############################################################################
-### Create target (if previous 'find_package' call hasn't) ###
+### Create target
 
-if(NOT TARGET yaml-cpp)
+if(yaml-cpp_FOUND AND NOT TARGET yaml-cpp)
     add_library(yaml-cpp UNKNOWN IMPORTED GLOBAL)
     set(_yaml-cpp_TARGET_CREATE TRUE)
 endif()
 
-###############################################################################
-### Install package from source ###
-
-if(NOT yaml-cpp_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
-    include(ExternalProject)
-    include(GNUInstallDirs)
-
-    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
-    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
-
-    # Set find_package standard args
-    set(yaml-cpp_FOUND TRUE)
-    set(yaml-cpp_VERSION ${yaml-cpp_FIND_VERSION})
-    set(yaml-cpp_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}")
-
-    # Starting from 0.7.0, this is included on all platforms, we could also
-    # override CMAKE_DEBUG_POSTFIX to bypass it.
-    if(BUILD_TYPE_DEBUG)
-        string(APPEND _yaml-cpp_LIB_SUFFIX "d")
-    endif()
-
-    # Set the expected library name
-    set(yaml-cpp_LIBRARY
-        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}yaml-cpp${_yaml-cpp_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
-
-    if(_yaml-cpp_TARGET_CREATE)
-        if(MSVC)
-            set(yaml-cpp_CXX_FLAGS "${yaml-cpp_CXX_FLAGS} /EHsc")
-        endif()
-
-        if(UNIX)
-            if(USE_CLANG)
-                # Remove some global 'shadow' warnings.
-                set(yaml-cpp_CXX_FLAGS "${yaml-cpp_CXX_FLAGS} -Wno-shadow")
-            endif()
-        endif()
-
-        string(STRIP "${yaml-cpp_CXX_FLAGS}" yaml-cpp_CXX_FLAGS)
-
-        set(yaml-cpp_CMAKE_ARGS
-            ${yaml-cpp_CMAKE_ARGS}
-            -DCMAKE_POLICY_DEFAULT_CMP0063=NEW
-            -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
-            -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
-            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-            -DCMAKE_CXX_FLAGS=${yaml-cpp_CXX_FLAGS}
-            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
-            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
-            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
-            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
-            -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
-            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
-            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
-            -DBUILD_SHARED_LIBS=OFF
-            -DYAML_BUILD_SHARED_LIBS=OFF
-            -DYAML_CPP_BUILD_TESTS=OFF
-            -DYAML_CPP_BUILD_TOOLS=OFF
-            -DYAML_CPP_BUILD_CONTRIB=OFF
-        )
-
-        if(CMAKE_TOOLCHAIN_FILE)
-            set(yaml-cpp_CMAKE_ARGS
-                ${yaml-cpp_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
-        endif()
-
-        if(APPLE)
-            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
-
-            set(yaml-cpp_CMAKE_ARGS
-                ${yaml-cpp_CMAKE_ARGS}
-                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
-                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
-            )
-        endif()
-
-
-        if (ANDROID)
-            set(yaml-cpp_CMAKE_ARGS
-                ${yaml-cpp_CMAKE_ARGS}
-                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
-                -DANDROID_ABI=${ANDROID_ABI}
-                -DANDROID_STL=${ANDROID_STL})
-        endif()
-
-        set(yaml-cpp_GIT_TAG "yaml-cpp-${yaml-cpp_VERSION}")
-
-        # Hack to let imported target be built from ExternalProject_Add
-        file(MAKE_DIRECTORY ${yaml-cpp_INCLUDE_DIR})
-
-        ExternalProject_Add(yaml-cpp_install
-            GIT_REPOSITORY "https://github.com/jbeder/yaml-cpp.git"
-            GIT_TAG ${yaml-cpp_GIT_TAG}
-            GIT_CONFIG advice.detachedHead=false
-            GIT_SHALLOW TRUE
-            PREFIX "${_EXT_BUILD_ROOT}/yaml-cpp"
-            BUILD_BYPRODUCTS ${yaml-cpp_LIBRARY}
-            CMAKE_ARGS ${yaml-cpp_CMAKE_ARGS}
-            EXCLUDE_FROM_ALL TRUE
-            BUILD_COMMAND ""
-            INSTALL_COMMAND
-                ${CMAKE_COMMAND} --build .
-                                 --config ${CMAKE_BUILD_TYPE}
-                                 --target install
-                                 --parallel
-        )
-
-        add_dependencies(yaml-cpp yaml-cpp_install)
-        message(STATUS
-            "Installing yaml-cpp: ${yaml-cpp_LIBRARY} (version \"${yaml-cpp_VERSION}\")"
-        )
-    endif()
-endif()
-
 ###############################################################################
 ### Configure target ###
 
@@ -260,4 +147,4 @@ if(_yaml-cpp_TARGET_CREATE)
     )
 
     mark_as_advanced(yaml-cpp_INCLUDE_DIR yaml-cpp_LIBRARY yaml-cpp_VERSION)
-endif()
+endif()
\ No newline at end of file
diff --git a/share/cmake/modules/install/InstallImath.cmake b/share/cmake/modules/install/InstallImath.cmake
new file mode 100644
index 0000000000..4f11d7d0a9
--- /dev/null
+++ b/share/cmake/modules/install/InstallImath.cmake
@@ -0,0 +1,154 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Install Imath
+#
+# Variables defined by this module:
+#   Imath_FOUND          - Indicate whether the library was found or not
+#   Imath_LIBRARY        - Path to the library file
+#   Imath_INCLUDE_DIR    - Location of the header files
+#   Imath_VERSION        - Library's version
+#
+# Global targets defined by this module:
+#   Imath::Imath
+#   Imath::ImathConfig     
+#
+
+
+###############################################################################
+### Create target
+
+if (NOT TARGET Imath::Imath)
+    add_library(Imath::Imath UNKNOWN IMPORTED GLOBAL)
+    add_library(Imath::ImathConfig INTERFACE IMPORTED GLOBAL)
+    set(_Imath_TARGET_CREATE TRUE)
+endif()
+
+###############################################################################
+### Install package from source ###
+
+if(NOT Imath_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+    include(ExternalProject)
+    include(GNUInstallDirs)
+
+    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
+    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
+
+    # Set find_package standard args
+    set(Imath_FOUND TRUE)
+    if(OCIO_Imath_RECOMMENDED_VERSION)
+        set(Imath_VERSION ${OCIO_Imath_RECOMMENDED_VERSION})
+    else()
+        set(Imath_VERSION ${Imath_FIND_VERSION})
+    endif()
+    set(Imath_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}")
+
+    # Set the expected library name
+    if(BUILD_TYPE_DEBUG)
+        set(_Imath_LIB_SUFFIX "_d")
+    endif()
+
+    include(VersionUtils)
+    split_version_string(${Imath_VERSION} _Imath_ExternalProject)
+
+    set(_Imath_LIB_VER "${_Imath_ExternalProject_VERSION_MAJOR}_${_Imath_ExternalProject_VERSION_MINOR}")
+
+    set(Imath_LIBRARY
+        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}Imath-${_Imath_LIB_VER}${_Imath_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+
+    if(_Imath_TARGET_CREATE)
+        if(MSVC)
+            set(Imath_CXX_FLAGS "${Imath_CXX_FLAGS} /EHsc")
+        endif()
+
+        string(STRIP "${Imath_CXX_FLAGS}" Imath_CXX_FLAGS)
+ 
+        set(Imath_CMAKE_ARGS
+            ${Imath_CMAKE_ARGS}
+            -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
+            -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+            -DCMAKE_CXX_FLAGS=${Imath_CXX_FLAGS}
+            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
+            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
+            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
+            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
+            -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
+            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
+            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
+            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
+            -DBUILD_SHARED_LIBS=OFF
+            -DBUILD_TESTING=OFF
+            -DPYTHON=OFF
+            -DDOCS=OFF
+            -DIMATH_HALF_USE_LOOKUP_TABLE=OFF
+        )
+
+        if(CMAKE_TOOLCHAIN_FILE)
+            set(Imath_CMAKE_ARGS
+                ${Imath_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
+        endif()
+
+        if(APPLE)
+            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
+
+            set(Imath_CMAKE_ARGS
+                ${Imath_CMAKE_ARGS}
+                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
+                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
+            )
+        endif()
+
+        if (ANDROID)
+            set(Imath_CMAKE_ARGS
+                ${Imath_CMAKE_ARGS}
+                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
+                -DANDROID_ABI=${ANDROID_ABI}
+                -DANDROID_STL=${ANDROID_STL})
+        endif()
+
+        # Hack to let imported target be built from ExternalProject_Add
+        file(MAKE_DIRECTORY ${Imath_INCLUDE_DIR})
+
+        ExternalProject_Add(imath_install
+            GIT_REPOSITORY "https://github.com/AcademySoftwareFoundation/Imath.git"
+            GIT_TAG "v${Imath_VERSION}"
+            GIT_CONFIG advice.detachedHead=false
+            GIT_SHALLOW TRUE
+            PREFIX "${_EXT_BUILD_ROOT}/Imath"
+            BUILD_BYPRODUCTS ${Imath_LIBRARY}
+            CMAKE_ARGS ${Imath_CMAKE_ARGS}
+            EXCLUDE_FROM_ALL TRUE
+            BUILD_COMMAND ""
+            INSTALL_COMMAND
+                ${CMAKE_COMMAND} --build .
+                                 --config ${CMAKE_BUILD_TYPE}
+                                 --target install
+                                 --parallel
+        )
+
+        add_dependencies(Imath::Imath imath_install)
+
+        if(OCIO_VERBOSE)
+            message(STATUS "Installing Imath: ${Imath_LIBRARY} (version \"${Imath_VERSION}\")")
+        endif()
+    endif()
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(_Imath_TARGET_CREATE)
+    file(MAKE_DIRECTORY ${Imath_INCLUDE_DIR}/Imath)
+
+    set_target_properties(Imath::Imath PROPERTIES
+        IMPORTED_LOCATION ${Imath_LIBRARY}
+        INTERFACE_INCLUDE_DIRECTORIES "${Imath_INCLUDE_DIR};${Imath_INCLUDE_DIR}/Imath"
+    )
+    set_target_properties(Imath::ImathConfig PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES "${Imath_INCLUDE_DIR};${Imath_INCLUDE_DIR}/Imath"
+    )
+
+    mark_as_advanced(Imath_INCLUDE_DIR Imath_LIBRARY Imath_VERSION)
+endif()
diff --git a/share/cmake/modules/FindOpenEXR.cmake b/share/cmake/modules/install/InstallOpenEXR.cmake
similarity index 75%
rename from share/cmake/modules/FindOpenEXR.cmake
rename to share/cmake/modules/install/InstallOpenEXR.cmake
index 426305dab4..419fb4aa59 100644
--- a/share/cmake/modules/FindOpenEXR.cmake
+++ b/share/cmake/modules/install/InstallOpenEXR.cmake
@@ -1,13 +1,13 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 #
-# Locate or install OpenEXR.
+# Install OpenEXR
 #
 # Variables defined by this module:
-#   OpenEXR_FOUND - If FALSE, do not try to link to OpenEXR
-#   OpenEXR_LIBRARY - OpenEXR library to link to
-#   OpenEXR_INCLUDE_DIR - Where to find OpenEXR headers
-#   OpenEXR_VERSION - The version of the library
+#   OpenEXR_FOUND          - Indicate whether the library was found or not
+#   OpenEXR_LIBRARY        - Path to the library file
+#   OpenEXR_INCLUDE_DIR    - Location of the header files
+#   OpenEXR_VERSION        - Library's version
 #
 # Imported targets defined by this module, if found:
 #   OpenEXR::Iex
@@ -19,51 +19,10 @@
 #   OpenEXR::OpenEXRCore
 #   OpenEXR::OpenEXRUtil
 #
-# By default, the dynamic libraries of OpenEXR will be found. To find the
-# static ones instead, you must set the OpenEXR_STATIC_LIBRARY variable to
-# TRUE before calling find_package(OpenEXR ...).
+# Depending on user options when configuring OCIO, OpenEXR can be used to
+# build libOpenColorIOimageioapphelpers.
 #
-# If OpenEXR is not installed in a standard path, you can use the
-# OpenEXR_ROOT variable to tell CMake where to find it. If it is not found
-# and OCIO_INSTALL_EXT_PACKAGES is set to MISSING or ALL, OpenEXR will be
-# downloaded, statically-built, and used to generate
-# libOpenColorIOimageioapphelpers.
-#
-
-###############################################################################
-### Try to find package ###
-
-if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
-    set(_OpenEXR_REQUIRED_VARS OpenEXR_LIBRARY)
-    set(_OpenEXR_LIB_VER "${OpenEXR_FIND_VERSION_MAJOR}_${OpenEXR_FIND_VERSION_MINOR}")
 
-    if(NOT DEFINED OpenEXR_ROOT)
-        # Search for OpenEXRConfig.cmake
-        find_package(OpenEXR ${OpenEXR_FIND_VERSION} CONFIG QUIET)
-    endif()
-
-    if(OpenEXR_FOUND)
-        get_target_property(OpenEXR_LIBRARY OpenEXR::OpenEXR LOCATION)
-        get_target_property(OpenEXR_INCLUDE_DIR OpenEXR::OpenEXR INTERFACE_INCLUDE_DIRECTORIES)
-        
-        # IMPORTED_GLOBAL property must be set to TRUE since alisasing a non-global imported target
-        # is not possible until CMake 3.18+.
-        set_target_properties(OpenEXR::OpenEXR PROPERTIES IMPORTED_GLOBAL TRUE)
-    endif()
-
-    # Override REQUIRED if package can be installed
-    if(OCIO_INSTALL_EXT_PACKAGES STREQUAL MISSING)
-        set(OpenEXR_FIND_REQUIRED FALSE)
-    endif()
-
-    include(FindPackageHandleStandardArgs)
-    find_package_handle_standard_args(OpenEXR
-        REQUIRED_VARS
-            ${_OpenEXR_REQUIRED_VARS}
-        VERSION_VAR
-            OpenEXR_VERSION
-    )
-endif()
 
 ###############################################################################
 ### Create target
@@ -89,15 +48,13 @@ if(NOT OpenEXR_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACK
     set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
 
     # Required dependency
+    # Not trying to find ZLIB here since it is a required dependency of OCIO.
     if(NOT ZLIB_FOUND)
-        find_package(ZLIB)
-        if(NOT ZLIB_FOUND)
-            message(STATUS "ZLib is required to build OpenEXR.")
-            return()
-        endif()
+        message(STATUS "ZLIB is required to build OpenEXR.")
+        return()
     endif()
     
-    find_package(Threads)
+    ocio_handle_dependency(Threads)
     if(NOT Threads_FOUND)
         message(STATUS "Threads is required to build OpenEXR.")
         return()
@@ -105,8 +62,8 @@ if(NOT OpenEXR_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACK
 
     # Set find_package standard args
     set(OpenEXR_FOUND TRUE)
-    if(_OpenEXR_ExternalProject_VERSION)
-        set(OpenEXR_VERSION ${_OpenEXR_ExternalProject_VERSION})
+    if(OCIO_OpenEXR_RECOMMENDED_VERSION)
+        set(OpenEXR_VERSION ${OCIO_OpenEXR_RECOMMENDED_VERSION})
     else()
         set(OpenEXR_VERSION ${OpenEXR_FIND_VERSION})
     endif()
@@ -118,9 +75,9 @@ if(NOT OpenEXR_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACK
     endif()
 
     include(VersionUtils)
-    split_version_string(${OpenEXR_VERSION} _OpenEXR_ExternalProject)
+    split_version_string(${OpenEXR_VERSION} _OpenEXR)
 
-    set(_OpenEXR_LIB_VER "${_OpenEXR_ExternalProject_VERSION_MAJOR}_${_OpenEXR_ExternalProject_VERSION_MINOR}")
+    set(_OpenEXR_LIB_VER "${_OpenEXR_VERSION_MAJOR}_${_OpenEXR_VERSION_MINOR}")
 
     set_target_location(Iex)
     set_target_location(IlmThread)
@@ -220,7 +177,9 @@ if(NOT OpenEXR_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACK
 
         add_dependencies(OpenEXR::OpenEXR openexr_install)
 
-        message(STATUS "Installing OpenEXR: ${OpenEXR_LIBRARY} (version \"${OpenEXR_VERSION}\")")
+        if(OCIO_VERBOSE)
+            message(STATUS "Installing OpenEXR: ${OpenEXR_LIBRARY} (version \"${OpenEXR_VERSION}\")")
+        endif()
     endif()
 endif()
 
diff --git a/share/cmake/modules/InstallZLIB.cmake b/share/cmake/modules/install/InstallZLIB.cmake
similarity index 78%
rename from share/cmake/modules/InstallZLIB.cmake
rename to share/cmake/modules/install/InstallZLIB.cmake
index 954563ec4e..6aa3261fb9 100644
--- a/share/cmake/modules/InstallZLIB.cmake
+++ b/share/cmake/modules/install/InstallZLIB.cmake
@@ -7,17 +7,15 @@
 # CMake's FindZLIB when installing ZLIB manually.
 #
 # Variables defined by this module:
-#   ZLIB_FOUND          - If FALSE, do not try to link to zlib
-#   ZLIB_LIBRARIES      - ZLIB library to link to
-#   ZLIB_INCLUDE_DIRS   - Where to find zlib.h and other headers
-#   ZLIB_VERSION        - The version of the library
+#   ZLIB_FOUND          - Indicate whether the library was found or not
+#   ZLIB_LIBRARY        - Path to the library file
+#   ZLIB_INCLUDE_DIR    - Location of the header files
+#   ZLIB_VERSION        - Library's version
 #
-# Targets defined by this module:
-#   ZLIB::ZLIB          - Properties:
-#                         IMPORTED_LOCATION ${ZLIB_LIBRARIES}
-#                         INTERFACE_INCLUDE_DIRECTORIES ${ZLIB_INCLUDE_DIRS}
+# Global targets defined by this module:
+#   ZLIB::ZLIB
 #
-###############################################################################
+
 
 ###############################################################################
 ### Create target
@@ -47,8 +45,8 @@ if(NOT ZLIB_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGE
 
     # Set find_package standard args
     set(ZLIB_FOUND TRUE)
-    if(_ZLIB_ExternalProject_VERSION)
-        set(ZLIB_VERSION ${_ZLIB_ExternalProject_VERSION})
+    if(OCIO_ZLIB_RECOMMENDED_VERSION)
+        set(ZLIB_VERSION ${OCIO_ZLIB_RECOMMENDED_VERSION})
     else()
         set(ZLIB_VERSION ${ZLIB_FIND_VERSION})
     endif()
@@ -70,6 +68,10 @@ if(NOT ZLIB_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGE
     if(_ZLIB_TARGET_CREATE)
         set(ZLIB_CMAKE_ARGS
             ${ZLIB_CMAKE_ARGS}
+            # Setting policy CMP0042 when building ZLIB since that project is using an old CMake 
+            # version as the cmake_minimum_required and that version has no knowledge of the policy.
+            # Since that policy gets unset, it causes a warning with CMake 3.25+.
+            -DCMAKE_POLICY_DEFAULT_CMP0042=NEW
             -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
             -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
@@ -134,11 +136,17 @@ if(NOT ZLIB_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGE
 
     add_dependencies(ZLIB::ZLIB ZLIB_install)
     
-    # Setting those variables to follow the same naming as the other OCIO custom find modules.
-    set(ZLIB_LIBRARY ${ZLIB_LIBRARIES})
-    set(ZLIB_INCLUDE_DIR ${ZLIB_INCLUDE_DIRS})
-
-    message(STATUS "Installing ZLIB: ${ZLIB_LIBRARIES} (version \"${ZLIB_VERSION}\")")
+    # FindZLIB from CMake needs ZLIB_LIBRARY and ZLIB_INCLUDE_DIR.
+    # Starting CMake 3.17+, The mark_as_advanced() command no longer creates a cache entry if one 
+    # does not already exist. (See policy CMP0102) 
+    # For that reasons, ZLIB_LIBRARY and ZLIB_INCLUDE_DIR are set in the CMake's Cache since ZLIB 
+    # is needed by other OCIO dependencies.
+    set(ZLIB_LIBRARY ${ZLIB_LIBRARIES} CACHE STRING "ZLIB library file")
+    set(ZLIB_INCLUDE_DIR ${ZLIB_INCLUDE_DIRS} CACHE STRING "ZLIB includes directory")
+    
+    if(OCIO_VERBOSE)
+        message(STATUS "Installing ZLIB: ${ZLIB_LIBRARIES} (version \"${ZLIB_VERSION}\") ")
+    endif()
 endif()
 
 
diff --git a/share/cmake/modules/install/Installexpat.cmake b/share/cmake/modules/install/Installexpat.cmake
new file mode 100644
index 0000000000..3908d60cc9
--- /dev/null
+++ b/share/cmake/modules/install/Installexpat.cmake
@@ -0,0 +1,161 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Install expat
+#
+# Variables defined by this module:
+#   expat_FOUND          - Indicate whether the library was found or not
+#   expat_LIBRARY        - Path to the library file
+#   expat_INCLUDE_DIR    - Location of the header files
+#   expat_VERSION        - Library's version
+#
+# Global targets defined by this module:
+#   expat::expat         
+#
+
+
+###############################################################################
+### Create target
+
+if(NOT TARGET expat::expat)
+    add_library(expat::expat UNKNOWN IMPORTED GLOBAL)
+    set(_expat_TARGET_CREATE TRUE)
+endif()
+
+###############################################################################
+### Install package from source ###
+
+if(NOT expat_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+    include(ExternalProject)
+    include(GNUInstallDirs)
+
+    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
+    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
+
+    # Set find_package standard args
+    set(expat_FOUND TRUE)
+    if(OCIO_expat_RECOMMENDED_VERSION)
+        set(expat_VERSION ${OCIO_expat_RECOMMENDED_VERSION})
+    else()
+        set(expat_VERSION ${expat_FIND_VERSION})
+    endif()
+    set(expat_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}")
+
+    # Set the expected library name
+    if(WIN32)
+        if(BUILD_TYPE_DEBUG)
+            set(_expat_LIB_SUFFIX "d")
+        endif()
+        # Static Linking, Multi-threaded Dll naming (>=2.2.8):
+        #   https://github.com/libexpat/libexpat/blob/R_2_2_8/expat/win32/README.txt
+        set(_expat_LIB_SUFFIX "${_expat_LIB_SUFFIX}MD")
+    endif()
+
+    # Expat use a hardcoded lib prefix instead of CMAKE_STATIC_LIBRARY_PREFIX
+    # https://github.com/libexpat/libexpat/blob/R_2_4_1/expat/CMakeLists.txt#L374
+    set(_expat_LIB_PREFIX "lib")
+
+    set(expat_LIBRARY
+        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${_expat_LIB_PREFIX}expat${_expat_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+
+    if(_expat_TARGET_CREATE)
+        if(MSVC)
+            set(EXPAT_C_FLAGS "${EXPAT_C_FLAGS} /EHsc")
+            set(EXPAT_CXX_FLAGS "${EXPAT_CXX_FLAGS} /EHsc")
+        endif()
+
+        string(STRIP "${EXPAT_C_FLAGS}" EXPAT_C_FLAGS)
+        string(STRIP "${EXPAT_CXX_FLAGS}" EXPAT_CXX_FLAGS)
+
+        set(EXPAT_CMAKE_ARGS
+            ${EXPAT_CMAKE_ARGS}
+            -DCMAKE_POLICY_DEFAULT_CMP0063=NEW
+            -DCMAKE_C_VISIBILITY_PRESET=${CMAKE_C_VISIBILITY_PRESET}
+            -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
+            -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+            -DCMAKE_C_FLAGS=${EXPAT_C_FLAGS}
+            -DCMAKE_CXX_FLAGS=${EXPAT_CXX_FLAGS}
+            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
+            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
+            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
+            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
+            -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
+            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
+            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
+            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
+            -DEXPAT_BUILD_DOCS=OFF
+            -DEXPAT_BUILD_EXAMPLES=OFF
+            -DEXPAT_BUILD_TESTS=OFF
+            -DEXPAT_BUILD_TOOLS=OFF
+            -DEXPAT_SHARED_LIBS=OFF
+        )
+
+        if(CMAKE_TOOLCHAIN_FILE)
+            set(EXPAT_CMAKE_ARGS
+                ${EXPAT_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
+        endif()
+
+        if(APPLE)
+            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
+
+            set(EXPAT_CMAKE_ARGS
+                ${EXPAT_CMAKE_ARGS}
+                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
+                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
+            )
+        endif()
+
+        if (ANDROID)
+            set(EXPAT_CMAKE_ARGS
+                ${EXPAT_CMAKE_ARGS}
+                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
+                -DANDROID_ABI=${ANDROID_ABI}
+                -DANDROID_STL=${ANDROID_STL})
+        endif()
+
+        # Hack to let imported target be built from ExternalProject_Add
+        file(MAKE_DIRECTORY ${expat_INCLUDE_DIR})
+
+        string(REPLACE "." ";" VERSION_LIST ${expat_VERSION})
+        list(GET VERSION_LIST 0 expat_VERSION_MAJOR)
+        list(GET VERSION_LIST 1 expat_VERSION_MINOR)
+        list(GET VERSION_LIST 2 expat_VERSION_PATCH)
+
+        ExternalProject_Add(expat_install
+            GIT_REPOSITORY "https://github.com/libexpat/libexpat.git"
+            GIT_TAG "R_${expat_VERSION_MAJOR}_${expat_VERSION_MINOR}_${expat_VERSION_PATCH}"
+            GIT_CONFIG advice.detachedHead=false
+            GIT_SHALLOW TRUE
+            PREFIX "${_EXT_BUILD_ROOT}/libexpat"
+            BUILD_BYPRODUCTS ${expat_LIBRARY}
+            SOURCE_SUBDIR expat
+            CMAKE_ARGS ${EXPAT_CMAKE_ARGS}
+            EXCLUDE_FROM_ALL TRUE
+            BUILD_COMMAND ""
+            INSTALL_COMMAND
+                ${CMAKE_COMMAND} --build .
+                                 --config ${CMAKE_BUILD_TYPE}
+                                 --target install
+                                 --parallel
+        )
+
+        add_dependencies(expat::expat expat_install)
+        if(OCIO_VERBOSE)
+            message(STATUS "Installing expat: ${expat_LIBRARY} (version \"${expat_VERSION}\")")
+        endif()
+    endif()
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(_expat_TARGET_CREATE)
+    set_target_properties(expat::expat PROPERTIES
+        IMPORTED_LOCATION ${expat_LIBRARY}
+        INTERFACE_INCLUDE_DIRECTORIES ${expat_INCLUDE_DIR}
+    )
+
+    mark_as_advanced(expat_INCLUDE_DIR expat_LIBRARY expat_VERSION)
+endif()
diff --git a/share/cmake/modules/install/Installlcms2.cmake b/share/cmake/modules/install/Installlcms2.cmake
new file mode 100644
index 0000000000..94303df7a5
--- /dev/null
+++ b/share/cmake/modules/install/Installlcms2.cmake
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Install lcms2
+#
+# Variables defined by this module:
+#   lcms2_FOUND          - Indicate whether the library was found or not
+#   lcms2_LIBRARY        - Path to the library file
+#   lcms2_INCLUDE_DIR    - Location of the header files
+#   lcms2_VERSION        - Library's version
+#
+# Global targets defined by this module:
+#   lcms2::lcms2  
+#
+
+
+###############################################################################
+### Configure target ###
+
+if(NOT TARGET lcms2::lcms2)
+    add_library(lcms2::lcms2 UNKNOWN IMPORTED GLOBAL)
+    set(_lcms2_TARGET_CREATE TRUE)
+endif()
+
+###############################################################################
+### Install package from source ###
+
+if(NOT lcms2_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+    include(ExternalProject)
+
+    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
+    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
+
+    # Set find_package standard args
+    set(lcms2_FOUND TRUE)
+    if(OCIO_lcms2_RECOMMENDED_VERSION)
+        set(lcms2_VERSION ${OCIO_lcms2_RECOMMENDED_VERSION})
+    else()
+        set(lcms2_VERSION ${lcms2_FIND_VERSION})
+    endif()
+    set(lcms2_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}/lcms2")
+    set(lcms2_LIBRARY 
+        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}lcms2${CMAKE_STATIC_LIBRARY_SUFFIX}")
+
+    if(_lcms2_TARGET_CREATE)
+        if(UNIX)
+            set(lcms2_C_FLAGS "${lcms2_C_FLAGS} -fvisibility=hidden -fPIC")
+        endif()
+
+        if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
+            set(lcms2_C_FLAGS "${lcms2_C_FLAGS} -Wno-aggressive-loop-optimizations")
+        endif()
+
+        string(STRIP "${lcms2_C_FLAGS}" lcms2_C_FLAGS)
+
+        # NOTE: Depending of the compiler version lcm2 2.2 does not compile with C++17 so revert 
+        # to C++11 because the library is only used by a cmd line tool.
+
+        set(lcms2_CXX_STANDARD ${CMAKE_CXX_STANDARD})
+        if(${CMAKE_CXX_STANDARD} GREATER_EQUAL 17)
+            set(lcms2_CXX_STANDARD 11)
+        endif()
+
+        set(lcms2_CMAKE_ARGS
+            ${lcms2_CMAKE_ARGS}
+            -DCMAKE_C_FLAGS=${lcms2_C_FLAGS}
+            -DCMAKE_CXX_STANDARD=${lcms2_CXX_STANDARD}
+            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
+            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
+            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
+            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
+            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
+            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
+            -DBUILD_SHARED_LIBS=OFF
+        )
+
+        if(CMAKE_TOOLCHAIN_FILE)
+            set(lcms2_CMAKE_ARGS
+                ${lcms2_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
+        endif()
+
+        if(APPLE)
+            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
+
+            set(lcms2_CMAKE_ARGS
+                ${lcms2_CMAKE_ARGS}
+                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
+                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
+            )
+        endif()
+
+        if (ANDROID)
+            set(lcms2_CMAKE_ARGS
+                ${lcms2_CMAKE_ARGS}
+                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
+                -DANDROID_ABI=${ANDROID_ABI}
+                -DANDROID_STL=${ANDROID_STL})
+        endif()
+
+        # Hack to let imported target be built from ExternalProject_Add
+        file(MAKE_DIRECTORY ${lcms2_INCLUDE_DIR})
+
+        ExternalProject_Add(lcms2_install
+            GIT_REPOSITORY "https://github.com/mm2/Little-CMS.git"
+            GIT_TAG "lcms${lcms2_VERSION}"
+            GIT_CONFIG advice.detachedHead=false
+            GIT_SHALLOW TRUE
+            PREFIX "${_EXT_BUILD_ROOT}/Little-CMS"
+            BUILD_BYPRODUCTS ${lcms2_LIBRARY}
+            CMAKE_ARGS ${lcms2_CMAKE_ARGS}
+            EXCLUDE_FROM_ALL TRUE
+            PATCH_COMMAND
+                ${CMAKE_COMMAND} -E copy
+                "${CMAKE_SOURCE_DIR}/share/cmake/projects/Buildlcms2.cmake"
+                "CMakeLists.txt"
+            BUILD_COMMAND ""
+            INSTALL_COMMAND
+                ${CMAKE_COMMAND} --build .
+                                 --config ${CMAKE_BUILD_TYPE}
+                                 --target install
+                                 --parallel
+        )
+
+        add_dependencies(lcms2::lcms2 lcms2_install)
+        if(OCIO_VERBOSE)
+            message(STATUS "Installing lcms2: ${lcms2_LIBRARY} (version \"${lcms2_VERSION}\")")
+        endif()
+    endif()
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(_lcms2_TARGET_CREATE)
+    set_target_properties(lcms2::lcms2 PROPERTIES
+        IMPORTED_LOCATION ${lcms2_LIBRARY}
+        INTERFACE_INCLUDE_DIRECTORIES ${lcms2_INCLUDE_DIR}
+    )
+
+    mark_as_advanced(lcms2_INCLUDE_DIR lcms2_LIBRARY lcms2_VERSION)
+endif()
\ No newline at end of file
diff --git a/share/cmake/modules/install/Installminizip-ng.cmake b/share/cmake/modules/install/Installminizip-ng.cmake
new file mode 100644
index 0000000000..6cbc1e39bd
--- /dev/null
+++ b/share/cmake/modules/install/Installminizip-ng.cmake
@@ -0,0 +1,154 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Install minizip-ng
+#
+# Variables defined by this module:
+#   minizip-ng_FOUND          - Indicate whether the library was found or not
+#   minizip-ng_LIBRARY        - Path to the library file
+#   minizip-ng_INCLUDE_DIR    - Location of the header files
+#   minizip-ng_VERSION        - Library's version
+#
+# Global targets defined by this module:
+#   minizip-ng::minizip-ng
+#
+# Link against the following target:
+#   ZLIB::ZLIB
+#
+
+
+###############################################################################
+### Create target
+if(NOT TARGET MINIZIP::minizip-ng)
+    add_library(MINIZIP::minizip-ng UNKNOWN IMPORTED GLOBAL)
+    set(_minizip-ng_TARGET_CREATE TRUE)
+endif()
+
+###############################################################################
+### Install package from source ###
+
+if(NOT minizip-ng_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+    include(ExternalProject)
+    include(GNUInstallDirs)
+
+    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
+    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
+
+    # Set find_package standard args
+    set(minizip-ng_FOUND TRUE)
+    if(OCIO_minizip-ng_RECOMMENDED_VERSION)
+        set(minizip-ng_VERSION ${OCIO_minizip-ng_RECOMMENDED_VERSION})
+    else()
+        set(minizip-ng_VERSION ${minizip-ng_FIND_VERSION})
+    endif()
+
+    set(minizip-ng_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}/minizip-ng")
+
+    # Minizip-ng use a hardcoded lib prefix instead of CMAKE_STATIC_LIBRARY_PREFIX
+    set(_minizip-ng_LIB_PREFIX "lib")
+
+    set(minizip-ng_LIBRARY
+        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${_minizip-ng_LIB_PREFIX}minizip-ng${_minizip-ng_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+
+    if(_minizip-ng_TARGET_CREATE)
+        set(minizip-ng_CMAKE_ARGS
+            ${minizip-ng_CMAKE_ARGS}
+            -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
+            -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
+            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
+            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
+            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
+            -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
+            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
+            # Since the other modules create a subfolder for the includes by default and since
+            # minizip-ng does not, a suffix is added to CMAKE_INSTALL_INCLUDEDIR in order to
+            # install the headers under a subdirectory named "minizip-ng".
+            # Note that this does not affect external builds for minizip-ng.
+            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}/minizip-ng
+            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
+            -DBUILD_SHARED_LIBS=OFF
+            -DMZ_OPENSSL=OFF
+            -DMZ_LIBBSD=OFF
+            -DMZ_BUILD_TESTS=OFF
+            -DMZ_COMPAT=OFF
+            -DMZ_BZIP2=OFF
+            -DMZ_LZMA=OFF
+            -DMZ_LIBCOMP=OFF
+            -DMZ_ZSTD=OFF
+            -DMZ_PKCRYPT=OFF
+            -DMZ_WZAES=OFF
+            -DMZ_SIGNING=OFF
+            -DMZ_ZLIB=ON
+            -DMZ_ICONV=OFF
+            -DMZ_FETCH_LIBS=OFF
+            -DMZ_FORCE_FETCH_LIBS=OFF
+            -DZLIB_LIBRARY=${ZLIB_LIBRARIES}
+            -DZLIB_INCLUDE_DIR=${ZLIB_INCLUDE_DIRS}
+        )
+
+        if(CMAKE_TOOLCHAIN_FILE)
+            set(minizip-ng_CMAKE_ARGS
+                ${minizip-ng_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
+        endif()
+
+        if(APPLE)
+            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
+
+            set(minizip-ng_CMAKE_ARGS
+                ${minizip-ng_CMAKE_ARGS}
+                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
+                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
+            )
+        endif()
+
+        if (ANDROID)
+            set(minizip-ng_CMAKE_ARGS
+                ${minizip-ng_CMAKE_ARGS}
+                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
+                -DANDROID_ABI=${ANDROID_ABI}
+                -DANDROID_STL=${ANDROID_STL})
+        endif()
+    endif()
+
+    # Hack to let imported target be built from ExternalProject_Add
+    file(MAKE_DIRECTORY ${minizip-ng_INCLUDE_DIR})
+
+    ExternalProject_Add(minizip-ng_install
+        GIT_REPOSITORY "https://github.com/zlib-ng/minizip-ng.git"
+        GIT_TAG "${minizip-ng_VERSION}"
+        GIT_CONFIG advice.detachedHead=false
+        GIT_SHALLOW TRUE
+        PREFIX "${_EXT_BUILD_ROOT}/libminizip-ng"
+        BUILD_BYPRODUCTS ${minizip-ng_LIBRARY}
+        CMAKE_ARGS ${minizip-ng_CMAKE_ARGS}
+        EXCLUDE_FROM_ALL TRUE
+        BUILD_COMMAND ""
+        INSTALL_COMMAND
+            ${CMAKE_COMMAND} --build .
+                            --config ${CMAKE_BUILD_TYPE}
+                            --target install
+                            --parallel
+    )
+
+    add_dependencies(MINIZIP::minizip-ng minizip-ng_install)
+    if(OCIO_VERBOSE)
+        message(STATUS "Installing minizip-ng: ${minizip-ng_LIBRARY} (version \"${minizip-ng_VERSION}\")")
+    endif()
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(_minizip-ng_TARGET_CREATE)
+    set_target_properties(MINIZIP::minizip-ng PROPERTIES
+        IMPORTED_LOCATION "${minizip-ng_LIBRARY}"
+        INTERFACE_INCLUDE_DIRECTORIES "${minizip-ng_INCLUDE_DIR}"
+    )
+
+    mark_as_advanced(minizip-ng_INCLUDE_DIR minizip-ng_LIBRARY minizip-ng_VERSION)
+
+    target_link_libraries(MINIZIP::minizip-ng INTERFACE ZLIB::ZLIB)
+endif()
\ No newline at end of file
diff --git a/share/cmake/modules/install/Installopenfx.cmake b/share/cmake/modules/install/Installopenfx.cmake
new file mode 100644
index 0000000000..36457f346f
--- /dev/null
+++ b/share/cmake/modules/install/Installopenfx.cmake
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Install openfx
+#
+# Variables defined by this module:
+#   openfx_FOUND          - Indicate whether the library was found or not
+#   openfx_INCLUDE_DIR    - Location of the header files
+#   openfx_VERSION        - Library's version
+#
+# Global targets defined by this module:
+#   openfx::module         
+#
+
+
+###############################################################################
+### Create target
+
+if(NOT TARGET openfx::module)
+    add_library(openfx::module INTERFACE IMPORTED GLOBAL)
+    set(_openfx_TARGET_CREATE TRUE)
+endif()
+
+###############################################################################
+### Install package from source ###
+
+if(NOT openfx_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+    include(ExternalProject)
+    include(GNUInstallDirs)
+
+    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
+    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
+    set(_openfx_INSTALL_DIR "${_EXT_BUILD_ROOT}/openfx/src/openfx_install")
+
+    # Set find_package standard args
+    set(openfx_FOUND TRUE)
+    if(OCIO_openfx_RECOMMENDED_VERSION)
+        set(openfx_VERSION ${OCIO_openfx_RECOMMENDED_VERSION})
+    else()
+        set(openfx_VERSION ${openfx_FIND_VERSION})
+    endif()
+    set(openfx_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}/openfx")
+
+    if(_openfx_TARGET_CREATE)
+        # Hack to let imported target be built from ExternalProject_Add
+        file(MAKE_DIRECTORY ${openfx_INCLUDE_DIR})
+
+        include(VersionUtils)
+        split_version_string(${openfx_VERSION} openfx)
+
+        ExternalProject_Add(openfx_install
+            GIT_REPOSITORY "https://github.com/ofxa/openfx.git"
+            # The latest version from 2015 is OFX_Release_1_4_TAG. 
+            # Need to be careful since older version might have the patch number in the tag.
+            GIT_TAG "OFX_Release_${openfx_VERSION_MAJOR}_${openfx_VERSION_MINOR}_TAG"
+            GIT_CONFIG advice.detachedHead=false
+            GIT_SHALLOW TRUE
+            PREFIX "${_EXT_BUILD_ROOT}/openfx"
+            BUILD_BYPRODUCTS ${openfx_INCLUDE_DIR}
+            CONFIGURE_COMMAND ""
+            BUILD_COMMAND
+                ${CMAKE_COMMAND} -E copy_directory
+                "${_EXT_BUILD_ROOT}/openfx/src/openfx_install/include"
+                "${openfx_INCLUDE_DIR}"
+            INSTALL_COMMAND ""
+            CMAKE_ARGS ${openfx_CMAKE_ARGS}
+            EXCLUDE_FROM_ALL TRUE
+        )
+
+        add_dependencies(openfx::module openfx_install)
+        if(OCIO_VERBOSE)
+            message(STATUS "Installing openfx: ${openfx_INCLUDE_DIR} (version \"${openfx_VERSION}\")")
+        endif()
+    endif()
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(_openfx_TARGET_CREATE)
+    set_target_properties(openfx::module PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES ${openfx_INCLUDE_DIR}
+    )
+
+    mark_as_advanced(openfx_INCLUDE_DIR)
+endif()
diff --git a/share/cmake/modules/install/Installpybind11.cmake b/share/cmake/modules/install/Installpybind11.cmake
new file mode 100644
index 0000000000..9103c215cc
--- /dev/null
+++ b/share/cmake/modules/install/Installpybind11.cmake
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Install pybind11
+#
+# Variables defined by this module:
+#   pybind11_FOUND          - Indicate whether the library was found or not
+#   pybind11_INCLUDE_DIR    - Location of the header files
+#   pybind11_VERSION        - Library's version
+#
+# Global targets defined by this module:
+#   pybind11::module         
+#
+
+
+###############################################################################
+### Create target ###
+
+if(NOT TARGET pybind11::module)
+    add_library(pybind11::module INTERFACE IMPORTED GLOBAL)
+    set(_pybind11_TARGET_CREATE TRUE)
+endif()
+
+###############################################################################
+### Install package from source ###
+
+if(NOT pybind11_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+    include(ExternalProject)
+    include(GNUInstallDirs)
+
+    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
+    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
+
+    # Set find_package standard args
+    set(pybind11_FOUND TRUE)
+    if(OCIO_pybind11_RECOMMENDED_VERSION)
+        set(pybind11_VERSION ${OCIO_pybind11_RECOMMENDED_VERSION})
+    else()
+        set(pybind11_VERSION ${pybind11_FIND_VERSION})
+    endif()
+    set(pybind11_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}")
+
+    if(_pybind11_TARGET_CREATE)
+        # Hack to let imported target be built from ExternalProject_Add
+        file(MAKE_DIRECTORY ${pybind11_INCLUDE_DIR})
+
+        set(pybind11_CMAKE_ARGS
+            ${pybind11_CMAKE_ARGS}
+            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
+            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
+            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
+            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
+            -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
+            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
+            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
+            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
+            # Using FindPython mode (PYBIND11_FINDPYTHON=ON) doesn't seem to
+            # work when building on docker manylinux images where Development
+            # component is not available but is hardcoded in pybind11 script.
+            -DPYTHON_EXECUTABLE=${Python_EXECUTABLE}
+            -DPYBIND11_INSTALL=ON
+            -DPYBIND11_TEST=OFF
+        )
+
+        if(CMAKE_TOOLCHAIN_FILE)
+            set(pybind11_CMAKE_ARGS
+                ${pybind11_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
+        endif()
+
+        if(APPLE)
+            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
+
+            set(pybind11_CMAKE_ARGS
+                ${pybind11_CMAKE_ARGS}
+                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
+                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
+            )
+        endif()
+
+        if (ANDROID)
+            set(pybind11_CMAKE_ARGS
+                ${pybind11_CMAKE_ARGS}
+                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
+                -DANDROID_ABI=${ANDROID_ABI}
+                -DANDROID_STL=${ANDROID_STL})
+        endif()
+
+        ExternalProject_Add(pybind11_install
+            GIT_REPOSITORY "https://github.com/pybind/pybind11.git"
+            GIT_TAG "v${pybind11_VERSION}"
+            GIT_CONFIG advice.detachedHead=false
+            GIT_SHALLOW TRUE
+            PREFIX "${_EXT_BUILD_ROOT}/pybind11"
+            BUILD_BYPRODUCTS ${pybind11_INCLUDE_DIR}
+            CMAKE_ARGS ${pybind11_CMAKE_ARGS}
+            EXCLUDE_FROM_ALL TRUE
+            BUILD_COMMAND ""
+            INSTALL_COMMAND
+                ${CMAKE_COMMAND} --build .
+                                 --config ${CMAKE_BUILD_TYPE}
+                                 --target install
+                                 --parallel
+        )
+
+        add_dependencies(pybind11::module pybind11_install)
+        if(OCIO_VERBOSE)
+            message(STATUS "Installing pybind11: ${pybind11_INCLUDE_DIR} (version \"${pybind11_VERSION}\")")
+        endif()
+    endif()
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(_pybind11_TARGET_CREATE)
+    set_target_properties(pybind11::module PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES ${pybind11_INCLUDE_DIR}
+    )
+
+    mark_as_advanced(pybind11_INCLUDE_DIR pybind11_VERSION)
+endif()
\ No newline at end of file
diff --git a/share/cmake/modules/install/Installpystring.cmake b/share/cmake/modules/install/Installpystring.cmake
new file mode 100644
index 0000000000..f60f490cde
--- /dev/null
+++ b/share/cmake/modules/install/Installpystring.cmake
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Install pystring
+#
+# Variables defined by this module:
+#   pystring_FOUND          - Indicate whether the library was found or not
+#   pystring_LIBRARY        - Path to the library file
+#   pystring_INCLUDE_DIR    - Location of the header files
+#   pystring_VERSION        - Library's version
+#
+# Global targets defined by this module:
+#   pystring::pystring         
+#
+
+
+###############################################################################
+### Configure target ###
+
+if(NOT TARGET pystring::pystring)
+    add_library(pystring::pystring UNKNOWN IMPORTED GLOBAL)
+    set(_pystring_TARGET_CREATE TRUE)
+endif()
+
+###############################################################################
+### Install package from source ###
+
+if(NOT pystring_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+    include(ExternalProject)
+
+    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
+    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
+
+    # Set find_package standard args
+    set(pystring_FOUND TRUE)
+    if(OCIO_pystring_RECOMMENDED_VERSION)
+        set(pystring_VERSION ${OCIO_pystring_RECOMMENDED_VERSION})
+    else()
+        set(pystring_VERSION ${pystring_FIND_VERSION})
+    endif()
+    set(pystring_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}")
+
+    set(pystring_LIBRARY 
+        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}pystring${CMAKE_STATIC_LIBRARY_SUFFIX}")
+
+    if(_pystring_TARGET_CREATE)
+        if(MSVC)
+            set(pystring_CXX_FLAGS "${pystring_CXX_FLAGS} /EHsc")
+        endif()
+
+        string(STRIP "${pystring_CXX_FLAGS}" pystring_CXX_FLAGS)
+
+        set(pystring_CMAKE_ARGS
+            ${pystring_CMAKE_ARGS}
+            -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
+            -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+            -DCMAKE_CXX_FLAGS=${pystring_CXX_FLAGS}
+            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
+            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
+            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
+            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
+            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
+            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
+            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
+        )
+
+        if(CMAKE_TOOLCHAIN_FILE)
+            set(pystring_CMAKE_ARGS
+                ${pystring_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
+        endif()
+
+        if(APPLE)
+            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
+
+            set(pystring_CMAKE_ARGS
+                ${pystring_CMAKE_ARGS}
+                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
+                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
+            )
+        endif()
+
+
+        if (ANDROID)
+            set(pystring_CMAKE_ARGS
+                ${pystring_CMAKE_ARGS}
+                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
+                -DANDROID_ABI=${ANDROID_ABI}
+                -DANDROID_STL=${ANDROID_STL})
+        endif()
+
+        # Hack to let imported target be built from ExternalProject_Add
+        file(MAKE_DIRECTORY ${pystring_INCLUDE_DIR})
+
+        ExternalProject_Add(pystring_install
+            GIT_REPOSITORY "https://github.com/imageworks/pystring.git"
+            GIT_TAG "v${pystring_VERSION}"
+            GIT_CONFIG advice.detachedHead=false
+            GIT_SHALLOW TRUE
+            PREFIX "${_EXT_BUILD_ROOT}/pystring"
+            BUILD_BYPRODUCTS ${pystring_LIBRARY}
+            CMAKE_ARGS ${pystring_CMAKE_ARGS}
+            EXCLUDE_FROM_ALL TRUE
+            PATCH_COMMAND
+                ${CMAKE_COMMAND} -E copy
+                "${CMAKE_SOURCE_DIR}/share/cmake/projects/Buildpystring.cmake"
+                "CMakeLists.txt"
+            BUILD_COMMAND ""
+            INSTALL_COMMAND
+                ${CMAKE_COMMAND} --build .
+                                 --config ${CMAKE_BUILD_TYPE}
+                                 --target install
+                                 --parallel
+        )
+
+        add_dependencies(pystring::pystring pystring_install)
+        if(OCIO_VERBOSE)
+            message(STATUS "Installing pystring: ${pystring_LIBRARY} (version \"${pystring_VERSION}\")")
+        endif()
+    endif()
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(_pystring_TARGET_CREATE)
+    set_target_properties(pystring::pystring PROPERTIES
+        IMPORTED_LOCATION ${pystring_LIBRARY}
+        INTERFACE_INCLUDE_DIRECTORIES ${pystring_INCLUDE_DIR}
+    )
+
+    mark_as_advanced(pystring_INCLUDE_DIR pystring_LIBRARY pystring_VERSION)
+endif()
diff --git a/share/cmake/modules/install/Installyaml-cpp.cmake b/share/cmake/modules/install/Installyaml-cpp.cmake
new file mode 100644
index 0000000000..f58ff12b3a
--- /dev/null
+++ b/share/cmake/modules/install/Installyaml-cpp.cmake
@@ -0,0 +1,153 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Install yaml-cpp
+#
+# Variables defined by this module:
+#   yaml-cpp_FOUND          - Indicate whether the library was found or not
+#   yaml-cpp_LIBRARY        - Path to the library file
+#   yaml-cpp_INCLUDE_DIR    - Location of the header files
+#   yaml-cpp_VERSION        - Library's version
+#
+# Global targets defined by this module:
+#   yaml-cpp::yaml-cpp         
+#
+
+###############################################################################
+### Create target (if previous 'find_package' call hasn't) ###
+
+if(NOT TARGET yaml-cpp)
+    add_library(yaml-cpp UNKNOWN IMPORTED GLOBAL)
+    set(_yaml-cpp_TARGET_CREATE TRUE)
+endif()
+
+###############################################################################
+### Install package from source ###
+
+if(NOT yaml-cpp_FOUND AND OCIO_INSTALL_EXT_PACKAGES AND NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL NONE)
+    include(ExternalProject)
+    include(GNUInstallDirs)
+
+    set(_EXT_DIST_ROOT "${CMAKE_BINARY_DIR}/ext/dist")
+    set(_EXT_BUILD_ROOT "${CMAKE_BINARY_DIR}/ext/build")
+
+    # Set find_package standard args
+    set(yaml-cpp_FOUND TRUE)
+    if(OCIO_yaml-cpp_RECOMMENDED_VERSION)
+        set(yaml-cpp_VERSION ${OCIO_yaml-cpp_RECOMMENDED_VERSION})
+    else()
+        set(yaml-cpp_VERSION ${yaml-cpp_FIND_VERSION})
+    endif()
+    set(yaml-cpp_INCLUDE_DIR "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_INCLUDEDIR}")
+
+    # Starting from 0.7.0, this is included on all platforms, we could also
+    # override CMAKE_DEBUG_POSTFIX to bypass it.
+    if(BUILD_TYPE_DEBUG)
+        string(APPEND _yaml-cpp_LIB_SUFFIX "d")
+    endif()
+
+    # Set the expected library name
+    set(yaml-cpp_LIBRARY
+        "${_EXT_DIST_ROOT}/${CMAKE_INSTALL_LIBDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}yaml-cpp${_yaml-cpp_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+
+    if(_yaml-cpp_TARGET_CREATE)
+        if(MSVC)
+            set(yaml-cpp_CXX_FLAGS "${yaml-cpp_CXX_FLAGS} /EHsc")
+        endif()
+
+        if(UNIX)
+            if(USE_CLANG)
+                # Remove some global 'shadow' warnings.
+                set(yaml-cpp_CXX_FLAGS "${yaml-cpp_CXX_FLAGS} -Wno-shadow")
+            endif()
+        endif()
+
+        string(STRIP "${yaml-cpp_CXX_FLAGS}" yaml-cpp_CXX_FLAGS)
+
+        set(yaml-cpp_CMAKE_ARGS
+            ${yaml-cpp_CMAKE_ARGS}
+            -DCMAKE_POLICY_DEFAULT_CMP0063=NEW
+            -DCMAKE_CXX_VISIBILITY_PRESET=${CMAKE_CXX_VISIBILITY_PRESET}
+            -DCMAKE_VISIBILITY_INLINES_HIDDEN=${CMAKE_VISIBILITY_INLINES_HIDDEN}
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+            -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+            -DCMAKE_CXX_FLAGS=${yaml-cpp_CXX_FLAGS}
+            -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}
+            -DCMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE}
+            -DCMAKE_INSTALL_PREFIX=${_EXT_DIST_ROOT}
+            -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR}
+            -DCMAKE_INSTALL_DATADIR=${CMAKE_INSTALL_DATADIR}
+            -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
+            -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR}
+            -DCMAKE_OBJECT_PATH_MAX=${CMAKE_OBJECT_PATH_MAX}
+            -DBUILD_SHARED_LIBS=OFF
+            -DYAML_BUILD_SHARED_LIBS=OFF
+            -DYAML_CPP_BUILD_TESTS=OFF
+            -DYAML_CPP_BUILD_TOOLS=OFF
+            -DYAML_CPP_BUILD_CONTRIB=OFF
+        )
+
+        if(CMAKE_TOOLCHAIN_FILE)
+            set(yaml-cpp_CMAKE_ARGS
+                ${yaml-cpp_CMAKE_ARGS} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
+        endif()
+
+        if(APPLE)
+            string(REPLACE ";" "$<SEMICOLON>" ESCAPED_CMAKE_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
+
+            set(yaml-cpp_CMAKE_ARGS
+                ${yaml-cpp_CMAKE_ARGS}
+                -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}
+                -DCMAKE_OSX_ARCHITECTURES=${ESCAPED_CMAKE_OSX_ARCHITECTURES}
+            )
+        endif()
+
+
+        if (ANDROID)
+            set(yaml-cpp_CMAKE_ARGS
+                ${yaml-cpp_CMAKE_ARGS}
+                -DANDROID_PLATFORM=${ANDROID_PLATFORM}
+                -DANDROID_ABI=${ANDROID_ABI}
+                -DANDROID_STL=${ANDROID_STL})
+        endif()
+
+        set(yaml-cpp_GIT_TAG "yaml-cpp-${yaml-cpp_VERSION}")
+
+        # Hack to let imported target be built from ExternalProject_Add
+        file(MAKE_DIRECTORY ${yaml-cpp_INCLUDE_DIR})
+
+        ExternalProject_Add(yaml-cpp_install
+            GIT_REPOSITORY "https://github.com/jbeder/yaml-cpp.git"
+            GIT_TAG ${yaml-cpp_GIT_TAG}
+            GIT_CONFIG advice.detachedHead=false
+            GIT_SHALLOW TRUE
+            PREFIX "${_EXT_BUILD_ROOT}/yaml-cpp"
+            BUILD_BYPRODUCTS ${yaml-cpp_LIBRARY}
+            CMAKE_ARGS ${yaml-cpp_CMAKE_ARGS}
+            EXCLUDE_FROM_ALL TRUE
+            BUILD_COMMAND ""
+            INSTALL_COMMAND
+                ${CMAKE_COMMAND} --build .
+                                 --config ${CMAKE_BUILD_TYPE}
+                                 --target install
+                                 --parallel
+        )
+
+        add_dependencies(yaml-cpp yaml-cpp_install)
+        if(OCIO_VERBOSE)
+            message(STATUS "Installing yaml-cpp: ${yaml-cpp_LIBRARY} (version \"${yaml-cpp_VERSION}\")")
+        endif()
+    endif()
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(_yaml-cpp_TARGET_CREATE)
+    set_target_properties(yaml-cpp PROPERTIES
+        IMPORTED_LOCATION ${yaml-cpp_LIBRARY}
+        INTERFACE_INCLUDE_DIRECTORIES ${yaml-cpp_INCLUDE_DIR}
+    )
+
+    mark_as_advanced(yaml-cpp_INCLUDE_DIR yaml-cpp_LIBRARY yaml-cpp_VERSION)
+endif()
diff --git a/share/cmake/utils/CheckSupportGL.cmake b/share/cmake/utils/CheckSupportGL.cmake
index 0f2678da11..35a1146981 100644
--- a/share/cmake/utils/CheckSupportGL.cmake
+++ b/share/cmake/utils/CheckSupportGL.cmake
@@ -7,28 +7,30 @@
 
 include(PackageUtils)
 include(SelectLibraryConfigurations)
+include(Colors)
+include(ocio_handle_dependency)
 
 if((OCIO_BUILD_TESTS AND OCIO_BUILD_GPU_TESTS) OR OCIO_BUILD_APPS)
     set(OCIO_GL_ENABLED ON)
     set(OCIO_USE_GLVND OFF)
     set(OCIO_EGL_HEADLESS OFF)
 
-    find_package(OpenGL COMPONENTS OpenGL)
+    ocio_handle_dependency(OpenGL COMPONENTS OpenGL)
     if(NOT OpenGL_OpenGL_FOUND AND NOT OPENGL_GLU_FOUND)
         package_root_message(OpenGL)
         set(OCIO_GL_ENABLED OFF)
     endif()
 
     if(NOT APPLE)
-        # On some Linux platform, the glew-config.cmake is found first so make it explicit
-        # to fall back on the regular search if not found.
-        find_package(GLEW CONFIG QUIET)
+        # On some Linux platform, the glew-config.cmake is found first.
+        # PREFER_CONFIG assure that config mode will be used first and fallback to module mode.
+        ocio_handle_dependency(  GLEW PREFER_CONFIG
+                                 MIN_VERSION 2.1.0
+                                 RECOMMENDED_VERSION 2.1.0
+                                 RECOMMENDED_VERSION_REASON "Latest version tested with OCIO")
         if(NOT GLEW_FOUND)
-            find_package(GLEW)
-            if(NOT GLEW_FOUND)
-                package_root_message(GLEW)
-                set(OCIO_GL_ENABLED OFF)
-            endif()
+            package_root_message(GLEW)
+            set(OCIO_GL_ENABLED OFF)
         else()
             # Expected variables GLEW_LIBRARIES and GLEW_INCLUDE_DIRS are missing so create
             # the mandatory one.
@@ -67,7 +69,10 @@ if((OCIO_BUILD_TESTS AND OCIO_BUILD_GPU_TESTS) OR OCIO_BUILD_APPS)
         endif()
     endif()
 
-    find_package(GLUT)
+    ocio_handle_dependency(  GLUT
+                             MIN_VERSION 3.2.1
+                             RECOMMENDED_VERSION 3.2.1
+                             RECOMMENDED_VERSION_REASON "Latest version tested with OCIO")
     if(NOT GLUT_FOUND)
         package_root_message(GLUT)
         set(OCIO_GL_ENABLED OFF)
@@ -90,7 +95,7 @@ if((OCIO_BUILD_TESTS AND OCIO_BUILD_GPU_TESTS) OR OCIO_BUILD_APPS)
                     message(STATUS "Can't find EGL without GLVND support; can't render headlessly")
                     set(OCIO_USE_HEADLESS OFF)
                 else()
-                    find_package(OpenGL COMPONENTS EGL)
+                    ocio_handle_dependency(OpenGL COMPONENTS EGL)
                     if(NOT OpenGL_EGL_FOUND)
                         message(WARNING "EGL component missing; can't render headlessly")
                         set(OCIO_USE_HEADLESS OFF)
diff --git a/share/cmake/utils/Colors.cmake b/share/cmake/utils/Colors.cmake
new file mode 100644
index 0000000000..888ac6e5b9
--- /dev/null
+++ b/share/cmake/utils/Colors.cmake
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+string (ASCII 27 ColorEsc)
+
+set(ColorReset       "${ColorEsc}[m")
+
+# Error
+set(ColorError       "${ColorEsc}[31m")
+# Green
+set(ColorSuccess     "${ColorEsc}[32m")
+# Yellow
+set(ColorWarning     "${ColorEsc}[33m")
+set(ColorInfo        "${ColorEsc}[34m")
+
+set(ColorBoldWhite   "${ColorEsc}[1;37m")
\ No newline at end of file
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index 41d96838f7..b11722f847 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -93,6 +93,8 @@ set_unless_defined(CMAKE_VISIBILITY_INLINES_HIDDEN YES)
 ###############################################################################
 # Define if SSE2 can be used.
 
+message(STATUS "")
+message(STATUS "Checking for SSE2 support...")
 include(CheckSupportSSE2)
 
 if(NOT HAVE_SSE2)
diff --git a/src/cmake/Config.cmake.in b/src/cmake/Config.cmake.in
index 6a4932a836..a3b03cc83d 100644
--- a/src/cmake/Config.cmake.in
+++ b/src/cmake/Config.cmake.in
@@ -54,61 +54,12 @@ if (NOT @BUILD_SHARED_LIBS@) # NOT @BUILD_SHARED_LIBS@
         # ZLIB_VERSION is available starting CMake 3.26+.
         # ZLIB_VERSION_STRING is still available for backward compatibility.
         # See https://cmake.org/cmake/help/git-stage/module/FindZLIB.html
-
-        # ZLIB_USE_STATIC_LIBS is supported only from CMake 3.24+.
-        if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24.0") 
-            if (ZLIB_STATIC_LIBRARY)
-                set(ZLIB_USE_STATIC_LIBS "${ZLIB_STATIC_LIBRARY}")
-            endif()
-        else() # For CMake < 3.24 since ZLIB_USE_STATIC_LIBS is not available.
-            if(NOT ZLIB_LIBRARY)
-                if(DEFINED CMAKE_FIND_LIBRARY_PREFIXES)
-                    set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES "${CMAKE_FIND_LIBRARY_PREFIXES}")
-                else()
-                    set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES)
-                endif()
-
-                if(DEFINED CMAKE_FIND_LIBRARY_SUFFIXES)
-                    set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_FIND_LIBRARY_SUFFIXES}")
-                else()
-                    set(_ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES)
-                endif()
-
-                # Prefix/suffix for windows.
-                if(WIN32)
-                    list(APPEND CMAKE_FIND_LIBRARY_PREFIXES "" "lib")
-                    list(APPEND CMAKE_FIND_LIBRARY_SUFFIXES ".dll.a")
-                endif()
-
-                # Check if static lib is preferred.
-                if(ZLIB_STATIC_LIBRARY OR ZLIB_USE_STATIC_LIBS)
-                    if(WIN32)
-                        set(CMAKE_FIND_LIBRARY_SUFFIXES .lib .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
-                    else()
-                        set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
-                    endif()
-                endif()
-            endif()
-        endif()
-
+        
         if (@ZLIB_VERSION@) # @ZLIB_VERSION@
             find_dependency(ZLIB @ZLIB_VERSION@)
         else()
             find_dependency(ZLIB @ZLIB_VERSION_STRING@)
         endif()
-
-        # Restore the original find library ordering
-        if(DEFINED _ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES)
-            set(CMAKE_FIND_LIBRARY_SUFFIXES "${_ZLIB_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}")
-        else()
-            set(CMAKE_FIND_LIBRARY_SUFFIXES)
-        endif()
-
-        if(DEFINED _ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES)
-            set(CMAKE_FIND_LIBRARY_PREFIXES "${_ZLIB_ORIG_CMAKE_FIND_LIBRARY_PREFIXES}")
-        else()
-            set(CMAKE_FIND_LIBRARY_PREFIXES)
-        endif()
     endif()
 
     if (NOT MINIZIP::minizip-ng)

From 01b7496abd01cbc7e0a56b4cd57912b0e443e68f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Achard?= <remiachard@gmail.com>
Date: Wed, 22 Mar 2023 23:07:48 +0000
Subject: [PATCH 52/81] Fix python packaging (#1778)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rémi Achard <remiachard@gmail.com>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 MANIFEST.in                                   |  3 ++
 pyproject.toml                                |  2 +-
 setup.py                                      |  9 +++---
 src/bindings/python/CMakeLists.txt            | 32 ++++++++++++-------
 src/bindings/python/{ => package}/__init__.py | 10 +++++-
 tests/python/OpenColorIOTest.py               | 12 +++++++
 tests/python/OpenColorIOTestSuite.py          | 12 ++-----
 tests/python/TransformsTest.py                |  2 +-
 8 files changed, 55 insertions(+), 27 deletions(-)
 rename src/bindings/python/{ => package}/__init__.py (82%)

diff --git a/MANIFEST.in b/MANIFEST.in
index d72188ba8f..cf2223275b 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -6,3 +6,6 @@ graft share
 graft src
 graft tests
 global-include CMakeLists.txt *.cmake *.md
+global-exclude */__pycache__/*
+global-exclude *.pyc
+global-exclude .DS_Store
diff --git a/pyproject.toml b/pyproject.toml
index ff83ac1a0d..e19afc34bf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ build-backend = "setuptools.build_meta"
 [tool.cibuildwheel]
 build-verbosity = "1"
 
-test-command = "python -m PyOpenColorIOTests.OpenColorIOTestSuite"
+test-command = "python -m PyOpenColorIO.tests.OpenColorIOTestSuite"
 test-requires = ["numpy"]
 
 manylinux-x86_64-image = "manylinux2014"
diff --git a/setup.py b/setup.py
index 80f4b29e1f..614752a527 100644
--- a/setup.py
+++ b/setup.py
@@ -155,11 +155,12 @@ def build_extension(self, ext):
 setup(
     version=get_version(),
     package_dir={
-        'PyOpenColorIOTests': 'tests/python',
-        'PyOpenColorIOTests.data': 'tests/data',
+        'PyOpenColorIO': 'src/bindings/python/package',
+        'PyOpenColorIO.tests': 'tests/python',
+        'PyOpenColorIO.data': 'tests/data',
     },
-    packages=['PyOpenColorIOTests', 'PyOpenColorIOTests.data'],
-    ext_modules=[CMakeExtension("PyOpenColorIO")],
+    packages=['PyOpenColorIO', 'PyOpenColorIO.tests', 'PyOpenColorIO.data'],
+    ext_modules=[CMakeExtension("PyOpenColorIO.PyOpenColorIO")],
     cmdclass={"build_ext": CMakeBuild},
     include_package_data=True
 )
diff --git a/src/bindings/python/CMakeLists.txt b/src/bindings/python/CMakeLists.txt
index 8588700a0c..436299cd95 100644
--- a/src/bindings/python/CMakeLists.txt
+++ b/src/bindings/python/CMakeLists.txt
@@ -131,13 +131,6 @@ if(WIN32)
 	)
 endif()
 
-# NOTE: Depending of the compiler version pybind11 2.4.3 does not compile with C++17 so revert to c++11
-
-set(APP_CXX_STANDARD ${CMAKE_CXX_STANDARD})
-if(${CMAKE_CXX_STANDARD} GREATER_EQUAL 17)
-	set(APP_CXX_STANDARD 11)
-endif()
-
 set(CUSTOM_COMPILE_FLAGS ${PLATFORM_COMPILE_OPTIONS})
 set(CUSTOM_LINK_FLAGS ${PLATFORM_LINK_OPTIONS})
 
@@ -159,7 +152,6 @@ set_target_properties(PyOpenColorIO
 	PROPERTIES
 		COMPILE_OPTIONS "${CUSTOM_COMPILE_FLAGS}"
 		LINK_OPTIONS "${CUSTOM_LINK_FLAGS}"
-		CXX_STANDARD ${APP_CXX_STANDARD}
 )
 
 if(NOT BUILD_SHARED_LIBS)
@@ -183,10 +175,10 @@ if (UNIX AND NOT CMAKE_SKIP_RPATH)
     # dynamic library based on the default installation directory structure.
     if (APPLE)
         set_target_properties(PyOpenColorIO PROPERTIES
-            INSTALL_RPATH "@loader_path/../..;${CMAKE_INSTALL_RPATH}")
+            INSTALL_RPATH "@loader_path/../../..;${CMAKE_INSTALL_RPATH}")
     else()
         set_target_properties(PyOpenColorIO PROPERTIES
-            INSTALL_RPATH "$ORIGIN/../..;${CMAKE_INSTALL_RPATH}")
+            INSTALL_RPATH "$ORIGIN/../../..;${CMAKE_INSTALL_RPATH}")
     endif()
 endif()
 
@@ -227,6 +219,24 @@ target_compile_definitions(PyOpenColorIO
 		PY_VERSION_PATCH=${Python_VERSION_PATCH}
 )
 
+###############################################################################
+# Build layout
+# Mirrors the installation, using PyOpenColorIO folder and __init__.py file.
+# When building the Python wheel, do not override the target directory.
+set(_PyOpenColorIO_BUILD_PACKAGE_DIR  "${CMAKE_CURRENT_BINARY_DIR}/PyOpenColorIO")
+if(NOT CMAKE_LIBRARY_OUTPUT_DIRECTORY)
+	set_target_properties(PyOpenColorIO PROPERTIES
+		LIBRARY_OUTPUT_DIRECTORY "${_PyOpenColorIO_BUILD_PACKAGE_DIR}"
+		# For Windows compatibility
+		LIBRARY_OUTPUT_DIRECTORY_DEBUG "${_PyOpenColorIO_BUILD_PACKAGE_DIR}"
+		LIBRARY_OUTPUT_DIRECTORY_RELEASE "${_PyOpenColorIO_BUILD_PACKAGE_DIR}"
+	)
+endif()
+
+file(COPY package/__init__.py DESTINATION "${_PyOpenColorIO_BUILD_PACKAGE_DIR}")
+
+###############################################################################
+# Install layout
 # Set to site-package location.
 if(WIN32)
     set(_Python_VARIANT_PATH "${CMAKE_INSTALL_LIBDIR}/site-packages")
@@ -245,4 +255,4 @@ install(TARGETS PyOpenColorIO
     LIBRARY DESTINATION ${_PyOpenColorIO_SITE_PACKAGE_DIR}
 )
 
-install(FILES __init__.py DESTINATION ${_PyOpenColorIO_SITE_PACKAGE_DIR})
\ No newline at end of file
+install(FILES package/__init__.py DESTINATION ${_PyOpenColorIO_SITE_PACKAGE_DIR})
\ No newline at end of file
diff --git a/src/bindings/python/__init__.py b/src/bindings/python/package/__init__.py
similarity index 82%
rename from src/bindings/python/__init__.py
rename to src/bindings/python/package/__init__.py
index 3ce17e051e..3ac093f5c3 100644
--- a/src/bindings/python/__init__.py
+++ b/src/bindings/python/package/__init__.py
@@ -21,4 +21,12 @@
         if os.path.exists(path) and path != ".":
             os.add_dll_directory(path)
 
-from .PyOpenColorIO import *
\ No newline at end of file
+del os, sys, platform
+
+#
+# Import compiled module.
+#
+
+from .PyOpenColorIO import __author__, __email__, __license__, __copyright__, __version__, __status__, __doc__
+
+from .PyOpenColorIO import *
diff --git a/tests/python/OpenColorIOTest.py b/tests/python/OpenColorIOTest.py
index 0b6af12a24..8a7016290c 100644
--- a/tests/python/OpenColorIOTest.py
+++ b/tests/python/OpenColorIOTest.py
@@ -11,6 +11,18 @@
 
 class OpenColorIOTest(unittest.TestCase):
 
+    def test_attributes(self):
+        """
+        Test Global attributes.
+        """
+        self.assertTrue(hasattr(OCIO, "__author__"))
+        self.assertTrue(hasattr(OCIO, "__email__"))
+        self.assertTrue(hasattr(OCIO, "__license__"))
+        self.assertTrue(hasattr(OCIO, "__copyright__"))
+        self.assertTrue(hasattr(OCIO, "__version__"))
+        self.assertTrue(hasattr(OCIO, "__status__"))
+        self.assertTrue(hasattr(OCIO, "__doc__"))
+
     def test_env_variable(self):
         """
         Test Get/SetEnvVariable().
diff --git a/tests/python/OpenColorIOTestSuite.py b/tests/python/OpenColorIOTestSuite.py
index a9aaa5c930..995706f0a4 100755
--- a/tests/python/OpenColorIOTestSuite.py
+++ b/tests/python/OpenColorIOTestSuite.py
@@ -24,14 +24,8 @@
         # Note: Only when compiling within Microsoft Visual Studio editor i.e. not on command line.
         if len(sys.argv) == 3:
             opencolorio_dir = os.path.join(opencolorio_dir, sys.argv[2])
-            pyopencolorio_dir = os.path.join(pyopencolorio_dir, sys.argv[2])
-
-        # Python 3.8+ does no longer look for DLLs in PATH environment variable
-        if hasattr(os, 'add_dll_directory'):
-            os.add_dll_directory(opencolorio_dir)
-        else:
-            os.environ['PATH'] = '{0};{1}'.format(
-                opencolorio_dir, os.getenv('PATH', ''))
+        # PyOpenColorIO __init__.py file handle os.add_dll_directory()
+        os.environ['PATH'] = '{0};{1}'.format(opencolorio_dir, os.getenv('PATH', ''))
     elif sys.platform == 'darwin':
         # On OSX we must add the main library location to DYLD_LIBRARY_PATH
         os.environ['DYLD_LIBRARY_PATH'] = '{0}:{1}'.format(
@@ -41,7 +35,7 @@
 # Else it probably means direct invocation from installed package
 else:
     here = os.path.dirname(__file__)
-    os.environ["TEST_DATAFILES_DIR"] = os.path.join(here, 'data', 'files')
+    os.environ["TEST_DATAFILES_DIR"] = os.path.join(os.path.dirname(here), 'data', 'files')
     sys.path.insert(0, here)
 
 import PyOpenColorIO as OCIO
diff --git a/tests/python/TransformsTest.py b/tests/python/TransformsTest.py
index d4b1b35962..9354eec6e1 100644
--- a/tests/python/TransformsTest.py
+++ b/tests/python/TransformsTest.py
@@ -34,7 +34,7 @@ def all_transforms_as_group(self):
                     # Ensure we only catch and filter for this specific error
                     self.assertEqual(
                         str(e),
-                        'PyOpenColorIO.Transform: No constructor defined!',
+                        'PyOpenColorIO.PyOpenColorIO.Transform: No constructor defined!',
                         'Unintended Error Raised: {0}'.format(e)
                     )
 

From db33ae0c9b093cb92a9f93586b0d5fe3d95c5864 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Achard?= <remiachard@gmail.com>
Date: Thu, 23 Mar 2023 03:52:03 +0000
Subject: [PATCH 53/81] Fix GradingPrimary bypass and add support Python
 comparison for Grading data objects (#1779)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix GradingPrimary bypass, support comparison in Python

Signed-off-by: Rémi Achard <remiachard@gmail.com>

* Improve test description

Signed-off-by: Rémi Achard <remiachard@gmail.com>

---------

Signed-off-by: Rémi Achard <remiachard@gmail.com>
Co-authored-by: Doug Walker <doug.walker@autodesk.com>
Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 include/OpenColorIO/OpenColorTransforms.h     | 14 ++++
 .../ops/gradingprimary/GradingPrimary.cpp     |  7 +-
 .../ops/gradingprimary/GradingPrimary.h       |  5 --
 .../ops/gradingrgbcurve/GradingBSplineCurve.h |  6 --
 .../ops/gradingrgbcurve/GradingRGBCurve.h     |  2 -
 src/OpenColorIO/ops/gradingtone/GradingTone.h |  5 --
 src/bindings/python/PyGradingData.cpp         | 64 +++++++++++++++++++
 .../gradingprimary/GradingPrimary_tests.cpp   | 40 ++++++++++++
 tests/python/GradingDataTest.py               | 50 ++++++++++++++-
 tests/python/GradingPrimaryTransformTest.py   |  8 +--
 10 files changed, 175 insertions(+), 26 deletions(-)

diff --git a/include/OpenColorIO/OpenColorTransforms.h b/include/OpenColorIO/OpenColorTransforms.h
index 9d70664c52..d4587149ec 100644
--- a/include/OpenColorIO/OpenColorTransforms.h
+++ b/include/OpenColorIO/OpenColorTransforms.h
@@ -453,6 +453,8 @@ struct OCIOEXPORT GradingRGBM
     double m_master{ 0. };
 };
 
+extern OCIOEXPORT bool operator==(const GradingRGBM & lhs, const GradingRGBM & rhs);
+extern OCIOEXPORT bool operator!=(const GradingRGBM & lhs, const GradingRGBM & rhs);
 extern OCIOEXPORT std::ostream & operator<<(std::ostream &, const GradingRGBM &);
 
 /// Grading primary values.
@@ -488,6 +490,8 @@ struct OCIOEXPORT GradingPrimary
     static double NoClampWhite();
 };
 
+extern OCIOEXPORT bool operator==(const GradingPrimary & lhs, const GradingPrimary & rhs);
+extern OCIOEXPORT bool operator!=(const GradingPrimary & lhs, const GradingPrimary & rhs);
 extern OCIOEXPORT std::ostream & operator<<(std::ostream &, const GradingPrimary &);
 
 /// 2D control point used by \ref GradingBSplineCurve.
@@ -499,6 +503,8 @@ struct OCIOEXPORT GradingControlPoint
     float m_y{ 0.f };
 };
 
+extern OCIOEXPORT bool operator==(const GradingControlPoint & lhs, const GradingControlPoint & rhs);
+extern OCIOEXPORT bool operator!=(const GradingControlPoint & lhs, const GradingControlPoint & rhs);
 extern OCIOEXPORT std::ostream & operator<<(std::ostream &, const GradingControlPoint &);
 
 /// A BSpline curve defined with \ref GradingControlPoint.
@@ -530,6 +536,8 @@ class OCIOEXPORT GradingBSplineCurve
     GradingBSplineCurve() = default;
 };
 
+extern OCIOEXPORT bool operator==(const GradingBSplineCurve & lhs, const GradingBSplineCurve & rhs);
+extern OCIOEXPORT bool operator!=(const GradingBSplineCurve & lhs, const GradingBSplineCurve & rhs);
 extern OCIOEXPORT std::ostream & operator<<(std::ostream &, const GradingBSplineCurve &);
 
 /**
@@ -559,6 +567,8 @@ class OCIOEXPORT GradingRGBCurve
     GradingRGBCurve() = default;
 };
 
+extern OCIOEXPORT bool operator==(const GradingRGBCurve & lhs, const GradingRGBCurve & rhs);
+extern OCIOEXPORT bool operator!=(const GradingRGBCurve & lhs, const GradingRGBCurve & rhs);
 extern OCIOEXPORT std::ostream & operator<<(std::ostream &, const GradingRGBCurve &);
 
 /**
@@ -602,6 +612,8 @@ struct OCIOEXPORT GradingRGBMSW
     double m_width { 1. }; // Or pivot for shadows and highlights.
 };
 
+extern OCIOEXPORT bool operator==(const GradingRGBMSW & lhs, const GradingRGBMSW & rhs);
+extern OCIOEXPORT bool operator!=(const GradingRGBMSW & lhs, const GradingRGBMSW & rhs);
 extern OCIOEXPORT std::ostream & operator<<(std::ostream &, const GradingRGBMSW &);
 
 /// Grading tone values.
@@ -641,6 +653,8 @@ struct OCIOEXPORT GradingTone
     double m_scontrast{ 1.0 };
 };
 
+extern OCIOEXPORT bool operator==(const GradingTone & lhs, const GradingTone & rhs);
+extern OCIOEXPORT bool operator!=(const GradingTone & lhs, const GradingTone & rhs);
 extern OCIOEXPORT std::ostream & operator<<(std::ostream &, const GradingTone &);
 
 /**
diff --git a/src/OpenColorIO/ops/gradingprimary/GradingPrimary.cpp b/src/OpenColorIO/ops/gradingprimary/GradingPrimary.cpp
index 8f70916b22..788640b7f7 100644
--- a/src/OpenColorIO/ops/gradingprimary/GradingPrimary.cpp
+++ b/src/OpenColorIO/ops/gradingprimary/GradingPrimary.cpp
@@ -105,7 +105,8 @@ void GradingPrimaryPreRender::update(GradingStyle style,
                                      TransformDirection dir,
                                      const GradingPrimary & v) noexcept
 {
-    m_localBypass = v.m_clampBlack == GradingPrimary::NoClampBlack() &&
+    m_localBypass = v.m_saturation == 1. &&
+                    v.m_clampBlack == GradingPrimary::NoClampBlack() &&
                     v.m_clampWhite == GradingPrimary::NoClampWhite();
 
     switch (style)
@@ -189,7 +190,7 @@ void GradingPrimaryPreRender::update(GradingStyle style,
             break;
         }
         }
-        m_isPowerIdentity = m_contrast[0] == 1.0f || m_contrast[1] == 1.0f ||
+        m_isPowerIdentity = m_contrast[0] == 1.0f && m_contrast[1] == 1.0f &&
                             m_contrast[2] == 1.0f;
         m_pivot = 0.18 * std::pow(2., v.m_pivot);
         m_localBypass = m_localBypass && m_isPowerIdentity &&
@@ -249,7 +250,7 @@ void GradingPrimaryPreRender::update(GradingStyle style,
             break;
         }
         }
-        m_isPowerIdentity = m_gamma[0] == 1.0f || m_gamma[1] == 1.0f || m_gamma[2] == 1.0f;
+        m_isPowerIdentity = m_gamma[0] == 1.0f && m_gamma[1] == 1.0f && m_gamma[2] == 1.0f;
         m_localBypass = m_localBypass && m_isPowerIdentity &&
                         m_slope[0] == 1.f && m_slope[1] == 1.f && m_slope[2] == 1.f &&
                         m_offset[0] == 0.f && m_offset[1] == 0.f && m_offset[2] == 0.f;
diff --git a/src/OpenColorIO/ops/gradingprimary/GradingPrimary.h b/src/OpenColorIO/ops/gradingprimary/GradingPrimary.h
index 9a47be5a53..405b84d1da 100644
--- a/src/OpenColorIO/ops/gradingprimary/GradingPrimary.h
+++ b/src/OpenColorIO/ops/gradingprimary/GradingPrimary.h
@@ -62,11 +62,6 @@ struct GradingPrimaryPreRender
     bool m_localBypass{ false };
 };
 
-bool operator==(const GradingRGBM & lhs, const GradingRGBM & rhs);
-bool operator!=(const GradingRGBM & lhs, const GradingRGBM & rhs);
-bool operator==(const GradingPrimary & lhs, const GradingPrimary & rhs);
-bool operator!=(const GradingPrimary & lhs, const GradingPrimary & rhs);
-
 } // namespace OCIO_NAMESPACE
 
 #endif
diff --git a/src/OpenColorIO/ops/gradingrgbcurve/GradingBSplineCurve.h b/src/OpenColorIO/ops/gradingrgbcurve/GradingBSplineCurve.h
index 2c4fe4aba7..9944564051 100644
--- a/src/OpenColorIO/ops/gradingrgbcurve/GradingBSplineCurve.h
+++ b/src/OpenColorIO/ops/gradingrgbcurve/GradingBSplineCurve.h
@@ -126,12 +126,6 @@ class GradingBSplineCurveImpl : public GradingBSplineCurve
 
 bool IsGradingCurveIdentity(const ConstGradingBSplineCurveRcPtr & curve);
 
-bool operator==(const GradingControlPoint & lhs, const GradingControlPoint & rhs);
-bool operator!=(const GradingControlPoint & lhs, const GradingControlPoint & rhs);
-bool operator==(const GradingBSplineCurve & lhs, const GradingBSplineCurve & rhs);
-bool operator!=(const GradingBSplineCurve & lhs, const GradingBSplineCurve & rhs);
-
-
 } // namespace OCIO_NAMESPACE
 
 #endif // INCLUDED_OCIO_GRADINGBSPLINECURVE_H
diff --git a/src/OpenColorIO/ops/gradingrgbcurve/GradingRGBCurve.h b/src/OpenColorIO/ops/gradingrgbcurve/GradingRGBCurve.h
index 67b6ea5a12..3a7c76dfee 100644
--- a/src/OpenColorIO/ops/gradingrgbcurve/GradingRGBCurve.h
+++ b/src/OpenColorIO/ops/gradingrgbcurve/GradingRGBCurve.h
@@ -43,8 +43,6 @@ class GradingRGBCurveImpl : public GradingRGBCurve
 typedef OCIO_SHARED_PTR<const GradingRGBCurveImpl> ConstGradingRGBCurveImplRcPtr;
 typedef OCIO_SHARED_PTR<GradingRGBCurveImpl> GradingRGBCurveImplRcPtr;
 
-bool operator==(const GradingRGBCurve & lhs, const GradingRGBCurve & rhs);
-bool operator!=(const GradingRGBCurve & lhs, const GradingRGBCurve & rhs);
 }
 
 #endif //INCLUDED_OCIO_GRADINGRGBCURVE_H
diff --git a/src/OpenColorIO/ops/gradingtone/GradingTone.h b/src/OpenColorIO/ops/gradingtone/GradingTone.h
index 2be77fa9e9..b14f71869b 100644
--- a/src/OpenColorIO/ops/gradingtone/GradingTone.h
+++ b/src/OpenColorIO/ops/gradingtone/GradingTone.h
@@ -12,11 +12,6 @@
 namespace OCIO_NAMESPACE
 {
 
-bool operator==(const GradingRGBMSW & lhs, const GradingRGBMSW & rhs);
-bool operator!=(const GradingRGBMSW & lhs, const GradingRGBMSW & rhs);
-bool operator==(const GradingTone & lhs, const GradingTone & rhs);
-bool operator!=(const GradingTone & lhs, const GradingTone & rhs);
-
 enum RGBMChannel
 {
     R = 0,
diff --git a/src/bindings/python/PyGradingData.cpp b/src/bindings/python/PyGradingData.cpp
index ae68c6e0e1..815da41140 100644
--- a/src/bindings/python/PyGradingData.cpp
+++ b/src/bindings/python/PyGradingData.cpp
@@ -2,6 +2,7 @@
 // Copyright Contributors to the OpenColorIO Project.
 
 #include <sstream>
+
 #include "PyOpenColorIO.h"
 #include "PyUtils.h"
 
@@ -74,6 +75,15 @@ void bindPyGradingData(py::module & m)
              "red"_a, "green"_a, "blue"_a, "master"_a,
              DOC(GradingRGBM, GradingRGBM, 2))
 
+        .def("__eq__", [](const GradingRGBM &self, const GradingRGBM &other)
+            {
+                return self == other;
+            }, py::is_operator())
+        .def("__ne__", [](const GradingRGBM &self, const GradingRGBM &other)
+            {
+                return self != other;
+            }, py::is_operator())
+
         .def_readwrite("red", &GradingRGBM::m_red, 
                        DOC(GradingRGBM, m_red))
         .def_readwrite("green", &GradingRGBM::m_green, 
@@ -89,6 +99,15 @@ void bindPyGradingData(py::module & m)
         .def(py::init<GradingStyle>(), 
              DOC(GradingPrimary, GradingPrimary))
 
+        .def("__eq__", [](const GradingPrimary &self, const GradingPrimary &other)
+            {
+                return self == other;
+            }, py::is_operator())
+        .def("__ne__", [](const GradingPrimary &self, const GradingPrimary &other)
+            {
+                return self != other;
+            }, py::is_operator())
+
         .def("validate", &GradingPrimary::validate, 
              DOC(GradingPrimary, validate))
 
@@ -142,6 +161,15 @@ void bindPyGradingData(py::module & m)
              "start"_a, "width"_a,
              DOC(GradingRGBMSW, GradingRGBMSW, 3))
 
+        .def("__eq__", [](const GradingRGBMSW &self, const GradingRGBMSW &other)
+            {
+                return self == other;
+            }, py::is_operator())
+        .def("__ne__", [](const GradingRGBMSW &self, const GradingRGBMSW &other)
+            {
+                return self != other;
+            }, py::is_operator())
+
         .def_readwrite("red", &GradingRGBMSW::m_red, 
                        DOC(GradingRGBMSW, m_red))
         .def_readwrite("green", &GradingRGBMSW::m_green, 
@@ -164,6 +192,15 @@ void bindPyGradingData(py::module & m)
         .def("validate", &GradingTone::validate, 
              DOC(GradingTone, validate))
 
+        .def("__eq__", [](const GradingTone &self, const GradingTone &other)
+            {
+                return self == other;
+            }, py::is_operator())
+        .def("__ne__", [](const GradingTone &self, const GradingTone &other)
+            {
+                return self != other;
+            }, py::is_operator())
+
         .def_readwrite("blacks", &GradingTone::m_blacks, 
                        DOC(GradingTone, m_blacks))
         .def_readwrite("whites", &GradingTone::m_whites, 
@@ -187,6 +224,15 @@ void bindPyGradingData(py::module & m)
              "y"_a = DEFAULT_CONTROL_POINT.m_y,
              DOC(GradingControlPoint, GradingControlPoint, 2))
 
+        .def("__eq__", [](const GradingControlPoint &self, const GradingControlPoint &other)
+            {
+                return self == other;
+            }, py::is_operator())
+        .def("__ne__", [](const GradingControlPoint &self, const GradingControlPoint &other)
+            {
+                return self != other;
+            }, py::is_operator())
+
         .def_readwrite("x", &GradingControlPoint::m_x, 
                        DOC(GradingControlPoint, m_x))
         .def_readwrite("y", &GradingControlPoint::m_y, 
@@ -224,6 +270,15 @@ void bindPyGradingData(py::module & m)
             }),
              DOC(GradingBSplineCurve, Create, 2))
 
+        .def("__eq__", [](const GradingBSplineCurve &self, const GradingBSplineCurve &other)
+            {
+                return self == other;
+            }, py::is_operator())
+        .def("__ne__", [](const GradingBSplineCurve &self, const GradingBSplineCurve &other)
+            {
+                return self != other;
+            }, py::is_operator())
+
         .def("validate", &GradingBSplineCurve::validate, 
              DOC(GradingBSplineCurve, validate))
         .def("setNumControlPoints", &GradingBSplineCurve::setNumControlPoints, "size"_a, 
@@ -281,6 +336,15 @@ void bindPyGradingData(py::module & m)
                          "master"_a = DEFAULT_RGB_CURVE->getCurve(RGB_MASTER),
                          DOC(GradingRGBCurve, GradingRGBCurve, 2))
 
+        .def("__eq__", [](const GradingRGBCurve &self, const GradingRGBCurve &other)
+            {
+                return self == other;
+            }, py::is_operator())
+        .def("__ne__", [](const GradingRGBCurve &self, const GradingRGBCurve &other)
+            {
+                return self != other;
+            }, py::is_operator())
+
        .def_property("red", 
                      [](const GradingRGBCurveRcPtr & rgbCurve)
             {
diff --git a/tests/cpu/ops/gradingprimary/GradingPrimary_tests.cpp b/tests/cpu/ops/gradingprimary/GradingPrimary_tests.cpp
index a9d0f3e641..9349a46d58 100644
--- a/tests/cpu/ops/gradingprimary/GradingPrimary_tests.cpp
+++ b/tests/cpu/ops/gradingprimary/GradingPrimary_tests.cpp
@@ -88,8 +88,16 @@ OCIO_ADD_TEST(GradingPrimary, precompute)
     OCIO_CHECK_ASSERT(comp.getContrast() == OCIO::Float3({ 1.f, 1.f, 1.f }));
     OCIO_CHECK_ASSERT(comp.getGamma() == OCIO::Float3({ 1.f, 1.f, 1.f }));
     OCIO_CHECK_CLOSE(comp.getPivot(), 0.4f, 1.e-6f);
+    OCIO_CHECK_ASSERT(comp.getLocalBypass());
     OCIO_CHECK_ASSERT(comp.isGammaIdentity());
 
+    gp.m_saturation = 0.5;
+    comp.update(OCIO::GRADING_LOG, OCIO::TRANSFORM_DIR_FORWARD, gp);
+    OCIO_CHECK_ASSERT(!comp.getLocalBypass());
+    gp.m_saturation = 1.;
+    comp.update(OCIO::GRADING_LOG, OCIO::TRANSFORM_DIR_FORWARD, gp);
+    OCIO_CHECK_ASSERT(comp.getLocalBypass());
+
     gp.m_brightness.m_green = 0.1 * 1023. / 6.25;
     comp.update(OCIO::GRADING_LOG, OCIO::TRANSFORM_DIR_FORWARD, gp);
     OCIO_CHECK_ASSERT(comp.getBrightness() == OCIO::Float3({ 0.f, 0.1f, 0.f }));
@@ -113,4 +121,36 @@ OCIO_ADD_TEST(GradingPrimary, precompute)
     OCIO_CHECK_ASSERT(comp.getBrightness() == OCIO::Float3({ -0.1f, 0.f, 0.f }));
     OCIO_CHECK_ASSERT(comp.getContrast() == OCIO::Float3({ 1.f, 0.8f, 1.f }));
     OCIO_CHECK_ASSERT(comp.getGamma() == OCIO::Float3({ 1.f, 1.f, 0.8f }));
+
+    gp = OCIO::GradingPrimary{ OCIO::GRADING_LOG };
+
+    // Test identity checks for GRADING_LOG
+    gp.m_gamma.m_red = 0.8;
+    comp.update(OCIO::GRADING_LOG, OCIO::TRANSFORM_DIR_FORWARD, gp);
+    OCIO_CHECK_ASSERT(!comp.isGammaIdentity());
+    OCIO_CHECK_ASSERT(!comp.getLocalBypass());
+    gp.m_gamma.m_red = 1.0;
+    comp.update(OCIO::GRADING_LOG, OCIO::TRANSFORM_DIR_FORWARD, gp);
+    OCIO_CHECK_ASSERT(comp.isGammaIdentity());
+    OCIO_CHECK_ASSERT(comp.getLocalBypass());
+
+    // Test identity checks for GRADING_LIN
+    gp.m_contrast.m_red = 0.8;
+    comp.update(OCIO::GRADING_LIN, OCIO::TRANSFORM_DIR_FORWARD, gp);
+    OCIO_CHECK_ASSERT(!comp.isContrastIdentity());
+    OCIO_CHECK_ASSERT(!comp.getLocalBypass());
+    gp.m_contrast.m_red = 1.0;
+    comp.update(OCIO::GRADING_LIN, OCIO::TRANSFORM_DIR_FORWARD, gp);
+    OCIO_CHECK_ASSERT(comp.isContrastIdentity());
+    OCIO_CHECK_ASSERT(comp.getLocalBypass());
+
+    // Test identity checks for GRADING_VIDEO
+    gp.m_gamma.m_red = 0.8;
+    comp.update(OCIO::GRADING_VIDEO, OCIO::TRANSFORM_DIR_FORWARD, gp);
+    OCIO_CHECK_ASSERT(!comp.isGammaIdentity());
+    OCIO_CHECK_ASSERT(!comp.getLocalBypass());
+    gp.m_gamma.m_red = 1.0;
+    comp.update(OCIO::GRADING_VIDEO, OCIO::TRANSFORM_DIR_FORWARD, gp);
+    OCIO_CHECK_ASSERT(comp.isGammaIdentity());
+    OCIO_CHECK_ASSERT(comp.getLocalBypass());
 }
diff --git a/tests/python/GradingDataTest.py b/tests/python/GradingDataTest.py
index 7c376425c6..b015e53702 100644
--- a/tests/python/GradingDataTest.py
+++ b/tests/python/GradingDataTest.py
@@ -74,6 +74,13 @@ def test_rgbm(self):
         with self.assertRaises(TypeError):
             OCIO.GradingRGBM(master=.3, red=.4)
 
+        # Check comparison operators
+        rgbm1 = OCIO.GradingRGBM()
+        rgbm2 = OCIO.GradingRGBM()
+        self.assertEqual(rgbm1, rgbm2)
+        rgbm1.red = 2
+        self.assertNotEqual(rgbm1, rgbm2)
+
     def test_primary(self):
         """
         Test the GradingPrimary struct.
@@ -121,6 +128,13 @@ def test_primary(self):
         primaryLog.gamma = newGamma
         assertEqualRGBM(self, newGamma, primaryLog.gamma)
 
+        # Check comparison operators
+        primaryLog1 = OCIO.GradingPrimary(OCIO.GRADING_LOG)
+        primaryLog2 = OCIO.GradingPrimary(OCIO.GRADING_LOG)
+        self.assertEqual(primaryLog1, primaryLog2)
+        primaryLog1.saturation = 0.5
+        self.assertNotEqual(primaryLog1, primaryLog2)
+
     def test_bspline(self):
         """
         Test the GradingBSplineCurve: creation, control point modification, validation.
@@ -188,6 +202,19 @@ def test_bspline(self):
         cpts5[2] = OCIO.GradingControlPoint(y=0.6, x=0.4)
         assertEqualBSpline(self, bs, bs5)
 
+        # Check comparison operators
+        cpts1 = OCIO.GradingControlPoint(1, 1)
+        cpts2 = OCIO.GradingControlPoint(1, 1)
+        self.assertEqual(cpts1, cpts2)
+        cpts1.x = 0.5
+        self.assertNotEqual(cpts1, cpts2)
+
+        bs1 = OCIO.GradingBSplineCurve([0, 0, 0.1, 0.5, 0.4, 0.6, 0.6, 0.7, 1, 1])
+        bs2 = OCIO.GradingBSplineCurve([0, 0, 0.1, 0.5, 0.4, 0.6, 0.6, 0.7, 1, 1])
+        self.assertEqual(bs1, bs2)
+        bs1.getControlPoints()[2] = OCIO.GradingControlPoint(0.1, 0.4)
+        self.assertNotEqual(cpts1, cpts2)
+
     def test_rgbcurve(self):
         """
         Test the GradingRGBCurve, creation, default value, modification.
@@ -222,6 +249,13 @@ def test_rgbcurve(self):
         rgbVideo = OCIO.GradingRGBCurve(OCIO.GRADING_LOG)
         assertEqualRGBCurve(self, rgbLog, rgbVideo)
         
+        # Check comparison operators
+        rgbc1 = OCIO.GradingRGBCurve(OCIO.GRADING_LIN)
+        rgbc2 = OCIO.GradingRGBCurve(OCIO.GRADING_LIN)
+        self.assertEqual(rgbc1, rgbc2)
+        rgbc1.red.getControlPoints()[1] = OCIO.GradingControlPoint(0.4, 0.4)
+        self.assertNotEqual(rgbc1, rgbc2)
+
     def test_rgbmsw(self):
         """
         Test the GradingRGBMSW struct.
@@ -315,6 +349,13 @@ def test_rgbmsw(self):
         with self.assertRaises(TypeError):
             OCIO.GradingRGBMSW(green=3, start=4)
 
+        # Check comparison operators
+        rgbm1 = OCIO.GradingRGBMSW(1, 2, 3, 4, 5, 6)
+        rgbm2 = OCIO.GradingRGBMSW(1, 2, 3, 4, 5, 6)
+        self.assertEqual(rgbm1, rgbm2)
+        rgbm1.red = 2
+        self.assertNotEqual(rgbm1, rgbm2)
+
     def test_tone(self):
         """
         Test the GradingTone struct creation.
@@ -342,4 +383,11 @@ def test_tone(self):
 
         newMidtones = OCIO.GradingRGBMSW(1.1, 1.2, 1.3, 1, 0.2, 1.1)
         tone.midtones = newMidtones
-        assertEqualRGBM(self, newMidtones, tone.midtones)
\ No newline at end of file
+        assertEqualRGBM(self, newMidtones, tone.midtones)
+
+        # Check comparison operators
+        tone1 = OCIO.GradingTone(OCIO.GRADING_LOG)
+        tone2 = OCIO.GradingTone(OCIO.GRADING_LOG)
+        self.assertEqual(tone1, tone2)
+        tone1.midtones = newMidtones
+        self.assertNotEqual(tone1, tone2)
\ No newline at end of file
diff --git a/tests/python/GradingPrimaryTransformTest.py b/tests/python/GradingPrimaryTransformTest.py
index 074e750987..a4c80a1c90 100644
--- a/tests/python/GradingPrimaryTransformTest.py
+++ b/tests/python/GradingPrimaryTransformTest.py
@@ -122,8 +122,8 @@ def test_apply_dynamic(self):
         gpt1.makeDynamic()
         gpt2 = OCIO.GradingPrimaryTransform(OCIO.GRADING_LOG)
         val2 = OCIO.GradingPrimary(OCIO.GRADING_LOG)
-        val2.gamma = OCIO.GradingRGBM(1.2, 1.4, 1.1, 1.0);
-        val2.saturation = 1.5;
+        val2.gamma = OCIO.GradingRGBM(1.2, 1.4, 1.1, 1.0)
+        val2.saturation = 1.5
         gpt2.setValue(val2)
 
         group = OCIO.GroupTransform()
@@ -159,8 +159,8 @@ def test_apply_inverse(self):
 
         gpt = OCIO.GradingPrimaryTransform(OCIO.GRADING_LOG)
         val = OCIO.GradingPrimary(OCIO.GRADING_LOG)
-        val.gamma = OCIO.GradingRGBM(1.2, 1.4, 1.1, 0.7);
-        val.saturation = 1.5;
+        val.gamma = OCIO.GradingRGBM(1.2, 1.4, 1.1, 0.7)
+        val.saturation = 1.5
         gpt.setValue(val)
 
         cfg = OCIO.Config().CreateRaw()

From ffd943a1213a21e6f595083eeae3d0e898c95c6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Wed, 1 Feb 2023 09:34:47 -0500
Subject: [PATCH 54/81] Adding support for sse2neon
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                              |   14 +
 ext/sse2neon/CMakeLists.txt                 |    9 +
 ext/sse2neon/src/include/sse2neon.h         | 9079 +++++++++++++++++++
 share/cmake/utils/CheckSupportARMNeon.cmake |   21 +
 share/cmake/utils/CheckSupportSSE2.cmake    |   35 +-
 share/cmake/utils/CompilerFlags.cmake       |   18 +-
 src/OpenColorIO/CMakeLists.txt              |   14 +
 src/OpenColorIO/SSE.h                       |    8 +-
 tests/cpu/CMakeLists.txt                    |   17 +
 9 files changed, 9199 insertions(+), 16 deletions(-)
 create mode 100644 ext/sse2neon/CMakeLists.txt
 create mode 100644 ext/sse2neon/src/include/sse2neon.h
 create mode 100644 share/cmake/utils/CheckSupportARMNeon.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 932874f81c..a05df702b8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,6 +185,20 @@ message(STATUS "Checking for GPU configuration...")
 include(CheckSupportGL)
 
 
+###############################################################################
+# Check for ARM neon intrinsics (armv8)
+
+include(CheckSupportARMNeon)
+
+
+###############################################################################
+# Add sse2neon to the build since CompilerFlags needs to know if SSE2 is supported.
+
+if(HAVE_NEON)
+    add_subdirectory(ext/sse2neon)
+endif()
+
+
 ###############################################################################
 # Define compilation and link flags
 
diff --git a/ext/sse2neon/CMakeLists.txt b/ext/sse2neon/CMakeLists.txt
new file mode 100644
index 0000000000..7fd5fcb302
--- /dev/null
+++ b/ext/sse2neon/CMakeLists.txt
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+# sse2neon (modified)
+# https://github.com/DLTcollab/sse2neon
+add_library(sse2neon INTERFACE IMPORTED GLOBAL)
+set_target_properties(sse2neon PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/src/include"
+)
diff --git a/ext/sse2neon/src/include/sse2neon.h b/ext/sse2neon/src/include/sse2neon.h
new file mode 100644
index 0000000000..164c6c3387
--- /dev/null
+++ b/ext/sse2neon/src/include/sse2neon.h
@@ -0,0 +1,9079 @@
+#ifndef SSE2NEON_H
+#define SSE2NEON_H
+
+// This header file provides a simple API translation layer
+// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
+//
+// Contributors to this work are:
+//   John W. Ratcliff <jratcliffscarab@gmail.com>
+//   Brandon Rowlett <browlett@nvidia.com>
+//   Ken Fast <kfast@gdeb.com>
+//   Eric van Beurden <evanbeurden@nvidia.com>
+//   Alexander Potylitsin <apotylitsin@nvidia.com>
+//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
+//   Jim Huang <jserv@ccns.ncku.edu.tw>
+//   Mark Cheng <marktwtn@gmail.com>
+//   Malcolm James MacLeod <malcolm@gulden.com>
+//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
+//   Sebastian Pop <spop@amazon.com>
+//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
+//   Danila Kutenin <danilak@google.com>
+//   François Turban (JishinMaster) <francois.turban@gmail.com>
+//   Pei-Hsuan Hung <afcidk@gmail.com>
+//   Yang-Hao Yuan <yuanyanghau@gmail.com>
+//   Syoyo Fujita <syoyo@lighttransport.com>
+//   Brecht Van Lommel <brecht@blender.org>
+//   Jonathan Hue <jhue@adobe.com>
+//   Cuda Chen <clh960524@gmail.com>
+//   Aymen Qader <aymen.qader@arm.com>
+
+/*
+ * sse2neon is freely redistributable under the MIT License.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Tunable configurations */
+
+/* Enable precise implementation of math operations
+ * This would slow down the computation a bit, but gives consistent result with
+ * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
+ */
+/* _mm_min|max_ps|ss|pd|sd */
+#ifndef SSE2NEON_PRECISE_MINMAX
+#define SSE2NEON_PRECISE_MINMAX (0)
+#endif
+/* _mm_rcp_ps and _mm_div_ps */
+#ifndef SSE2NEON_PRECISE_DIV
+#define SSE2NEON_PRECISE_DIV (0)
+#endif
+/* _mm_sqrt_ps and _mm_rsqrt_ps */
+#ifndef SSE2NEON_PRECISE_SQRT
+#define SSE2NEON_PRECISE_SQRT (0)
+#endif
+/* _mm_dp_pd */
+#ifndef SSE2NEON_PRECISE_DP
+#define SSE2NEON_PRECISE_DP (0)
+#endif
+
+/* compiler specific definitions */
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
+#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
+#else /* non-GNU / non-clang compilers */
+#warning "Macro name collisions may happen with unsupported compiler."
+#ifndef FORCE_INLINE
+#define FORCE_INLINE static inline
+#endif
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#define _sse2neon_likely(x) (x)
+#define _sse2neon_unlikely(x) (x)
+#endif
+
+/* C language does not allow initializing a variable with a function call. */
+#ifdef __cplusplus
+#define _sse2neon_const static const
+#else
+#define _sse2neon_const const
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#if defined(_WIN32)
+/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
+ * from both MinGW-w64 and MSVC.
+ */
+#define SSE2NEON_ALLOC_DEFINED
+#endif
+
+/* If using MSVC */
+#ifdef _MSC_VER
+#include <intrin.h>
+#if (defined(_M_AMD64) || defined(__x86_64__)) || \
+    (defined(_M_ARM) || defined(__arm__))
+#define SSE2NEON_HAS_BITSCAN64
+#endif
+#endif
+
+/* Compiler barrier */
+#define SSE2NEON_BARRIER()                     \
+    do {                                       \
+        __asm__ __volatile__("" ::: "memory"); \
+        (void) 0;                              \
+    } while (0)
+
+/* Memory barriers
+ * __atomic_thread_fence does not include a compiler barrier; instead,
+ * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
+ * semantics.
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+#include <stdatomic.h>
+#endif
+
+FORCE_INLINE void _sse2neon_smp_mb(void)
+{
+    SSE2NEON_BARRIER();
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
+    !defined(__STDC_NO_ATOMICS__)
+    atomic_thread_fence(memory_order_seq_cst);
+#elif defined(__GNUC__) || defined(__clang__)
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+#else
+    /* FIXME: MSVC support */
+#endif
+}
+
+/* Architecture-specific build options */
+/* FIXME: #pragma GCC push_options is only available on GCC */
+#if defined(__GNUC__)
+#if defined(__arm__) && __ARM_ARCH == 7
+/* According to ARM C Language Extensions Architecture specification,
+ * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
+ * architecture supported.
+ */
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("fpu=neon")
+#endif
+#elif defined(__aarch64__)
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("+simd")
+#endif
+#elif __ARM_ARCH == 8
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error \
+    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#endif
+#else
+#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#endif
+#endif
+
+#include <arm_neon.h>
+#if !defined(__aarch64__) && (__ARM_ARCH == 8)
+#if defined __has_include && __has_include(<arm_acle.h>)
+#include <arm_acle.h>
+#endif
+#endif
+
+/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD
+ * and other Arm microarchtectures use.
+ * From sysctl -a on Apple M1:
+ * hw.cachelinesize: 128
+ */
+#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
+#define SSE2NEON_CACHELINE_SIZE 128
+#else
+#define SSE2NEON_CACHELINE_SIZE 64
+#endif
+
+/* Rounding functions require either Aarch64 instructions or libm failback */
+#if !defined(__aarch64__)
+#include <math.h>
+#endif
+
+/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
+ * or even not accessible in user mode.
+ * To write or access to these registers in user mode,
+ * we have to perform syscall instead.
+ */
+#if !defined(__aarch64__)
+#include <sys/time.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if defined(__GNUC__) && (__GNUC__ <= 9)
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
+
+// __builtin_shuffle introduced in GCC 4.7.0
+#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
+#define HAS__builtin_shuffle 1
+#else
+#define HAS__builtin_shuffle 0
+#endif
+
+#define HAS__builtin_shufflevector 0
+#define HAS__builtin_nontemporal_store 0
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+#if __has_builtin(__builtin_shufflevector)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __builtin_shufflevector(a, b, __VA_ARGS__)
+#elif __has_builtin(__builtin_shuffle)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __extension__({                        \
+        type tmp = {__VA_ARGS__};          \
+        __builtin_shuffle(a, b, tmp);      \
+    })
+#endif
+
+#ifdef _sse2neon_shuffle
+#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
+#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
+#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
+#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
+#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
+#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
+#endif
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+#define _MM_FROUND_NO_EXC 0x08
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+#define _MM_ROUND_NEAREST 0x0000
+#define _MM_ROUND_DOWN 0x2000
+#define _MM_ROUND_UP 0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+/* Flush zero mode macros. */
+#define _MM_FLUSH_ZERO_MASK 0x8000
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_FLUSH_ZERO_OFF 0x0000
+/* Denormals are zeros mode macros. */
+#define _MM_DENORMALS_ZERO_MASK 0x0040
+#define _MM_DENORMALS_ZERO_ON 0x0040
+#define _MM_DENORMALS_ZERO_OFF 0x0000
+
+/* indicate immediate constant argument in a given range */
+#define __constrange(a, b) const
+
+/* A few intrinsics accept traditional data types like ints or floats, but
+ * most operate on data types that are specific to SSE.
+ * If a vector type ends in d, it contains doubles, and if it does not have
+ * a suffix, it contains floats. An integer vector type can contain any type
+ * of integer, from chars to shorts to unsigned long longs.
+ */
+typedef int64x1_t __m64;
+typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
+// On ARM 32-bit architecture, the float64x2_t is not supported.
+// The data type __m128d should be represented in a different way for related
+// intrinsic conversion.
+#if defined(__aarch64__)
+typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
+#else
+typedef float32x4_t __m128d;
+#endif
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
+
+// __int64 is defined in the Intrinsics Guide which maps to different datatype
+// in different data model
+#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
+#if (defined(__x86_64__) || defined(__i386__))
+#define __int64 long long
+#else
+#define __int64 int64_t
+#endif
+#endif
+
+/* type-safe casting between types */
+
+#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
+#define vreinterpretq_m128_f32(x) (x)
+#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
+
+#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
+#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
+#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
+#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
+#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
+#define vreinterpretq_f32_m128(x) (x)
+#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
+
+#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
+#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
+#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
+#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
+#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
+#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_m128i_s64(x) (x)
+
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
+
+#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
+#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
+#define vreinterpretq_s64_m128i(x) (x)
+
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
+
+#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
+#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
+#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
+#define vreinterpret_m64_s64(x) (x)
+
+#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
+#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
+#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
+#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
+
+#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
+#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
+#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
+
+#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
+#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
+#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
+#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
+
+#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
+#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
+#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
+#define vreinterpret_s64_m64(x) (x)
+
+#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
+
+#if defined(__aarch64__)
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
+
+#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
+#define vreinterpretq_m128d_f64(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
+
+#define vreinterpretq_f64_m128d(x) (x)
+#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
+#else
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128d_f32(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_f32_m128d(x) (x)
+#endif
+
+// A struct is defined in this header file called 'SIMDVec' which can be used
+// by applications which attempt to access the contents of an __m128 struct
+// directly.  It is important to note that accessing the __m128 struct directly
+// is bad coding practice by Microsoft: @see:
+// https://learn.microsoft.com/en-us/cpp/cpp/m128
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly so the developer can use the SIMDVec as an alias for it.  Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base NEON data type for intrinsic operations.
+//
+// union intended to allow direct access to an __m128 variable using the names
+// that the MSVC compiler provides.  This union should really only be used when
+// trying to access the members of the vector as integer values.  GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause
+// a performance hit.  If it really is needed however, the original __m128
+// variable can be aliased with a pointer to this union and used to access
+// individual components.  The use of this union should be hidden behind a macro
+// that is used throughout the codebase to access the members instead of always
+// declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
+    int8_t m128_i8[16];    // as signed 8-bit integers.
+    int16_t m128_i16[8];   // as signed 16-bit integers.
+    int32_t m128_i32[4];   // as signed 32-bit integers.
+    int64_t m128_i64[2];   // as signed 64-bit integers.
+    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
+    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
+    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
+    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
+
+/* SSE macros */
+#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
+#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
+#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
+#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
+
+// Function declaration
+// SSE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
+FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
+FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
+FORCE_INLINE __m128 _mm_set_ps1(float);
+FORCE_INLINE __m128 _mm_setzero_ps(void);
+// SSE2
+FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_castps_si128(__m128);
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
+FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
+FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
+FORCE_INLINE __m128d _mm_set_pd(double, double);
+FORCE_INLINE __m128i _mm_set1_epi32(int);
+FORCE_INLINE __m128i _mm_setzero_si128();
+// SSE4.1
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
+FORCE_INLINE __m128 _mm_ceil_ps(__m128);
+FORCE_INLINE __m128d _mm_floor_pd(__m128d);
+FORCE_INLINE __m128 _mm_floor_ps(__m128);
+FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
+FORCE_INLINE __m128 _mm_round_ps(__m128, int);
+// SSE4.2
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
+
+/* Backwards compatibility for compilers with lack of specific type support */
+
+// Older gcc does not define vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__) &&                        \
+    ((__GNUC__ <= 12 && defined(__arm__)) ||                           \
+     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
+     (__GNUC__ <= 9 && defined(__aarch64__)))
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    uint8x16x4_t ret;
+    ret.val[0] = vld1q_u8(p + 0);
+    ret.val[1] = vld1q_u8(p + 16);
+    ret.val[2] = vld1q_u8(p + 32);
+    ret.val[3] = vld1q_u8(p + 48);
+    return ret;
+}
+#else
+// Wraps vld1q_u8_x4
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    return vld1q_u8_x4(p);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddv u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
+    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
+}
+#else
+// Wraps vaddv_u8
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    return vaddv_u8(v8);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddvq u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+    uint8_t res = 0;
+    for (int i = 0; i < 8; ++i)
+        res += tmp[i];
+    return res;
+}
+#else
+// Wraps vaddvq_u8
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    return vaddvq_u8(a);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddvq u16 variant */
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    uint32x4_t m = vpaddlq_u16(a);
+    uint64x2_t n = vpaddlq_u32(m);
+    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
+
+    return vget_lane_u32((uint32x2_t) o, 0);
+}
+#else
+// Wraps vaddvq_u16
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    return vaddvq_u16(a);
+}
+#endif
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<data_type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <data_type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <data_type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors cantain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *                            signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *                            unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *                      than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ *   // Set packed 8-bit integers
+ *   // 128 bits, 16 chars, per 8 bits
+ *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
+ *   // Shuffle packed 8-bit integers
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ */
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint {
+    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
+    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
+    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
+    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
+};
+
+// The bit field mapping to the FPCR(floating-point control register)
+typedef struct {
+    uint16_t res0;
+    uint8_t res1 : 6;
+    uint8_t bit22 : 1;
+    uint8_t bit23 : 1;
+    uint8_t bit24 : 1;
+    uint8_t res2 : 7;
+#if defined(__aarch64__)
+    uint32_t res3;
+#endif
+} fpcr_bitfield;
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of b and places it into the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in high
+// end of result takes the higher two 32 bit values from b and swaps them and
+// places in low end of result.
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
+{
+    float32x2_t a21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+{
+    float32x2_t a03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
+}
+
+// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
+// high
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
+{
+    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
+{
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
+{
+    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
+{
+    float32x2_t a33 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
+}
+
+// Kahan summation for accurate summation of floating-point numbers.
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
+FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
+{
+    y -= *c;
+    float t = *sum + y;
+    *c = (t - *sum) - y;
+    *sum = t;
+}
+
+#if defined(__ARM_FEATURE_CRYPTO) && \
+    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
+// Wraps vmull_p64
+FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
+    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
+    return vreinterpretq_u64_p128(vmull_p64(a, b));
+}
+#else  // ARMv7 polyfill
+// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
+//
+// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
+// 64-bit->128-bit polynomial multiply.
+//
+// It needs some work and is somewhat slow, but it is still faster than all
+// known scalar methods.
+//
+// Algorithm adapted to C from
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
+// from "Fast Software Polynomial Multiplication on ARM Processors Using the
+// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+// (https://hal.inria.fr/hal-01506572)
+static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly8x8_t a = vreinterpret_p8_u64(_a);
+    poly8x8_t b = vreinterpret_p8_u64(_b);
+
+    // Masks
+    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
+                                    vcreate_u8(0x00000000ffffffff));
+    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
+                                    vcreate_u8(0x0000000000000000));
+
+    // Do the multiplies, rotating with vext to get all combinations
+    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
+    uint8x16_t e =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
+    uint8x16_t f =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
+    uint8x16_t g =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
+    uint8x16_t h =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
+    uint8x16_t i =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
+    uint8x16_t j =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
+    uint8x16_t k =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
+
+    // Add cross products
+    uint8x16_t l = veorq_u8(e, f);  // L = E + F
+    uint8x16_t m = veorq_u8(g, h);  // M = G + H
+    uint8x16_t n = veorq_u8(i, j);  // N = I + J
+
+    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
+    // instructions.
+#if defined(__aarch64__)
+    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+#else
+    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
+    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
+    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
+    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
+#endif
+    // t0 = (L) (P0 + P1) << 8
+    // t1 = (M) (P2 + P3) << 16
+    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
+    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
+    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
+
+    // t2 = (N) (P4 + P5) << 24
+    // t3 = (K) (P6 + P7) << 32
+    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
+    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
+    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
+
+    // De-interleave
+#if defined(__aarch64__)
+    uint8x16_t t0 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t1 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t2 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+    uint8x16_t t3 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+#else
+    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
+    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
+    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
+    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
+#endif
+    // Shift the cross products
+    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
+    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
+    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
+    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
+
+    // Accumulate the products
+    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
+    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
+    uint8x16_t mix = veorq_u8(d, cross1);
+    uint8x16_t r = veorq_u8(mix, cross2);
+    return vreinterpretq_u64_u8(r);
+}
+#endif  // ARMv7 polyfill
+
+// C equivalent:
+//   __m128i _mm_shuffle_epi32_default(__m128i a,
+//                                     __constrange(0, 255) int imm) {
+//       __m128i ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+#define _mm_shuffle_epi32_default(a, imm)                                   \
+    __extension__({                                                         \
+        int32x4_t ret;                                                      \
+        ret = vmovq_n_s32(                                                  \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                        \
+        vreinterpretq_m128i_s32(ret);                                       \
+    })
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of a and places it into the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in low end
+// of result takes the higher two 32 bit values from a and swaps them and places
+// in high end of result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+}
+
+// rotates the least significant 32 bits into the most significant 32 bits, and
+// shifts the rest down
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
+
+// rotates the most significant 32 bits into the least significant 32 bits, and
+// shifts the rest up
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
+
+// gets the lower 64 bits of a, and places it in the upper 64 bits
+// gets the lower 64 bits of a and places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+{
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
+// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
+// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
+// places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+{
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
+{
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+}
+
+// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
+// int imm)
+#if defined(__aarch64__)
+#define _mm_shuffle_epi32_splat(a, imm)                          \
+    __extension__({                                              \
+        vreinterpretq_m128i_s32(                                 \
+            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
+    })
+#else
+#define _mm_shuffle_epi32_splat(a, imm)                                      \
+    __extension__({                                                          \
+        vreinterpretq_m128i_s32(                                             \
+            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
+    })
+#endif
+
+// NEON does not support a general purpose permute intrinsic.
+// Shuffle single-precision (32-bit) floating-point elements in a using the
+// control in imm8, and store the results in dst.
+//
+// C equivalent:
+//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+//                                 __constrange(0, 255) int imm) {
+//       __m128 ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+//
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
+#define _mm_shuffle_ps_default(a, b, imm)                                  \
+    __extension__({                                                        \
+        float32x4_t ret;                                                   \
+        ret = vmovq_n_f32(                                                 \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                       \
+        vreinterpretq_m128_f32(ret);                                       \
+    })
+
+// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
+// Store the results in the low 64 bits of dst, with the high 64 bits being
+// copied from from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
+#define _mm_shufflelo_epi16_function(a, imm)                                  \
+    __extension__({                                                           \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
+        int16x4_t lowBits = vget_low_s16(ret);                                \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3);                                              \
+        vreinterpretq_m128i_s16(ret);                                         \
+    })
+
+// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
+// Store the results in the high 64 bits of dst, with the low 64 bits being
+// copied from from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
+#define _mm_shufflehi_epi16_function(a, imm)                                   \
+    __extension__({                                                            \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
+        int16x4_t highBits = vget_high_s16(ret);                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+                             5);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+                             6);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+                             7);                                               \
+        vreinterpretq_m128i_s16(ret);                                          \
+    })
+
+/* MMX */
+
+//_mm_empty is a no-op on arm
+FORCE_INLINE void _mm_empty(void) {}
+
+/* SSE */
+
+// Add packed single-precision (32-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Add the lower single-precision (32-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of <a>.
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
+}
+
+// Compute the bitwise AND of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vbicq_s32(vreinterpretq_s32_m128(b),
+                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(
+        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
+FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for greater-than-or-equal, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
+FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmple_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
+FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
+FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
+FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
+FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-greater-than, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
+FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
+FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-less-than-or-equal, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
+FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
+FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-less-than, store the result in the lower element of dst, and copy
+// the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
+FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// to see if neither is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
+//
+// See also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
+{
+    // Note: NEON does not have ordered compare builtin
+    // Need to compare a eq a and b eq b to check for NaN
+    // Do AND of results to get final
+    uint32x4_t ceqaa =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t ceqbb =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b to see if neither is NaN, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
+FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// to see if either is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
+FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
+{
+    uint32x4_t f32a =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t f32b =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b to see if either is NaN, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
+FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for equality, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
+FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_eq_b =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_eq_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for greater-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
+FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_ge_b =
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_ge_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for greater-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
+FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_gt_b =
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_gt_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for less-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
+FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_le_b =
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_le_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for less-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_lt_b =
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_lt_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for not-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
+FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+{
+    return !_mm_comieq_ss(a, b);
+}
+
+// Convert packed signed 32-bit integers in b to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, and copy the upper 2 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
+FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
+FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
+#else
+    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
+#endif
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
+FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
+FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
+                          0);
+#else
+    float32_t data = vgetq_lane_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
+    return (int32_t) data;
+#endif
+}
+
+// Convert packed 16-bit integers in a to packed single-precision (32-bit)
+// floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
+FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
+}
+
+// Convert packed 32-bit integers in b to packed single-precision (32-bit)
+// floating-point elements, store the results in the lower 2 elements of dst,
+// and copy the upper 2 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
+FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, then convert the packed signed 32-bit integers in b to
+// single-precision (32-bit) floating-point element, and store the results in
+// the upper 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
+FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
+}
+
+// Convert the lower packed 8-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
+FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 16-bit integers, and store the results in dst. Note: this intrinsic
+// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
+// 0x7FFFFFFF.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
+FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
+{
+    return vreinterpret_m64_s16(
+        vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
+#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 8-bit integers, and store the results in lower 4 elements of dst.
+// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
+// between 0x7F and 0x7FFFFFFF.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
+FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
+{
+    return vreinterpret_m64_s8(vqmovn_s16(
+        vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
+}
+
+// Convert packed unsigned 16-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
+FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
+}
+
+// Convert the lower packed unsigned 8-bit integers in a to packed
+// single-precision (32-bit) floating-point elements, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
+FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_u32(
+        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
+#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
+
+// Convert the signed 64-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
+FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Copy the lower single-precision (32-bit) floating-point element of a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
+FORCE_INLINE float _mm_cvtss_f32(__m128 a)
+{
+    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
+#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
+FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
+#else
+    float32_t data = vgetq_lane_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
+    return (int64_t) data;
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
+FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
+{
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
+FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
+{
+    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
+#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
+#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
+FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
+{
+    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Divide packed single-precision (32-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
+    return vreinterpretq_m128_f32(
+        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#endif
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+#endif
+}
+
+// Divide the lower single-precision (32-bit) floating-point element in a by the
+// lower single-precision (32-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper 3 packed elements from a to
+// the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
+#define _mm_extract_pi16(a, imm) \
+    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
+
+// Free aligned memory that was allocated with _mm_malloc.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
+#if !defined(SSE2NEON_ALLOC_DEFINED)
+FORCE_INLINE void _mm_free(void *addr)
+{
+    free(addr);
+}
+#endif
+
+// Macro: Get the flush zero bits from the MXCSR control and status register.
+// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
+// _MM_FLUSH_ZERO_OFF
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
+FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
+}
+
+// Macro: Get the rounding mode bits from the MXCSR control and status register.
+// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
+// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    if (r.field.bit22) {
+        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
+    } else {
+        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
+    }
+}
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
+#define _mm_insert_pi16(a, b, imm)                               \
+    __extension__({                                              \
+        vreinterpret_m64_s16(                                    \
+            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
+    })
+
+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
+FORCE_INLINE __m128 _mm_load_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+//
+//   dst[31:0] := MEM[mem_addr+31:mem_addr]
+//   dst[63:32] := MEM[mem_addr+31:mem_addr]
+//   dst[95:64] := MEM[mem_addr+31:mem_addr]
+//   dst[127:96] := MEM[mem_addr+31:mem_addr]
+//
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
+#define _mm_load_ps1 _mm_load1_ps
+
+// Load a single-precision (32-bit) floating-point element from memory into the
+// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
+// aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
+FORCE_INLINE __m128 _mm_load_ss(const float *p)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
+FORCE_INLINE __m128 _mm_load1_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
+}
+
+// Load 2 single-precision (32-bit) floating-point elements from memory into the
+// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
+// mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
+FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
+}
+
+// Load 2 single-precision (32-bit) floating-point elements from memory into the
+// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
+// mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
+FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
+}
+
+// Load 4 single-precision (32-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
+FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
+{
+    float32x4_t v = vrev64q_f32(vld1q_f32(p));
+    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
+}
+
+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from memory into dst. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
+FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
+{
+    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
+    // equivalent for neon
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load unaligned 16-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
+FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
+{
+    return vreinterpretq_m128i_s16(
+        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
+}
+
+// Load unaligned 64-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
+FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+}
+
+// Allocate size bytes of memory, aligned to the alignment specified in align,
+// and return a pointer to the allocated memory. _mm_free should be used to free
+// memory that is allocated with _mm_malloc.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
+#if !defined(SSE2NEON_ALLOC_DEFINED)
+FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
+{
+    void *ptr;
+    if (align == 1)
+        return malloc(size);
+    if (align == 2 || (sizeof(void *) == 8 && align == 4))
+        align = sizeof(void *);
+    if (!posix_memalign(&ptr, align, size))
+        return ptr;
+    return NULL;
+}
+#endif
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
+FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
+{
+    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
+    __m128 b = _mm_load_ps((const float *) mem_addr);
+    int8x8_t masked =
+        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
+                vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
+    vst1_s8((int8_t *) mem_addr, masked);
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
+#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
+FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b,
+// and store packed maximum values in dst. dst does not follow the IEEE Standard
+// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
+// signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128_f32(
+        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
+FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b, store the maximum value in the lower element of dst, and copy the upper 3
+// packed elements from a to the upper element of dst. dst does not follow the
+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
+// inputs are NaN or signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
+FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
+{
+    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
+FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b,
+// and store packed minimum values in dst. dst does not follow the IEEE Standard
+// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
+// signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
+FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128_f32(
+        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
+FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b, store the minimum value in the lower element of dst, and copy the upper 3
+// packed elements from a to the upper element of dst. dst does not follow the
+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
+// inputs are NaN or signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
+FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+{
+    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Move the lower single-precision (32-bit) floating-point element from b to the
+// lower element of dst, and copy the upper 3 packed elements from a to the
+// upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
+FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
+                       vreinterpretq_f32_m128(a), 0));
+}
+
+// Move the upper 2 single-precision (32-bit) floating-point elements from b to
+// the lower 2 elements of dst, and copy the upper 2 elements from a to the
+// upper 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
+FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
+{
+#if defined(aarch64__)
+    return vreinterpretq_m128_u64(
+        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
+#else
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
+#endif
+}
+
+// Move the lower 2 single-precision (32-bit) floating-point elements from b to
+// the upper 2 elements of dst, and copy the lower 2 elements from a to the
+// lower 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
+FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
+FORCE_INLINE int _mm_movemask_pi8(__m64 a)
+{
+    uint8x8_t input = vreinterpret_u8_m64(a);
+#if defined(__aarch64__)
+    static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint8x8_t tmp = vshr_n_u8(input, 7);
+    return vaddv_u8(vshl_u8(tmp, shift));
+#else
+    // Refer the implementation of `_mm_movemask_epi8`
+    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
+    uint32x2_t paired16 =
+        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
+    uint8x8_t paired32 =
+        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
+    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
+#endif
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed single-precision (32-bit) floating-point element in a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__)
+    static const int32x4_t shift = {0, 1, 2, 3};
+    uint32x4_t tmp = vshrq_n_u32(input, 31);
+    return vaddvq_u32(vshlq_u32(tmp, shift));
+#else
+    // Uses the exact same method as _mm_movemask_epi8, see that for details.
+    // Shift out everything but the sign bits with a 32-bit unsigned shift
+    // right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired =
+        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
+}
+
+// Multiply packed single-precision (32-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
+FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Multiply the lower single-precision (32-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_mul_ps(a, b));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(vshrn_n_u32(
+        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
+}
+
+// Compute the bitwise OR of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
+FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
+#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
+#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
+#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
+#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
+#define _m_pminsw(a, b) _mm_min_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
+#define _m_pminub(a, b) _mm_min_pu8(a, b)
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
+#define _m_pmovmskb(a) _mm_movemask_pi8(a)
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
+#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
+
+// Fetch the line of data from memory that contains address p to a location in
+// the cache heirarchy specified by the locality hint i.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
+FORCE_INLINE void _mm_prefetch(char const *p, int i)
+{
+    switch (i) {
+    case _MM_HINT_NTA:
+        __builtin_prefetch(p, 0, 0);
+        break;
+    case _MM_HINT_T0:
+        __builtin_prefetch(p, 0, 3);
+        break;
+    case _MM_HINT_T1:
+        __builtin_prefetch(p, 0, 2);
+        break;
+    case _MM_HINT_T2:
+        __builtin_prefetch(p, 0, 1);
+        break;
+    }
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
+#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
+#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
+
+// Compute the approximate reciprocal of packed single-precision (32-bit)
+// floating-point elements in a, and store the results in dst. The maximum
+// relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#endif
+    return vreinterpretq_m128_f32(recip);
+}
+
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
+{
+    return _mm_move_ss(a, _mm_rcp_ps(a));
+}
+
+// Compute the approximate reciprocal square root of packed single-precision
+// (32-bit) floating-point elements in a, and store the results in dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+#if SSE2NEON_PRECISE_SQRT
+    // Additional Netwon-Raphson iteration for accuracy
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#endif
+    return vreinterpretq_m128_f32(out);
+}
+
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
+FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+{
+    uint64x1_t t = vpaddl_u32(vpaddl_u16(
+        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
+    return vreinterpret_m64_u16(
+        vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
+}
+
+// Macro: Set the flush zero bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The flush zero may contain any of the
+// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
+FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
+{
+    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+    // regardless of the value of the FZ bit.
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+// Set packed single-precision (32-bit) floating-point elements in dst with the
+// supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
+FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Broadcast single-precision (32-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
+FORCE_INLINE __m128 _mm_set_ps1(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Macro: Set the rounding mode bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The rounding mode may contain any of
+// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
+// _MM_ROUND_TOWARD_ZERO
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
+FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    switch (rounding) {
+    case _MM_ROUND_TOWARD_ZERO:
+        r.field.bit22 = 1;
+        r.field.bit23 = 1;
+        break;
+    case _MM_ROUND_DOWN:
+        r.field.bit22 = 0;
+        r.field.bit23 = 1;
+        break;
+    case _MM_ROUND_UP:
+        r.field.bit22 = 1;
+        r.field.bit23 = 0;
+        break;
+    default:  //_MM_ROUND_NEAREST
+        r.field.bit22 = 0;
+        r.field.bit23 = 0;
+    }
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+// Copy single-precision (32-bit) floating-point element a to the lower element
+// of dst, and zero the upper 3 elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
+FORCE_INLINE __m128 _mm_set_ss(float a)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
+}
+
+// Broadcast single-precision (32-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
+FORCE_INLINE __m128 _mm_set1_ps(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Set the MXCSR control and status register with the value in unsigned 32-bit
+// integer a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
+// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
+FORCE_INLINE void _mm_setcsr(unsigned int a)
+{
+    _MM_SET_ROUNDING_MODE(a);
+}
+
+// Get the unsigned 32-bit value of the MXCSR control and status register.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
+// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
+FORCE_INLINE unsigned int _mm_getcsr()
+{
+    return _MM_GET_ROUNDING_MODE();
+}
+
+// Set packed single-precision (32-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Return vector of type __m128 with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
+FORCE_INLINE __m128 _mm_setzero_ps(void)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(0));
+}
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_pi16(a, imm)                                           \
+    __extension__({                                                        \
+        vreinterpret_m64_s16(vshuffle_s16(                                 \
+            vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
+            ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)));  \
+    })
+#else
+#define _mm_shuffle_pi16(a, imm)                                               \
+    __extension__({                                                            \
+        int16x4_t ret;                                                         \
+        ret =                                                                  \
+            vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret,   \
+            1);                                                                \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret,   \
+            2);                                                                \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret,   \
+            3);                                                                \
+        vreinterpret_m64_s16(ret);                                             \
+    })
+#endif
+
+// Perform a serializing operation on all store-to-memory instructions that were
+// issued prior to this instruction. Guarantees that every store instruction
+// that precedes, in program order, is globally visible before any store
+// instruction which follows the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
+FORCE_INLINE void _mm_sfence(void)
+{
+    _sse2neon_smp_mb();
+}
+
+// Perform a serializing operation on all load-from-memory and store-to-memory
+// instructions that were issued prior to this instruction. Guarantees that
+// every memory access that precedes, in program order, the memory fence
+// instruction is globally visible before any memory instruction which follows
+// the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
+FORCE_INLINE void _mm_mfence(void)
+{
+    _sse2neon_smp_mb();
+}
+
+// Perform a serializing operation on all load-from-memory instructions that
+// were issued prior to this instruction. Guarantees that every load instruction
+// that precedes, in program order, is globally visible before any load
+// instruction which follows the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
+FORCE_INLINE void _mm_lfence(void)
+{
+    _sse2neon_smp_mb();
+}
+
+// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
+// int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_ps(a, b, imm)                                              \
+    __extension__({                                                            \
+        float32x4_t _input1 = vreinterpretq_f32_m128(a);                       \
+        float32x4_t _input2 = vreinterpretq_f32_m128(b);                       \
+        float32x4_t _shuf =                                                    \
+            vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+                          (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
+        vreinterpretq_m128_f32(_shuf);                                         \
+    })
+#else  // generic
+#define _mm_shuffle_ps(a, b, imm)                          \
+    __extension__({                                        \
+        __m128 ret;                                        \
+        switch (imm) {                                     \
+        case _MM_SHUFFLE(1, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_1032((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 3, 0, 1):                      \
+            ret = _mm_shuffle_ps_2301((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 3, 2, 1):                      \
+            ret = _mm_shuffle_ps_0321((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 1, 0, 3):                      \
+            ret = _mm_shuffle_ps_2103((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 1, 0):                      \
+            ret = _mm_movelh_ps((a), (b));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_1001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 1, 0, 1):                      \
+            ret = _mm_shuffle_ps_0101((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 1, 0):                      \
+            ret = _mm_shuffle_ps_3210((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 1, 1):                      \
+            ret = _mm_shuffle_ps_0011((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 2, 2):                      \
+            ret = _mm_shuffle_ps_0022((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 2, 0, 0):                      \
+            ret = _mm_shuffle_ps_2200((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 0, 2):                      \
+            ret = _mm_shuffle_ps_3202((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 3, 2):                      \
+            ret = _mm_movehl_ps((b), (a));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 1, 3, 3):                      \
+            ret = _mm_shuffle_ps_1133((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 1, 0):                      \
+            ret = _mm_shuffle_ps_2010((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_2001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_2032((a), (b));           \
+            break;                                         \
+        default:                                           \
+            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
+            break;                                         \
+        }                                                  \
+        ret;                                               \
+    })
+#endif
+
+// Compute the square root of packed single-precision (32-bit) floating-point
+// elements in a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+{
+#if SSE2NEON_PRECISE_SQRT
+    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+    // Test for vrsqrteq_f32(0) -> positive infinity case.
+    // Change to zero, so that s * 1/sqrt(s) result is zero too.
+    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+    const uint32x4_t div_by_zero =
+        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
+    recip = vreinterpretq_f32_u32(
+        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
+
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+
+    // sqrt(s) = s * 1/sqrt(s)
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
+#elif defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#else
+    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+    float32x4_t sq = vrecpeq_f32(recipsq);
+    return vreinterpretq_m128_f32(sq);
+#endif
+}
+
+// Compute the square root of the lower single-precision (32-bit) floating-point
+// element in a, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
+FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    vst1q_f32(p, vdupq_n_f32(a0));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
+#define _mm_store1_ps _mm_store_ps1
+
+// Store the upper 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
+FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_high_f32(a));
+}
+
+// Store the lower 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
+FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_low_f32(a));
+}
+
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
+FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
+{
+    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
+    float32x4_t rev = vextq_f32(tmp, tmp, 2);
+    vst1q_f32(p, rev);
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Stores 16-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
+FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
+{
+    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
+}
+
+// Stores 64-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
+FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
+{
+    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
+}
+
+// Store 64-bits of integer data from a into memory using a non-temporal memory
+// hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
+FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
+{
+    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
+// point elements) from a into memory using a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
+FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
+#else
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+#endif
+}
+
+// Subtract packed single-precision (32-bit) floating-point elements in b from
+// packed single-precision (32-bit) floating-point elements in a, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_sub_ps(a, b));
+}
+
+// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
+// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
+// transposed matrix in these vectors (row0 now contains column 0, etc.).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
+    do {                                                  \
+        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
+        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
+        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
+                            vget_low_f32(ROW23.val[0]));  \
+        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
+                            vget_low_f32(ROW23.val[1]));  \
+        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
+                            vget_high_f32(ROW23.val[0])); \
+        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
+                            vget_high_f32(ROW23.val[1])); \
+    } while (0)
+
+// according to the documentation, these intrinsics behave the same as the
+// non-'u' versions.  We'll just alias them here.
+#define _mm_ucomieq_ss _mm_comieq_ss
+#define _mm_ucomige_ss _mm_comige_ss
+#define _mm_ucomigt_ss _mm_comigt_ss
+#define _mm_ucomile_ss _mm_comile_ss
+#define _mm_ucomilt_ss _mm_comilt_ss
+#define _mm_ucomineq_ss _mm_comineq_ss
+
+// Return vector of type __m128i with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
+FORCE_INLINE __m128i _mm_undefined_si128(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128i a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Return vector of type __m128 with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
+FORCE_INLINE __m128 _mm_undefined_ps(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128 a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Unpack and interleave single-precision (32-bit) floating-point elements from
+// the high half a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
+FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave single-precision (32-bit) floating-point elements from
+// the low half of a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
+FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
+FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+/* SSE2 */
+
+// Add packed 16-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Add packed 32-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Add packed 64-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Add packed 8-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
+FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Add packed double-precision (64-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
+FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1] + db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add the lower double-precision (64-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper element from
+// a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
+FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_add_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add 64-bit integers a and b, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
+FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Add packed signed 16-bit integers in a and b using saturation, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
+FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Add packed signed 8-bit integers in a and b using saturation, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
+FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Add packed unsigned 16-bit integers in a and b using saturation, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
+FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Add packed unsigned 8-bit integers in a and b using saturation, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
+FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compute the bitwise AND of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
+FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
+FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
+FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
+{
+    // *NOTE* argument swap
+    return vreinterpretq_m128d_s64(
+        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
+}
+
+// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
+// AND with b, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
+FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vbicq_s32(vreinterpretq_s32_m128i(b),
+                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
+FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
+{
+    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
+                                 vreinterpretq_u16_m128i(b));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
+FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
+#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
+
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
+#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
+
+// Cast vector of type __m128d to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
+FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+{
+    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
+FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
+{
+    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
+FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
+{
+    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
+#else
+    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
+#endif
+}
+
+// Cast vector of type __m128i to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Invalidate and flush the cache line that contains p from all levels of the
+// cache hierarchy.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
+#if defined(__APPLE__)
+#include <libkern/OSCacheControl.h>
+#endif
+FORCE_INLINE void _mm_clflush(void const *p)
+{
+    (void) p;
+
+    /* sys_icache_invalidate is supported since macOS 10.5.
+     * However, it does not work on non-jailbroken iOS devices, although the
+     * compilation is successful.
+     */
+#if defined(__APPLE__)
+    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
+#elif defined(__GNUC__) || defined(__clang__)
+    uintptr_t ptr = (uintptr_t) p;
+    __builtin___clear_cache((char *) ptr,
+                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
+#else
+    /* FIXME: MSVC support */
+#endif
+}
+
+// Compare packed 16-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed 8-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
+FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
+FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
+FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than-or-equal, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
+FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed signed 16-bit integers in a and b for greater-than, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
+FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b for greater-than, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
+FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b for greater-than, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
+FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
+FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
+FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
+FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
+FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmple_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed signed 16-bit integers in a and b for less-than, and store the
+// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
+// order of the operands switched.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
+FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b for less-than, and store the
+// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
+// order of the operands switched.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
+FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b for less-than, and store the
+// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
+// order of the operands switched.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
+FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
+FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
+FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
+FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
+FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
+FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
+FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
+FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
+FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
+FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-less-than-or-equal, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
+FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
+FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-less-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
+FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// to see if neither is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
+FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    // Excluding NaNs, any two floating point numbers can be compared.
+    uint64x2_t not_nan_a =
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
+    uint64x2_t not_nan_b =
+        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
+            (*(double *) &b1) == (*(double *) &b1))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b to see if neither is NaN, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
+FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// to see if either is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
+FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    // Two NaNs are not equal in comparison operation.
+    uint64x2_t not_nan_a =
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
+    uint64x2_t not_nan_b =
+        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_s32(
+        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
+            (*(double *) &b1) == (*(double *) &b1))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b to see if either is NaN, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
+FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for greater-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
+FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 >= *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for greater-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
+FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 > *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for less-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
+FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 <= *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for less-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
+FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 < *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for equality, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
+FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
+#else
+    uint32x4_t a_not_nan =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
+    uint32x4_t b_not_nan =
+        vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_eq_b =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
+                                       vreinterpretq_u64_u32(a_eq_b));
+    return vgetq_lane_u64(and_results, 0) & 0x1;
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for not-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
+FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
+{
+    return !_mm_comieq_sd(a, b);
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
+FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
+#else
+    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
+FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
+{
+// vrnd32xq_f64 not supported on clang
+#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
+    float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
+    int64x2_t integers = vcvtq_s64_f64(rounded);
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double d0 = ((double *) &rnd)[0];
+    double d1 = ((double *) &rnd)[1];
+    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
+#endif
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
+FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
+{
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double d0 = ((double *) &rnd)[0];
+    double d1 = ((double *) &rnd)[1];
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
+    return vreinterpret_m64_s32(vld1_s32(data));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed single-precision (32-bit) floating-point elements, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
+FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+{
+#if defined(__aarch64__)
+    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
+    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+#else
+    float a0 = (float) ((double *) &a)[0];
+    float a1 = (float) ((double *) &a)[1];
+    return _mm_set_ps(0, 0, a1, a0);
+#endif
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
+FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
+#else
+    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
+    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
+// does not support! It is supported on ARMv8-A however.
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
+{
+#if defined(__ARM_FEATURE_FRINT)
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
+#elif defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    switch (_MM_GET_ROUNDING_MODE()) {
+    case _MM_ROUND_NEAREST:
+        return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
+    case _MM_ROUND_DOWN:
+        return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
+    case _MM_ROUND_UP:
+        return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
+    default:  // _MM_ROUND_TOWARD_ZERO
+        return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
+    }
+#else
+    float *f = (float *) &a;
+    switch (_MM_GET_ROUNDING_MODE()) {
+    case _MM_ROUND_NEAREST: {
+        uint32x4_t signmask = vdupq_n_u32(0x80000000);
+        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
+        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+        int32x4_t r_trunc = vcvtq_s32_f32(
+            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+        float32x4_t delta = vsubq_f32(
+            vreinterpretq_f32_m128(a),
+            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+        uint32x4_t is_delta_half =
+            vceqq_f32(delta, half); /* delta == +/- 0.5 */
+        return vreinterpretq_m128i_s32(
+            vbslq_s32(is_delta_half, r_even, r_normal));
+    }
+    case _MM_ROUND_DOWN:
+        return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
+                             floorf(f[0]));
+    case _MM_ROUND_UP:
+        return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
+                             ceilf(f[0]));
+    default:  // _MM_ROUND_TOWARD_ZERO
+        return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
+                             (int32_t) f[0]);
+    }
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed double-precision (64-bit) floating-point elements, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
+FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
+#else
+    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Copy the lower double-precision (64-bit) floating-point element of a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
+FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
+{
+#if defined(__aarch64__)
+    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
+#else
+    return ((double *) &a)[0];
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
+FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
+{
+#if defined(__aarch64__)
+    return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double ret = ((double *) &rnd)[0];
+    return (int32_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
+FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
+{
+#if defined(__aarch64__)
+    return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double ret = ((double *) &rnd)[0];
+    return (int64_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
+#define _mm_cvtsd_si64x _mm_cvtsd_si64
+
+// Convert the lower double-precision (64-bit) floating-point element in b to a
+// single-precision (32-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper 3 packed elements from a to the
+// upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
+FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsetq_lane_f32(
+        vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
+        vreinterpretq_f32_m128(a), 0));
+#else
+    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
+                                                 vreinterpretq_f32_m128(a), 0));
+#endif
+}
+
+// Copy the lower 32-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
+FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
+{
+    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
+FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
+{
+    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Convert the signed 32-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
+FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+    double bf = (double) b;
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Copy 32-bit integer a to the lower elements of dst, and zero the upper
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
+FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+{
+    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
+}
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
+FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+    double bf = (double) b;
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
+FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
+{
+    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
+}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
+#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
+#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
+
+// Convert the lower single-precision (32-bit) floating-point element in b to a
+// double-precision (64-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
+FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
+{
+    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
+FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
+{
+    double a0 = ((double *) &a)[0];
+    double a1 = ((double *) &a)[1];
+    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
+FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
+{
+    double a0 = ((double *) &a)[0];
+    double a1 = ((double *) &a)[1];
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
+    return vreinterpret_m64_s32(vld1_s32(data));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
+FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
+FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
+{
+    double ret = *((double *) &a);
+    return (int32_t) ret;
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
+FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    double ret = *((double *) &a);
+    return (int64_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
+#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
+
+// Divide packed double-precision (64-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
+FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] / db[0];
+    c[1] = da[1] / db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Divide the lower double-precision (64-bit) floating-point element in a by the
+// lower double-precision (64-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper element from a to the upper
+// element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
+FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    float64x2_t tmp =
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
+#else
+    return _mm_move_sd(a, _mm_div_pd(a, b));
+#endif
+}
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
+// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
+#define _mm_extract_epi16(a, imm) \
+    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
+// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
+//                                       __constrange(0,8) int imm)
+#define _mm_insert_epi16(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s16(                                     \
+            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
+    })
+
+// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
+FORCE_INLINE __m128d _mm_load_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64(p));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
+#define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower of dst, and zero the upper element. mem_addr does not need to be
+// aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
+FORCE_INLINE __m128d _mm_load_sd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
+// on a 16-byte boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
+FORCE_INLINE __m128d _mm_load1_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
+FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
+#else
+    return vreinterpretq_m128d_f32(vcombine_f32(
+        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
+#endif
+}
+
+// Load 64-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
+{
+    /* Load the lower 64 bits of the value pointed to by p into the
+     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
+     */
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
+FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
+#else
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vld1_f32((const float *) p),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+#endif
+}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
+{
+#if defined(__aarch64__)
+    float64x2_t v = vld1q_f64(p);
+    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+#else
+    int64x2_t v = vld1q_s64((const int64_t *) p);
+    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+#endif
+}
+
+// Loads two double-precision from unaligned memory, floating-point values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+    return _mm_load_pd(p);
+}
+
+// Load 128-bits of integer data from memory into dst. mem_addr does not need to
+// be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+    return vreinterpretq_m128i_s32(
+        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
+// 32-bit integers, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+#if defined(__aarch64__)
+    int32x4_t high =
+        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
+
+    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
+#else
+    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                               vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+#endif
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint. mem_addr does not need to be aligned
+// on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
+FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
+{
+    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
+    __m128 b = _mm_load_ps((const float *) mem_addr);
+    int8x16_t masked =
+        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
+                 vreinterpretq_s8_m128(b));
+    vst1q_s8((int8_t *) mem_addr, masked);
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
+FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
+FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed maximum values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
+FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+#if SSE2NEON_PRECISE_MINMAX
+    float64x2_t _a = vreinterpretq_f64_m128d(a);
+    float64x2_t _b = vreinterpretq_f64_m128d(b);
+    return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128d_f64(
+        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#endif
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the maximum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
+FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_max_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
+#endif
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
+FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
+FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed minimum values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
+FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+#if SSE2NEON_PRECISE_MINMAX
+    float64x2_t _a = vreinterpretq_f64_m128d(a);
+    float64x2_t _b = vreinterpretq_f64_m128d(b);
+    return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128d_f64(
+        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#endif
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the minimum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
+FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_min_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
+#endif
+}
+
+// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
+// upper element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
+FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
+}
+
+// Move the lower double-precision (64-bit) floating-point element from b to the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
+FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+}
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
+FORCE_INLINE int _mm_movemask_epi8(__m128i a)
+{
+    // Use increasingly wide shifts+adds to collect the sign bits
+    // together.
+    // Since the widening shifts would be rather confusing to follow in little
+    // endian, everything will be illustrated in big endian order instead. This
+    // has a different result - the bits would actually be reversed on a big
+    // endian machine.
+
+    // Starting input (only half the elements are shown):
+    // 89 ff 1d c0 00 10 99 33
+    uint8x16_t input = vreinterpretq_u8_m128i(a);
+
+    // Shift out everything but the sign bits with an unsigned shift right.
+    //
+    // Bytes of the vector::
+    // 89 ff 1d c0 00 10 99 33
+    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
+    //  |  |  |  |  |  |  |  |
+    // 01 01 00 01 00 00 01 00
+    //
+    // Bits of first important lane(s):
+    // 10001001 (89)
+    // \______
+    //        |
+    // 00000001 (01)
+    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+    // Merge the even lanes together with a 16-bit unsigned shift right + add.
+    // 'xx' represents garbage data which will be ignored in the final result.
+    // In the important bytes, the add functions like a binary OR.
+    //
+    // 01 01 00 01 00 00 01 00
+    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
+    //    \|    \|    \|    \|
+    // xx 03 xx 01 xx 00 xx 02
+    //
+    // 00000001 00000001 (01 01)
+    //        \_______ |
+    //                \|
+    // xxxxxxxx xxxxxx11 (xx 03)
+    uint32x4_t paired16 =
+        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+    // Repeat with a wider 32-bit shift + add.
+    // xx 03 xx 01 xx 00 xx 02
+    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
+    //     14))
+    //          \|          \|
+    // xx xx xx 0d xx xx xx 02
+    //
+    // 00000011 00000001 (03 01)
+    //        \\_____ ||
+    //         '----.\||
+    // xxxxxxxx xxxx1101 (xx 0d)
+    uint64x2_t paired32 =
+        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
+    // lanes. xx xx xx 0d xx xx xx 02
+    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
+    //            28))
+    //                      \|
+    // xx xx xx xx xx xx xx d2
+    //
+    // 00001101 00000010 (0d 02)
+    //     \   \___ |  |
+    //      '---.  \|  |
+    // xxxxxxxx 11010010 (xx d2)
+    uint8x16_t paired64 =
+        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+    // xx xx xx xx xx xx xx d2
+    //                      ||  return paired64[0]
+    //                      d2
+    // Note: Little endian would return the correct value 4b (01001011) instead.
+    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed double-precision (64-bit) floating-point element in a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
+FORCE_INLINE int _mm_movemask_pd(__m128d a)
+{
+    uint64x2_t input = vreinterpretq_u64_m128d(a);
+    uint64x2_t high_bits = vshrq_n_u64(input, 63);
+    return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+{
+    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+}
+
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+}
+
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
+// a and b, and store the unsigned 64-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+{
+    // vmull_u32 upcasts instead of masking, so we downcast.
+    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
+    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
+    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
+}
+
+// Multiply packed double-precision (64-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
+FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] * db[0];
+    c[1] = da[1] * db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Multiply the lower double-precision (64-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper element
+// from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
+FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_mul_pd(a, b));
+}
+
+// Multiply the low unsigned 32-bit integers from a and b, and store the
+// unsigned 64-bit result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
+FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u64(vget_low_u64(
+        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
+}
+
+// Multiply the packed signed 16-bit integers in a and b, producing intermediate
+// 32-bit integers, and store the high 16 bits of the intermediate integers in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+    /* FIXME: issue with large values because of result saturation */
+    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
+    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
+    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
+    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
+FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
+{
+    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
+#if defined(__aarch64__)
+    uint32x4_t ab7654 =
+        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
+                              vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r);
+#else
+    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+#endif
+}
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
+// integers, and store the low 16 bits of the intermediate integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
+FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compute the bitwise OR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
+FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
+FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using signed saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
+// using signed saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using unsigned saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Pause the processor. This is typically used in spin-wait loops and depending
+// on the x86 processor typical values are in the 40-100 cycle range. The
+// 'yield' instruction isn't a good fit because it's effectively a nop on most
+// Arm cores. Experience with several databases has shown has shown an 'isb' is
+// a reasonable approximation.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
+FORCE_INLINE void _mm_pause()
+{
+    __asm__ __volatile__("isb\n");
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+{
+    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
+}
+
+// Set packed 16-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+                                   short i6,
+                                   short i5,
+                                   short i4,
+                                   short i3,
+                                   short i2,
+                                   short i1,
+                                   short i0)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    return vreinterpretq_m128i_s16(vld1q_s16(data));
+}
+
+// Set packed 32-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+}
+
+// Set packed 8-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
+FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
+                                  signed char b14,
+                                  signed char b13,
+                                  signed char b12,
+                                  signed char b11,
+                                  signed char b10,
+                                  signed char b9,
+                                  signed char b8,
+                                  signed char b7,
+                                  signed char b6,
+                                  signed char b5,
+                                  signed char b4,
+                                  signed char b3,
+                                  signed char b2,
+                                  signed char b1,
+                                  signed char b0)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
+FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
+{
+    double ALIGN_STRUCT(16) data[2] = {e0, e1};
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
+#else
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
+#endif
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
+#define _mm_set_pd1 _mm_set1_pd
+
+// Copy double-precision (64-bit) floating-point element a to the lower element
+// of dst, and zero the upper element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
+FORCE_INLINE __m128d _mm_set_sd(double a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
+#else
+    return _mm_set_pd(0, a);
+#endif
+}
+
+// Broadcast 16-bit integer a to all all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
+FORCE_INLINE __m128i _mm_set1_epi16(short w)
+{
+    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
+}
+
+// Broadcast 32-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
+}
+
+// Broadcast 64-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
+FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
+}
+
+// Broadcast 64-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
+}
+
+// Broadcast 8-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
+FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
+{
+    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
+FORCE_INLINE __m128d _mm_set1_pd(double d)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vdupq_n_f64(d));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
+#endif
+}
+
+// Set packed 16-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
+FORCE_INLINE __m128i _mm_setr_epi16(short w0,
+                                    short w1,
+                                    short w2,
+                                    short w3,
+                                    short w4,
+                                    short w5,
+                                    short w6,
+                                    short w7)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
+    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
+}
+
+// Set packed 32-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
+FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
+FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
+{
+    return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
+}
+
+// Set packed 8-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
+FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
+                                   signed char b1,
+                                   signed char b2,
+                                   signed char b3,
+                                   signed char b4,
+                                   signed char b5,
+                                   signed char b6,
+                                   signed char b7,
+                                   signed char b8,
+                                   signed char b9,
+                                   signed char b10,
+                                   signed char b11,
+                                   signed char b12,
+                                   signed char b13,
+                                   signed char b14,
+                                   signed char b15)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
+FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
+{
+    return _mm_set_pd(e0, e1);
+}
+
+// Return vector of type __m128d with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
+FORCE_INLINE __m128d _mm_setzero_pd(void)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vdupq_n_f64(0));
+#else
+    return vreinterpretq_m128d_f32(vdupq_n_f32(0));
+#endif
+}
+
+// Return vector of type __m128i with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
+FORCE_INLINE __m128i _mm_setzero_si128(void)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
+}
+
+// Shuffle 32-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
+// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
+//                                        __constrange(0,255) int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_epi32(a, imm)                                            \
+    __extension__({                                                          \
+        int32x4_t _input = vreinterpretq_s32_m128i(a);                       \
+        int32x4_t _shuf =                                                    \
+            vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+                          ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
+        vreinterpretq_m128i_s32(_shuf);                                      \
+    })
+#else  // generic
+#define _mm_shuffle_epi32(a, imm)                        \
+    __extension__({                                      \
+        __m128i ret;                                     \
+        switch (imm) {                                   \
+        case _MM_SHUFFLE(1, 0, 3, 2):                    \
+            ret = _mm_shuffle_epi_1032((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 3, 0, 1):                    \
+            ret = _mm_shuffle_epi_2301((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 3, 2, 1):                    \
+            ret = _mm_shuffle_epi_0321((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 1, 0, 3):                    \
+            ret = _mm_shuffle_epi_2103((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 1, 0):                    \
+            ret = _mm_shuffle_epi_1010((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 0, 1):                    \
+            ret = _mm_shuffle_epi_1001((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 0, 1):                    \
+            ret = _mm_shuffle_epi_0101((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 1, 1):                    \
+            ret = _mm_shuffle_epi_2211((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 2, 2):                    \
+            ret = _mm_shuffle_epi_0122((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 2):                    \
+            ret = _mm_shuffle_epi_3332((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 0, 0, 0):                    \
+            ret = _mm_shuffle_epi32_splat((a), 0);       \
+            break;                                       \
+        case _MM_SHUFFLE(1, 1, 1, 1):                    \
+            ret = _mm_shuffle_epi32_splat((a), 1);       \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 2, 2):                    \
+            ret = _mm_shuffle_epi32_splat((a), 2);       \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 3):                    \
+            ret = _mm_shuffle_epi32_splat((a), 3);       \
+            break;                                       \
+        default:                                         \
+            ret = _mm_shuffle_epi32_default((a), (imm)); \
+            break;                                       \
+        }                                                \
+        ret;                                             \
+    })
+#endif
+
+// Shuffle double-precision (64-bit) floating-point elements using the control
+// in imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_pd(a, b, imm8)                                            \
+    vreinterpretq_m128d_s64(                                                  \
+        vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
+                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
+#else
+#define _mm_shuffle_pd(a, b, imm8)                                     \
+    _mm_castsi128_pd(_mm_set_epi64x(                                   \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
+#endif
+
+// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shufflehi_epi16(a, imm)                                           \
+    __extension__({                                                           \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);                        \
+        int16x8_t _shuf =                                                     \
+            vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
+                          (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
+                          (((imm) >> 6) & 0x3) + 4);                          \
+        vreinterpretq_m128i_s16(_shuf);                                       \
+    })
+#else  // generic
+#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
+#endif
+
+// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shufflelo_epi16(a, imm)                                  \
+    __extension__({                                                  \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
+        int16x8_t _shuf = vshuffleq_s16(                             \
+            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
+            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
+        vreinterpretq_m128i_s16(_shuf);                              \
+    })
+#else  // generic
+#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
+#endif
+
+// Shift packed 16-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
+FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16((int16_t) c);
+    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
+}
+
+// Shift packed 32-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
+FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32((int32_t) c);
+    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
+}
+
+// Shift packed 64-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
+FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~63))
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64((int64_t) c);
+    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
+}
+
+// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
+FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
+{
+    if (_sse2neon_unlikely(imm & ~15))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s16(
+        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
+}
+
+// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
+FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
+{
+    if (_sse2neon_unlikely(imm & ~31))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s32(
+        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
+}
+
+// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
+FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
+{
+    if (_sse2neon_unlikely(imm & ~63))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s64(
+        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
+}
+
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
+#define _mm_slli_si128(a, imm)                                         \
+    __extension__({                                                    \
+        int8x16_t ret;                                                 \
+        if (_sse2neon_unlikely(imm == 0))                              \
+            ret = vreinterpretq_s8_m128i(a);                           \
+        else if (_sse2neon_unlikely((imm) & ~15))                      \
+            ret = vdupq_n_s8(0);                                       \
+        else                                                           \
+            ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a),   \
+                           ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
+        vreinterpretq_m128i_s8(ret);                                   \
+    })
+
+// Compute the square root of packed double-precision (64-bit) floating-point
+// elements in a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
+FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double a0 = sqrt(((double *) &a)[0]);
+    double a1 = sqrt(((double *) &a)[1]);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Compute the square root of the lower double-precision (64-bit) floating-point
+// element in b, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
+FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_sqrt_pd(b));
+#else
+    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
+#endif
+}
+
+// Shift packed 16-bit integers in a right by count while shifting in sign bits,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
+FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_cmplt_epi16(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
+}
+
+// Shift packed 32-bit integers in a right by count while shifting in sign bits,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
+FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_cmplt_epi32(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
+}
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in sign
+// bits, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
+FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
+{
+    const int count = (imm & ~15) ? 15 : imm;
+    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
+}
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
+// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srai_epi32(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) == 0)) {                                \
+            ret = a;                                                         \
+        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) {              \
+            ret = vreinterpretq_m128i_s32(                                   \
+                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_s32(                                   \
+                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));                \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift packed 16-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
+FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
+    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
+}
+
+// Shift packed 32-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
+FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
+    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
+}
+
+// Shift packed 64-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
+FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~63))
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
+    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
+}
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
+#define _mm_srli_epi16(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~15)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u16(                                   \
+                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
+// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_epi32(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~31)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u32(                                   \
+                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
+#define _mm_srli_epi64(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~63)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u64(                                   \
+                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
+#define _mm_srli_si128(a, imm)                                       \
+    __extension__({                                                  \
+        int8x16_t ret;                                               \
+        if (_sse2neon_unlikely((imm) & ~15))                         \
+            ret = vdupq_n_s8(0);                                     \
+        else                                                         \
+            ret = vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), \
+                           (imm > 15 ? 0 : imm));                    \
+        vreinterpretq_m128i_s8(ret);                                 \
+    })
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
+FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
+FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
+    vst1q_f64((float64_t *) mem_addr,
+              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
+#else
+    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
+    vst1q_f32((float32_t *) mem_addr,
+              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
+FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
+#endif
+}
+
+// Store 128-bits of integer data from a into memory. mem_addr must be aligned
+// on a 16-byte boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
+FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
+#define _mm_store1_pd _mm_store_pd1
+
+// Store the upper double-precision (64-bit) floating-point element from a into
+// memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
+FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store 64-bit integer from the first element of a into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
+FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
+{
+    vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
+FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store 2 double-precision (64-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
+FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
+{
+    float32x4_t f = vreinterpretq_f32_m128d(a);
+    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
+FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
+{
+    _mm_store_pd(mem_addr, a);
+}
+
+// Store 128-bits of integer data from a into memory. mem_addr does not need to
+// be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
+FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Store 32-bit integer from the first element of a into memory. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
+FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
+{
+    vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory using a non-temporal memory hint. mem_addr must
+// be aligned on a 16-byte boundary or a general-protection exception may be
+// generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
+FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
+#elif defined(__aarch64__)
+    vst1q_f64(p, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
+#endif
+}
+
+// Store 128-bits of integer data from a into memory using a non-temporal memory
+// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
+// exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
+FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, p);
+#else
+    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
+#endif
+}
+
+// Store 32-bit integer a into memory using a non-temporal hint to minimize
+// cache pollution. If the cache line containing address mem_addr is already in
+// the cache, the cache will be updated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
+FORCE_INLINE void _mm_stream_si32(int *p, int a)
+{
+    vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
+}
+
+// Store 64-bit integer a into memory using a non-temporal hint to minimize
+// cache pollution. If the cache line containing address mem_addr is already in
+// the cache, the cache will be updated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
+FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
+{
+    vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
+}
+
+// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
+FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
+FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
+FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
+FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract packed double-precision (64-bit) floating-point elements in b from
+// packed double-precision (64-bit) floating-point elements in a, and store the
+// results in dst.
+//  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
+FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] - db[0];
+    c[1] = da[1] - db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Subtract the lower double-precision (64-bit) floating-point element in b from
+// the lower double-precision (64-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
+FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_sub_pd(a, b));
+}
+
+// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
+FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
+// using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
+FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
+// using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
+FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
+// integers in a using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
+FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
+// integers in a using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
+FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+#define _mm_ucomieq_sd _mm_comieq_sd
+#define _mm_ucomige_sd _mm_comige_sd
+#define _mm_ucomigt_sd _mm_comigt_sd
+#define _mm_ucomile_sd _mm_comile_sd
+#define _mm_ucomilt_sd _mm_comilt_sd
+#define _mm_ucomineq_sd _mm_comineq_sd
+
+// Return vector of type __m128d with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
+FORCE_INLINE __m128d _mm_undefined_pd(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128d a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Unpack and interleave 16-bit integers from the high half of a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
+FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 32-bit integers from the high half of a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
+FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 64-bit integers from the high half of a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
+FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s64(
+        vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
+#endif
+}
+
+// Unpack and interleave 8-bit integers from the high half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
+FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the high half of a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
+FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    return vreinterpretq_m128d_s64(
+        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
+                     vget_high_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Unpack and interleave 16-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
+FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 32-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
+FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 64-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
+FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s64(
+        vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
+#endif
+}
+
+// Unpack and interleave 8-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
+FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the low half of a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
+FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    return vreinterpretq_m128d_s64(
+        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
+                     vget_low_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
+FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
+FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+/* SSE3 */
+
+// Alternatively add and subtract packed double-precision (64-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
+FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
+{
+    _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
+                                             vreinterpretq_f64_m128d(b),
+                                             vreinterpretq_f64_m128d(mask)));
+#else
+    return _mm_add_pd(_mm_mul_pd(b, mask), a);
+#endif
+}
+
+// Alternatively add and subtract packed single-precision (32-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
+FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
+{
+    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
+#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
+    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
+                                            vreinterpretq_f32_m128(mask),
+                                            vreinterpretq_f32_m128(b)));
+#else
+    return _mm_add_ps(_mm_mul_ps(b, mask), a);
+#endif
+}
+
+// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
+FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[] = {da[0] + da[1], db[0] + db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of double-precision (64-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
+FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
+{
+#if defined(__aarch64__)
+    float64x2_t a = vreinterpretq_f64_m128d(_a);
+    float64x2_t b = vreinterpretq_f64_m128d(_b);
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
+#else
+    double *da = (double *) &_a;
+    double *db = (double *) &_b;
+    double c[] = {da[0] - da[1], db[0] - db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
+FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+{
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
+#else
+    float32x4x2_t c = vuzpq_f32(a, b);
+    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
+#endif
+}
+
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
+#define _mm_loaddup_pd _mm_load1_pd
+
+// Duplicate the low double-precision (64-bit) floating-point element from a,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
+FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_u64(
+        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
+#endif
+}
+
+// Duplicate odd-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
+FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
+#elif defined(_sse2neon_shuffle)
+    return vreinterpretq_m128_f32(vshuffleq_s32(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
+#else
+    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
+    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+// Duplicate even-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
+FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
+#elif defined(_sse2neon_shuffle)
+    return vreinterpretq_m128_f32(vshuffleq_s32(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
+#else
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
+    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+/* SSSE3 */
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
+FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
+{
+    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
+FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
+FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
+{
+    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
+FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
+{
+    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
+FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
+{
+    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
+FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
+{
+    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
+}
+
+// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 16 bytes in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
+#define _mm_alignr_epi8(a, b, imm)                                            \
+    __extension__({                                                           \
+        uint8x16_t _a = vreinterpretq_u8_m128i(a);                            \
+        uint8x16_t _b = vreinterpretq_u8_m128i(b);                            \
+        __m128i ret;                                                          \
+        if (_sse2neon_unlikely((imm) & ~31))                                  \
+            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));                      \
+        else if (imm >= 16)                                                   \
+            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0);                \
+        else                                                                  \
+            ret =                                                             \
+                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
+        ret;                                                                  \
+    })
+
+// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 8 bytes in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
+#define _mm_alignr_pi8(a, b, imm)                                           \
+    __extension__({                                                         \
+        __m64 ret;                                                          \
+        if (_sse2neon_unlikely((imm) >= 16)) {                              \
+            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
+        } else {                                                            \
+            uint8x8_t tmp_low, tmp_high;                                    \
+            if ((imm) >= 8) {                                               \
+                const int idx = (imm) -8;                                   \
+                tmp_low = vreinterpret_u8_m64(a);                           \
+                tmp_high = vdup_n_u8(0);                                    \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            } else {                                                        \
+                const int idx = (imm);                                      \
+                tmp_low = vreinterpret_u8_m64(b);                           \
+                tmp_high = vreinterpret_u8_m64(a);                          \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            }                                                               \
+        }                                                                   \
+        ret;                                                                \
+    })
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
+#else
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
+#endif
+}
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s32(
+        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
+
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_s64_s16(
+        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
+#endif
+}
+
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
+FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t res = vuzp_s16(a, b);
+    return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int16x8x2_t c = vuzpq_s16(a, b);
+    return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
+FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
+#else
+    int32x4x2_t c = vuzpq_s32(a, b);
+    return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
+FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t c = vuzp_s16(a, b);
+    return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
+FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
+#else
+    int32x2x2_t c = vuzp_s32(a, b);
+    return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
+// using saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
+FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int16x8x2_t c = vuzpq_s16(a, b);
+    return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
+// using saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
+FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t c = vuzp_s16(a, b);
+    return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
+// and pack the saturated results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
+FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+                             vmovl_s8(vget_low_s8(b)));
+    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
+                             vmovl_s8(vget_high_s8(b)));
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
+#else
+    // This would be much simpler if x86 would choose to zero extend OR sign
+    // extend, not both. This could probably be optimized better.
+    uint16x8_t a = vreinterpretq_u16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // Zero extend a
+    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
+    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
+    int16x8_t b_odd = vshrq_n_s16(b, 8);
+
+    // multiply
+    int16x8_t prod1 = vmulq_s16(a_even, b_even);
+    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
+#endif
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
+// pack the saturated results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
+FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
+{
+    uint16x4_t a = vreinterpret_u16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // Zero extend a
+    int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
+    int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
+    int16x4_t b_odd = vshr_n_s16(b, 8);
+
+    // multiply
+    int16x4_t prod1 = vmul_s16(a_even, b_even);
+    int16x4_t prod2 = vmul_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
+// the packed 16-bit integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
+FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
+{
+    // Has issues due to saturation
+    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
+
+    // Multiply
+    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    // Rounding narrowing shift right
+    // narrow = (int16_t)((mul + 16384) >> 15);
+    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+    // Join together
+    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Truncate each intermediate integer to the 18 most
+// significant bits, round by adding 1, and store bits [16:1] to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
+FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
+{
+    int32x4_t mul_extend =
+        vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
+
+    // Rounding narrowing shift right
+    return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
+FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
+{
+    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
+    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
+    uint8x16_t idx_masked =
+        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
+#elif defined(__GNUC__)
+    int8x16_t ret;
+    // %e and %f represent the even and odd D registers
+    // respectively.
+    __asm__ __volatile__(
+        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+        : [ret] "=&w"(ret)
+        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+    return vreinterpretq_m128i_s8(ret);
+#else
+    // use this line if testing on aarch64
+    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
+                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
+FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
+{
+    const int8x8_t controlMask =
+        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
+    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
+    return vreinterpret_m64_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
+#else
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
+    // 'a') based on ltMask
+    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x8_t res = vbicq_s16(masked, zeroMask);
+    return vreinterpretq_m128i_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
+    // 'a') based on ltMask
+    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x4_t res = vbicq_s32(masked, zeroMask);
+    return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+{
+    int8x16_t a = vreinterpretq_s8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
+#else
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
+    // based on ltMask
+    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x16_t res = vbicq_s8(masked, zeroMask);
+
+    return vreinterpretq_m128i_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+#else
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
+    // based on ltMask
+    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x4_t res = vbic_s16(masked, zeroMask);
+
+    return vreinterpret_m64_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
+    // based on ltMask
+    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x2_t res = vbic_s32(masked, zeroMask);
+
+    return vreinterpret_m64_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
+// in b is negative, and store the results in dst. Element in dst are zeroed out
+// when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
+{
+    int8x8_t a = vreinterpret_s8_m64(_a);
+    int8x8_t b = vreinterpret_s8_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
+    // based on ltMask
+    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x8_t res = vbic_s8(masked, zeroMask);
+
+    return vreinterpret_m64_s8(res);
+}
+
+/* SSE4.1 */
+
+// Blend packed 16-bit integers from a and b using control mask imm8, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
+// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
+//                                      __constrange(0,255) int imm)
+#define _mm_blend_epi16(a, b, imm)                                            \
+    __extension__({                                                           \
+        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
+        uint16x8_t _mask_vec = vld1q_u16(_mask);                              \
+        uint16x8_t _a = vreinterpretq_u16_m128i(a);                           \
+        uint16x8_t _b = vreinterpretq_u16_m128i(b);                           \
+        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));                \
+    })
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using control mask imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
+#define _mm_blend_pd(a, b, imm)                                \
+    __extension__({                                            \
+        const uint64_t _mask[2] = {                            \
+            ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),   \
+            ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)};  \
+        uint64x2_t _mask_vec = vld1q_u64(_mask);               \
+        uint64x2_t _a = vreinterpretq_u64_m128d(a);            \
+        uint64x2_t _b = vreinterpretq_u64_m128d(b);            \
+        vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
+    })
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
+FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
+{
+    const uint32_t ALIGN_STRUCT(16)
+        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+    uint32x4_t mask = vld1q_u32(data);
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Blend packed 8-bit integers from a and b using mask, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
+FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint8x16_t mask =
+        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    uint8x16_t b = vreinterpretq_u8_m128i(_b);
+    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
+}
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
+FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
+{
+    uint64x2_t mask =
+        vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
+#if defined(__aarch64__)
+    float64x2_t a = vreinterpretq_f64_m128d(_a);
+    float64x2_t b = vreinterpretq_f64_m128d(_b);
+    return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
+#else
+    uint64x2_t a = vreinterpretq_u64_m128d(_a);
+    uint64x2_t b = vreinterpretq_u64_m128d(_b);
+    return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
+#endif
+}
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
+FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint32x4_t mask =
+        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Round the packed double-precision (64-bit) floating-point elements in a up
+// to an integer value, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double *f = (double *) &a;
+    return _mm_set_pd(ceil(f[1]), ceil(f[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a up to
+// an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
+FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
+#else
+    float *f = (float *) &a;
+    return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b up to
+// an integer value, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
+FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_ceil_pd(b));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b up to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
+FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_ceil_ps(b));
+}
+
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+    // ARMv7 lacks vceqq_u64
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
+
+// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_s32(s32x4);
+}
+
+// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
+// integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
+FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_u32(
+        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
+}
+
+// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
+{
+    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
+FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_u64(
+        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
+}
+
+// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
+    return vreinterpretq_m128i_u16(u16x8);
+}
+
+// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
+FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_u32(u32x4);
+}
+
+// Zero extend packed unsigned 8-bit integers in the low 8 byte sof a to packed
+// 64-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Conditionally multiply the packed double-precision (64-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products, and
+// conditionally store the sum in dst using the low 4 bits of imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
+FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
+{
+    // Generate mask value from constant immediate bit value
+    const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
+    const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
+#if !SSE2NEON_PRECISE_DP
+    const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
+    const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
+#endif
+    // Conditional multiplication
+#if !SSE2NEON_PRECISE_DP
+    __m128d mul = _mm_mul_pd(a, b);
+    const __m128d mulMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
+    __m128d tmp = _mm_and_pd(mul, mulMask);
+#else
+#if defined(__aarch64__)
+    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
+                             : 0;
+    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
+                             : 0;
+#else
+    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
+    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
+#endif
+    __m128d tmp = _mm_set_pd(d1, d0);
+#endif
+    // Sum the products
+#if defined(__aarch64__)
+    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
+#else
+    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
+#endif
+    // Conditionally store the sum
+    const __m128d sumMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
+    __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
+    return res;
+}
+
+// Conditionally multiply the packed single-precision (32-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products,
+// and conditionally store the sum in dst using the low 4 bits of imm.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
+FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
+{
+#if defined(__aarch64__)
+    /* shortcuts */
+    if (imm == 0xFF) {
+        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
+    }
+    if (imm == 0x7F) {
+        float32x4_t m = _mm_mul_ps(a, b);
+        m[3] = 0;
+        return _mm_set1_ps(vaddvq_f32(m));
+    }
+#endif
+
+    float s = 0, c = 0;
+    float32x4_t f32a = vreinterpretq_f32_m128(a);
+    float32x4_t f32b = vreinterpretq_f32_m128(b);
+
+    /* To improve the accuracy of floating-point summation, Kahan algorithm
+     * is used for each operation.
+     */
+    if (imm & (1 << 4))
+        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
+    if (imm & (1 << 5))
+        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
+    if (imm & (1 << 6))
+        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
+    if (imm & (1 << 7))
+        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
+    s += c;
+
+    float32x4_t res = {
+        (imm & 0x1) ? s : 0,
+        (imm & 0x2) ? s : 0,
+        (imm & 0x4) ? s : 0,
+        (imm & 0x8) ? s : 0,
+    };
+    return vreinterpretq_m128_f32(res);
+}
+
+// Extract a 32-bit integer from a, selected with imm8, and store the result in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
+// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
+#define _mm_extract_epi32(a, imm) \
+    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
+
+// Extract a 64-bit integer from a, selected with imm8, and store the result in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
+// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
+#define _mm_extract_epi64(a, imm) \
+    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
+
+// Extract an 8-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
+// __constrange(0,16) int imm)
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
+#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
+
+// Extracts the selected single-precision (32-bit) floating-point from a.
+// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
+#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
+
+// Round the packed double-precision (64-bit) floating-point elements in a down
+// to an integer value, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
+FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double *f = (double *) &a;
+    return _mm_set_pd(floor(f[1]), floor(f[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a down
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
+FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
+#else
+    float *f = (float *) &a;
+    return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b down to
+// an integer value, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
+FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_floor_pd(b));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b down to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
+FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_floor_ps(b));
+}
+
+// Copy a to dst, and insert the 32-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
+// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
+//                                       __constrange(0,4) int imm)
+#define _mm_insert_epi32(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s32(                                     \
+            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
+    })
+
+// Copy a to dst, and insert the 64-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
+// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
+//                                       __constrange(0,2) int imm)
+#define _mm_insert_epi64(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s64(                                     \
+            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
+    })
+
+// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
+// location specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
+// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
+//                                      __constrange(0,16) int imm)
+#define _mm_insert_epi8(a, b, imm)                                 \
+    __extension__({                                                \
+        vreinterpretq_m128i_s8(                                    \
+            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
+    })
+
+// Copy a to tmp, then insert a single-precision (32-bit) floating-point
+// element from b into tmp using the control in imm8. Store tmp to dst using
+// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
+#define _mm_insert_ps(a, b, imm8)                                              \
+    __extension__({                                                            \
+        float32x4_t tmp1 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3),               \
+                           vreinterpretq_f32_m128(a), 0);                      \
+        float32x4_t tmp2 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
+                           ((imm8 >> 4) & 0x3));                               \
+        const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 1)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 2)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 3)) ? UINT32_MAX : 0};       \
+        uint32x4_t mask = vld1q_u32(data);                                     \
+        float32x4_t all_zeros = vdupq_n_f32(0);                                \
+                                                                               \
+        vreinterpretq_m128_f32(                                                \
+            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2)));         \
+    })
+
+// Compare packed signed 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
+FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
+FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
+FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
+FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
+// in a, store the minimum and index in dst, and zero the remaining bits in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
+FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
+{
+    __m128i dst;
+    uint16_t min, idx = 0;
+#if defined(__aarch64__)
+    // Find the minimum value
+    min = vminvq_u16(vreinterpretq_u16_m128i(a));
+
+    // Get the index of the minimum value
+    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint16x8_t minv = vdupq_n_u16(min);
+    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
+    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
+#else
+    // Find the minimum value
+    __m64 tmp;
+    tmp = vreinterpret_m64_u16(
+        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
+                 vget_high_u16(vreinterpretq_u16_m128i(a))));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
+    // Get the index of the minimum value
+    int i;
+    for (i = 0; i < 8; i++) {
+        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
+            idx = (uint16_t) i;
+            break;
+        }
+        a = _mm_srli_si128(a, 2);
+    }
+#endif
+    // Generate result
+    dst = _mm_setzero_si128();
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
+    return dst;
+}
+
+// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
+// 8-bit integers in a compared to those in b, and store the 16-bit results in
+// dst. Eight SADs are performed using one quadruplet from b and eight
+// quadruplets from a. One quadruplet is selected from b starting at on the
+// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
+// integers selected from a starting at the offset specified in imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
+FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
+{
+    uint8x16_t _a, _b;
+
+    switch (imm & 0x4) {
+    case 0:
+        // do nothing
+        _a = vreinterpretq_u8_m128i(a);
+        break;
+    case 4:
+        _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
+                                            vreinterpretq_u32_m128i(a), 1));
+        break;
+    default:
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_unreachable();
+#endif
+        break;
+    }
+
+    switch (imm & 0x3) {
+    case 0:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
+        break;
+    case 1:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
+        break;
+    case 2:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
+        break;
+    case 3:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
+        break;
+    default:
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_unreachable();
+#endif
+        break;
+    }
+
+    int16x8_t c04, c15, c26, c37;
+    uint8x8_t low_b = vget_low_u8(_b);
+    c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
+    uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
+    c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
+    uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
+    c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
+    uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
+    c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
+#if defined(__aarch64__)
+    // |0|4|2|6|
+    c04 = vpaddq_s16(c04, c26);
+    // |1|5|3|7|
+    c15 = vpaddq_s16(c15, c37);
+
+    int32x4_t trn1_c =
+        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
+    int32x4_t trn2_c =
+        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
+    return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
+                                              vreinterpretq_s16_s32(trn2_c)));
+#else
+    int16x4_t c01, c23, c45, c67;
+    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
+    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
+    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
+    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
+
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
+#endif
+}
+
+// Multiply the low signed 32-bit integers from each packed 64-bit element in
+// a and b, and store the signed 64-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
+{
+    // vmull_s32 upcasts instead of masking, so we downcast.
+    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
+    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
+}
+
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
+// integers, and store the low 32 bits of the intermediate integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
+FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
+// using unsigned saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Round the packed double-precision (64-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
+FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
+{
+#if defined(__aarch64__)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return _mm_floor_pd(a);
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return _mm_ceil_pd(a);
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
+    }
+#else
+    double *v_double = (double *) &a;
+
+    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
+        (rounding == _MM_FROUND_CUR_DIRECTION &&
+         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
+        double res[2], tmp;
+        for (int i = 0; i < 2; i++) {
+            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
+            double roundDown = floor(tmp);  // Round down value
+            double roundUp = ceil(tmp);     // Round up value
+            double diffDown = tmp - roundDown;
+            double diffUp = roundUp - tmp;
+            if (diffDown < diffUp) {
+                /* If it's closer to the round down value, then use it */
+                res[i] = roundDown;
+            } else if (diffDown > diffUp) {
+                /* If it's closer to the round up value, then use it */
+                res[i] = roundUp;
+            } else {
+                /* If it's equidistant between round up and round down value,
+                 * pick the one which is an even number */
+                double half = roundDown / 2;
+                if (half != floor(half)) {
+                    /* If the round down value is odd, return the round up value
+                     */
+                    res[i] = roundUp;
+                } else {
+                    /* If the round up value is odd, return the round down value
+                     */
+                    res[i] = roundDown;
+                }
+            }
+            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
+        }
+        return _mm_set_pd(res[1], res[0]);
+    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
+        return _mm_floor_pd(a);
+    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
+        return _mm_ceil_pd(a);
+    }
+    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
+                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed single-precision
+// floating-point elements in dst.
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
+FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return _mm_floor_ps(a);
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return _mm_ceil_ps(a);
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
+    }
+#else
+    float *v_float = (float *) &a;
+
+    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
+        (rounding == _MM_FROUND_CUR_DIRECTION &&
+         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
+        uint32x4_t signmask = vdupq_n_u32(0x80000000);
+        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
+        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+        int32x4_t r_trunc = vcvtq_s32_f32(
+            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+        float32x4_t delta = vsubq_f32(
+            vreinterpretq_f32_m128(a),
+            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+        uint32x4_t is_delta_half =
+            vceqq_f32(delta, half); /* delta == +/- 0.5 */
+        return vreinterpretq_m128_f32(
+            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
+    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
+        return _mm_floor_ps(a);
+    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
+        return _mm_ceil_ps(a);
+    }
+    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
+                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
+                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
+                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b using
+// the rounding parameter, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
+FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
+{
+    return _mm_move_sd(a, _mm_round_pd(b, rounding));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b using
+// the rounding parameter, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst. Rounding is done according to the
+// rounding[3:0] parameter, which can be one of:
+//     (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
+//     suppress exceptions
+//     (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and
+//     suppress exceptions
+//     (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress
+//     exceptions
+//     (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress
+//     exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
+//     _MM_SET_ROUNDING_MODE
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
+FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
+{
+    return _mm_move_ss(a, _mm_round_ps(b, rounding));
+}
+
+// Load 128-bits of integer data from memory into dst using a non-temporal
+// memory hint. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
+FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    return __builtin_nontemporal_load(p);
+#else
+    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
+#endif
+}
+
+// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
+// all 1's, and return 1 if the result is zero, otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
+FORCE_INLINE int _mm_test_all_ones(__m128i a)
+{
+    return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
+           ~(uint64_t) 0;
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and return 1 if the result is zero, otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
+FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
+{
+    int64x2_t a_and_mask =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
+    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
+// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
+// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
+// otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
+FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
+{
+    uint64x2_t zf =
+        vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
+    uint64x2_t cf =
+        vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
+    uint64x2_t result = vandq_u64(zf, cf);
+    return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the CF value.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
+FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
+{
+    int64x2_t s64 =
+        vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
+// otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
+#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the ZF value.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
+FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
+{
+    int64x2_t s64 =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+/* SSE4.2 */
+
+const static uint16_t _sse2neon_cmpestr_mask16b[8] ALIGN_STRUCT(16) = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+};
+const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+};
+
+/* specify the source data format */
+#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
+#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
+#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
+#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
+
+/* specify the comparison operation */
+#define _SIDD_CMP_EQUAL_ANY 0x00     /* compare equal any: strchr */
+#define _SIDD_CMP_RANGES 0x04        /* compare ranges */
+#define _SIDD_CMP_EQUAL_EACH 0x08    /* compare equal each: strcmp */
+#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
+
+/* specify the polarity */
+#define _SIDD_POSITIVE_POLARITY 0x00
+#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
+#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
+#define _SIDD_MASKED_NEGATIVE_POLARITY \
+    0x30 /* negate results only before end of string */
+
+/* specify the output selection in _mm_cmpXstri */
+#define _SIDD_LEAST_SIGNIFICANT 0x00
+#define _SIDD_MOST_SIGNIFICANT 0x40
+
+/* specify the output selection in _mm_cmpXstrm */
+#define _SIDD_BIT_MASK 0x00
+#define _SIDD_UNIT_MASK 0x40
+
+/* Pattern Matching for C macros.
+ * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
+ */
+
+/* catenate */
+#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
+#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
+
+#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
+/* run the 2nd parameter */
+#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
+/* run the 1st parameter */
+#define SSE2NEON_IIF_1(t, ...) t
+
+#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
+#define SSE2NEON_COMPL_0 1
+#define SSE2NEON_COMPL_1 0
+
+#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
+#define SSE2NEON_DEC_1 0
+#define SSE2NEON_DEC_2 1
+#define SSE2NEON_DEC_3 2
+#define SSE2NEON_DEC_4 3
+#define SSE2NEON_DEC_5 4
+#define SSE2NEON_DEC_6 5
+#define SSE2NEON_DEC_7 6
+#define SSE2NEON_DEC_8 7
+#define SSE2NEON_DEC_9 8
+#define SSE2NEON_DEC_10 9
+#define SSE2NEON_DEC_11 10
+#define SSE2NEON_DEC_12 11
+#define SSE2NEON_DEC_13 12
+#define SSE2NEON_DEC_14 13
+#define SSE2NEON_DEC_15 14
+#define SSE2NEON_DEC_16 15
+
+/* detection */
+#define SSE2NEON_CHECK_N(x, n, ...) n
+#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
+#define SSE2NEON_PROBE(x) x, 1,
+
+#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
+#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
+
+#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
+#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
+
+#define SSE2NEON_EAT(...)
+#define SSE2NEON_EXPAND(...) __VA_ARGS__
+#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
+
+/* recursion */
+/* deferred expression */
+#define SSE2NEON_EMPTY()
+#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
+#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
+#define SSE2NEON_EXPAND(...) __VA_ARGS__
+
+#define SSE2NEON_EVAL(...) \
+    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
+#define SSE2NEON_EVAL1(...) \
+    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
+#define SSE2NEON_EVAL2(...) \
+    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
+#define SSE2NEON_EVAL3(...) __VA_ARGS__
+
+#define SSE2NEON_REPEAT(count, macro, ...)                         \
+    SSE2NEON_WHEN(count)                                           \
+    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()(                \
+        SSE2NEON_DEC(count), macro,                                \
+        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
+                                              __VA_ARGS__))
+#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
+
+#define SSE2NEON_SIZE_OF_byte 8
+#define SSE2NEON_NUMBER_OF_LANES_byte 16
+#define SSE2NEON_SIZE_OF_word 16
+#define SSE2NEON_NUMBER_OF_LANES_word 8
+
+#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type)                         \
+    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type(                          \
+        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
+        vreinterpretq_##type##_m128i(a)));
+
+#define SSE2NEON_FILL_LANE(i, type) \
+    vec_b[i] =                      \
+        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
+
+#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size,        \
+                       number_of_lanes, byte_or_word)                         \
+    do {                                                                      \
+        SSE2NEON_CAT(                                                         \
+            data_type_prefix,                                                 \
+            SSE2NEON_CAT(size,                                                \
+                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
+        vec_b[number_of_lanes];                                               \
+        __m128i mask = SSE2NEON_IIF(byte_or_word)(                            \
+            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)),                       \
+            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff)));                    \
+        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE,    \
+                                      SSE2NEON_CAT(type_prefix, size)))       \
+        for (int i = 0; i < number_of_lanes; i++) {                           \
+            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u,                      \
+                                  size)(SSE2NEON_CAT(vbslq_u, size)(          \
+                SSE2NEON_CAT(vreinterpretq_u,                                 \
+                             SSE2NEON_CAT(size, _m128i))(mask),               \
+                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))(        \
+                    vec_b[i],                                                 \
+                    SSE2NEON_CAT(                                             \
+                        vreinterpretq_,                                       \
+                        SSE2NEON_CAT(type_prefix,                             \
+                                     SSE2NEON_CAT(size, _m128i(a))))),        \
+                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))(        \
+                    vec_b[i],                                                 \
+                    SSE2NEON_CAT(                                             \
+                        vreinterpretq_,                                       \
+                        SSE2NEON_CAT(type_prefix,                             \
+                                     SSE2NEON_CAT(size, _m128i(a)))))));      \
+        }                                                                     \
+    } while (0)
+
+#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes)                         \
+    do {                                                                     \
+        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes,                       \
+                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
+                                      SSE2NEON_CAT(u, size)))                \
+    } while (0)
+
+#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
+    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
+                                                int lb)                       \
+    {                                                                         \
+        __m128i mtx[16];                                                      \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
+        return SSE2NEON_CAT(                                                  \
+            _sse2neon_aggregate_equal_any_,                                   \
+            SSE2NEON_CAT(                                                     \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
+                                             type))))(la, lb, mtx);           \
+    }
+
+#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
+    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
+                                                 int lb)                       \
+    {                                                                          \
+        __m128i mtx[16];                                                       \
+        PCMPSTR_RANGES(                                                        \
+            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
+        return SSE2NEON_CAT(                                                   \
+            _sse2neon_aggregate_ranges_,                                       \
+            SSE2NEON_CAT(                                                      \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
+                                             type))))(la, lb, mtx);            \
+    }
+
+#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                  \
+    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,         \
+                                                    __m128i b, int lb)         \
+    {                                                                          \
+        __m128i mtx[16];                                                       \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),           \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));             \
+        return SSE2NEON_CAT(                                                   \
+            _sse2neon_aggregate_equal_ordered_,                                \
+            SSE2NEON_CAT(                                                      \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
+                SSE2NEON_CAT(x,                                                \
+                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);       \
+    }
+
+static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u8(
+            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u8(
+            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
+        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
+        res |= (tmp << j);
+    }
+    return res;
+}
+
+static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint16x8_t vec =
+        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u16(
+            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
+        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
+        res |= (tmp << j);
+    }
+    return res;
+}
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
+    prefix##IMPL(byte) \
+    prefix##IMPL(word)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
+
+static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint16x8_t vec =
+        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u16(
+            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
+        __m128i tmp = vreinterpretq_m128i_u32(
+            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
+        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
+                                       vreinterpretq_u32_m128i(tmp));
+#if defined(__aarch64__)
+        int t = vaddvq_u32(vec_res) ? 1 : 0;
+#else
+        uint64x2_t sumh = vpaddlq_u32(vec_res);
+        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+        res |= (t << j);
+    }
+    return res;
+}
+
+static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u8(
+            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u8(
+            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
+        __m128i tmp = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
+        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
+                                       vreinterpretq_u16_m128i(tmp));
+        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
+        res |= (t << j);
+    }
+    return res;
+}
+
+#define SSE2NEON_CMP_RANGES_IS_BYTE 1
+#define SSE2NEON_CMP_RANGES_IS_WORD 0
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_RANGES(prefix)             \
+    prefix##IMPL(byte, uint, u, prefix##IS_BYTE)         \
+    prefix##IMPL(byte, int, s, prefix##IS_BYTE)          \
+    prefix##IMPL(word, uint, u, prefix##IS_WORD)         \
+    prefix##IMPL(word, int, s, prefix##IS_WORD)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
+
+#undef SSE2NEON_CMP_RANGES_IS_BYTE
+#undef SSE2NEON_CMP_RANGES_IS_WORD
+
+static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
+{
+    uint8x16_t mtx =
+        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
+    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    int m1 = 0x10000 - (1 << la);
+    int tb = 0x10000 - (1 << lb);
+    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
+    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
+    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
+    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
+    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
+    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
+    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
+    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
+
+    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
+    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
+    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
+    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
+    res_lo = vand_u8(res_lo, vec_mask);
+    res_hi = vand_u8(res_hi, vec_mask);
+
+    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
+    return res;
+}
+
+static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
+{
+    uint16x8_t mtx =
+        vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    int m1 = 0x100 - (1 << la);
+    int tb = 0x100 - (1 << lb);
+    uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
+    uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
+    uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
+    uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
+    mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
+    mtx = vbslq_u16(vec1, tmp, mtx);
+    mtx = vandq_u16(mtx, vec_mask);
+    return _sse2neon_vaddvq_u16(mtx);
+}
+
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
+
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
+    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
+        int bound, int la, int lb, __m128i mtx[16])                            \
+    {                                                                          \
+        int res = 0;                                                           \
+        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
+        uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
+            vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
+            vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
+        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
+            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
+                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
+            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
+        uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
+        uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
+        for (int j = 0; j < lb; j++) {                                         \
+            mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size(                \
+                vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j])));   \
+        }                                                                      \
+        for (int j = lb; j < bound; j++) {                                     \
+            mtx[j] = vreinterpretq_m128i_u##size(                              \
+                vbslq_u##size(vec1, vec_minusone, vec_zero));                  \
+        }                                                                      \
+        unsigned SSE2NEON_IIF(data_type)(char, short) *ptr =                   \
+            (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx;             \
+        for (int i = 0; i < bound; i++) {                                      \
+            int val = 1;                                                       \
+            for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
+                val &= ptr[k * bound + j];                                     \
+            res += val << i;                                                   \
+        }                                                                      \
+        return res;                                                            \
+    }
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
+    prefix##IMPL(8, 16, prefix##IS_UBYTE)               \
+    prefix##IMPL(16, 8, prefix##IS_UWORD)
+/* clang-format on */
+
+SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
+
+#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
+#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
+    prefix##IMPL(byte)                              \
+    prefix##IMPL(word)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
+
+#define SSE2NEON_CMPESTR_LIST                          \
+    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
+    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any)         \
+    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
+    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any)         \
+    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges)              \
+    _(CMP_UWORD_RANGES, cmp_uword_ranges)              \
+    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges)              \
+    _(CMP_SWORD_RANGES, cmp_sword_ranges)              \
+    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
+    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each)       \
+    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
+    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each)       \
+    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
+    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
+
+enum {
+#define _(name, func_suffix) name,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
+static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
+#define _(name, func_suffix) _sse2neon_##func_suffix,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+
+FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
+{
+    switch (imm8 & 0x30) {
+    case _SIDD_NEGATIVE_POLARITY:
+        res ^= 0xffffffff;
+        break;
+    case _SIDD_MASKED_NEGATIVE_POLARITY:
+        res ^= (1 << lb) - 1;
+        break;
+    default:
+        break;
+    }
+
+    return res & ((bound == 8) ? 0xFF : 0xFFFF);
+}
+
+FORCE_INLINE int _sse2neon_clz(unsigned int x)
+{
+#if _MSC_VER
+    DWORD cnt = 0;
+    if (_BitScanForward(&cnt, x))
+        return cnt;
+    return 32;
+#else
+    return x != 0 ? __builtin_clz(x) : 32;
+#endif
+}
+
+FORCE_INLINE int _sse2neon_ctz(unsigned int x)
+{
+#if _MSC_VER
+    DWORD cnt = 0;
+    if (_BitScanReverse(&cnt, x))
+        return 31 - cnt;
+    return 32;
+#else
+    return x != 0 ? __builtin_ctz(x) : 32;
+#endif
+}
+
+FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
+{
+#if _MSC_VER
+    unsigned long cnt;
+#ifdef defined(SSE2NEON_HAS_BITSCAN64)
+    (defined(_M_AMD64) || defined(__x86_64__))
+        if((_BitScanForward64(&cnt, x))
+            return (int)(cnt);
+#else
+    if (_BitScanForward(&cnt, (unsigned long) (x)))
+        return (int) cnt;
+    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
+        return (int) (cnt + 32);
+#endif
+    return 64;
+#else
+    return x != 0 ? __builtin_ctzll(x) : 64;
+#endif
+}
+
+#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
+
+#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
+    const int var = (imm & 0x01) ? 8 : 16
+
+#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
+    int tmp1 = la ^ (la >> 31);                  \
+    la = tmp1 - (la >> 31);                      \
+    int tmp2 = lb ^ (lb >> 31);                  \
+    lb = tmp2 - (lb >> 31);                      \
+    la = SSE2NEON_MIN(la, bound);                \
+    lb = SSE2NEON_MIN(lb, bound)
+
+// Compare all pairs of character in string a and b,
+// then aggregate the result.
+// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
+// length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
+// string a and b.
+#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
+    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
+    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
+    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
+
+#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
+    return (r2 == 0) ? bound                                     \
+                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
+                                      : _sse2neon_ctz(r2))
+
+#define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
+    __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
+    if (imm8 & 0x40) {                                                         \
+        if (bound == 8) {                                                      \
+            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
+                                       vld1q_u16(_sse2neon_cmpestr_mask16b));  \
+            dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
+                tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
+        } else {                                                               \
+            uint8x16_t vec_r2 =                                                \
+                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
+            uint8x16_t tmp =                                                   \
+                vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
+            dst = vreinterpretq_m128i_u8(                                      \
+                vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst)));   \
+        }                                                                      \
+    } else {                                                                   \
+        if (bound == 16) {                                                     \
+            dst = vreinterpretq_m128i_u16(                                     \
+                vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
+        } else {                                                               \
+            dst = vreinterpretq_m128i_u8(                                      \
+                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
+        }                                                                      \
+    }                                                                          \
+    return dst
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and returns 1 if b did not contain a null character and the
+// resulting mask was zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
+FORCE_INLINE int _mm_cmpestra(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    int lb_cpy = lb;
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return !r2 & (lb_cpy > bound);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
+FORCE_INLINE int _mm_cmpestrc(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return r2 != 0;
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and store the generated index in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
+FORCE_INLINE int _mm_cmpestri(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and store the generated mask in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
+FORCE_INLINE __m128i
+_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns bit 0 of the resulting bit mask.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
+FORCE_INLINE int _mm_cmpestro(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return r2 & 1;
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns 1 if any character in a was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
+FORCE_INLINE int _mm_cmpestrs(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    return la <= (bound - 1);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns 1 if any character in b was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
+FORCE_INLINE int _mm_cmpestrz(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    return lb <= (bound - 1);
+}
+
+#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
+    do {                                                                 \
+        if (imm8 & 0x01) {                                               \
+            uint16x8_t equal_mask_##str =                                \
+                vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
+            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
+            uint64_t matches_##str =                                     \
+                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
+            len = _sse2neon_ctzll(matches_##str) >> 3;                   \
+        } else {                                                         \
+            uint16x8_t equal_mask_##str = vreinterpretq_u16_u8(          \
+                vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0)));   \
+            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
+            uint64_t matches_##str =                                     \
+                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
+            len = _sse2neon_ctzll(matches_##str) >> 2;                   \
+        }                                                                \
+    } while (0)
+
+#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
+    int la, lb;                                  \
+    do {                                         \
+        SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);   \
+        SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);   \
+    } while (0)
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if b did not contain a null character and the resulting
+// mask was zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
+FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return !r2 & (lb >= bound);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
+FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return r2 != 0;
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and store the generated index in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
+FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and store the generated mask in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
+FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns bit 0 of the resulting bit mask.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
+FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return r2 & 1;
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if any character in a was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
+FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    int la;
+    SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
+    return la <= (bound - 1);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if any character in b was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
+FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    int lb;
+    SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
+    return lb <= (bound - 1);
+}
+
+// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
+// in b for greater than.
+FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    return vreinterpretq_m128i_s64(vshrq_n_s64(
+        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
+        63));
+#endif
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 16-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
+FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32ch(crc, v);
+#else
+    crc = _mm_crc32_u8(crc, v & 0xff);
+    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 32-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
+FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32cw(crc, v);
+#else
+    crc = _mm_crc32_u16(crc, v & 0xffff);
+    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 64-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
+FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#else
+    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32cb(crc, v);
+#else
+    crc ^= v;
+    for (int bit = 0; bit < 8; bit++) {
+        if (crc & 1)
+            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
+        else
+            crc = (crc >> 1);
+    }
+#endif
+    return crc;
+}
+
+/* AES */
+
+#if !defined(__ARM_FEATURE_CRYPTO)
+/* clang-format off */
+#define SSE2NEON_AES_SBOX(w)                                           \
+    {                                                                  \
+        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
+        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
+        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
+        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
+        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
+        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
+        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
+        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
+        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
+        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
+        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
+        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
+        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
+        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
+        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
+        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
+        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
+        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
+        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
+        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
+        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
+        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
+        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
+        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
+        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
+        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
+        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
+        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
+        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
+        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
+        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
+        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
+        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
+        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
+        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
+        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
+        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
+    }
+#define SSE2NEON_AES_RSBOX(w)                                          \
+    {                                                                  \
+        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
+        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
+        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
+        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
+        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
+        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
+        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
+        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
+        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
+        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
+        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
+        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
+        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
+        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
+        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
+        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
+        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
+        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
+        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
+        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
+        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
+        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
+        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
+        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
+        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
+        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
+        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
+        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
+        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
+        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
+        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
+        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
+        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
+        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
+        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
+        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
+        w(0x55), w(0x21), w(0x0c), w(0x7d)                             \
+    }
+/* clang-format on */
+
+/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
+#define SSE2NEON_AES_H0(x) (x)
+static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
+static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
+#undef SSE2NEON_AES_H0
+
+/* x_time function and matrix multiply function */
+#if !defined(__aarch64__)
+#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
+#define SSE2NEON_MULTIPLY(x, y)                                  \
+    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
+     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
+     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
+     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
+#endif
+
+// In the absence of crypto extensions, implement aesenc using regular NEON
+// intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
+// for more information.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t shift_rows[] = {
+        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+    };
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    /* shift rows */
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    /* sub bytes */
+    // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
+    // look up each of the table. After each lookup, we load the next table
+    // which locates at the next 64-bytes. In the meantime, the index in the
+    // table would be smaller than it was, so the index parameters of
+    // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
+    // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
+
+    /* mix columns */
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    /* add round key */
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A implementation for a table-based AES */
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
+    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
+     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
+// muliplying 'x' by 2 in GF(2^8)
+#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
+// muliplying 'x' by 3 in GF(2^8)
+#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
+#define SSE2NEON_AES_U0(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
+#define SSE2NEON_AES_U1(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
+#define SSE2NEON_AES_U2(p) \
+    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
+#define SSE2NEON_AES_U3(p) \
+    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
+
+    // this generates a table containing every possible permutation of
+    // shift_rows() and sub_bytes() with mix_columns().
+    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
+    };
+#undef SSE2NEON_AES_B2W
+#undef SSE2NEON_AES_F2
+#undef SSE2NEON_AES_F3
+#undef SSE2NEON_AES_U0
+#undef SSE2NEON_AES_U1
+#undef SSE2NEON_AES_U2
+#undef SSE2NEON_AES_U3
+
+    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
+    uint32_t x1 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
+    uint32_t x2 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
+    uint32_t x3 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]
+
+    // finish the modulo addition step in mix_columns()
+    __m128i out = _mm_set_epi32(
+        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
+         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
+        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
+         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
+        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
+         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
+        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
+         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
+
+    return _mm_xor_si128(out, RoundKey);
+#endif
+}
+
+// Perform one round of an AES decryption flow on data (state) in a using the
+// round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
+FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t inv_shift_rows[] = {
+        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
+        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
+    };
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // inverse shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
+
+    // inverse sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
+
+    // inverse mix columns
+    // muliplying 'v' by 4 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
+    v ^= w;
+    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
+
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
+                                 0x1b);  // muliplying 'v' by 2 in GF(2^8)
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    // add round key
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+    /* FIXME: optimized for NEON */
+    uint8_t i, e, f, g, h, v[4][4];
+    uint8_t *_a = (uint8_t *) &a;
+    for (i = 0; i < 16; ++i) {
+        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
+    }
+
+    // inverse mix columns
+    for (i = 0; i < 4; ++i) {
+        e = v[i][0];
+        f = v[i][1];
+        g = v[i][2];
+        h = v[i][3];
+
+        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
+                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
+        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
+                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
+        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
+                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
+        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
+                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+#endif
+}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t shift_rows[] = {
+        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    // sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
+
+    // add round key
+    return vreinterpretq_m128i_u8(v) ^ RoundKey;
+
+#else /* ARMv7-A implementation */
+    uint8_t v[16] = {
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
+    };
+
+    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
+#endif
+}
+
+// Perform the last round of an AES decryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t inv_shift_rows[] = {
+        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
+        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // inverse shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
+
+    // inverse sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
+
+    // add round key
+    return vreinterpretq_m128i_u8(v) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+    /* FIXME: optimized for NEON */
+    uint8_t v[4][4];
+    uint8_t *_a = (uint8_t *) &a;
+    for (int i = 0; i < 16; ++i) {
+        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+#endif
+}
+
+// Perform the InvMixColumns transformation on a and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
+{
+#if defined(__aarch64__)
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+    uint8x16_t v = vreinterpretq_u8_m128i(a);
+    uint8x16_t w;
+
+    // multiplying 'v' by 4 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
+    v ^= w;
+    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
+
+    // multiplying 'v' by 2 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+    return vreinterpretq_m128i_u8(w);
+
+#else /* ARMv7-A NEON implementation */
+    uint8_t i, e, f, g, h, v[4][4];
+    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
+    for (i = 0; i < 4; ++i) {
+        e = v[i][0];
+        f = v[i][1];
+        g = v[i][2];
+        h = v[i][3];
+
+        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
+                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
+        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
+                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
+        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
+                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
+        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
+                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
+#endif
+}
+
+// Assist in expanding the AES cipher key by computing steps towards generating
+// a round key for encryption cipher using data from a and an 8-bit round
+// constant specified in imm8, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
+//
+// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
+// This instruction generates a round key for AES encryption. See
+// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
+// for details.
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+#if defined(__aarch64__)
+    uint8x16_t _a = vreinterpretq_u8_m128i(a);
+    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
+
+    uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
+    uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
+    uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
+
+    return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
+
+#else /* ARMv7-A NEON implementation */
+    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
+    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
+    for (int i = 0; i < 4; ++i) {
+        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
+        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
+    }
+    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
+                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
+#endif
+}
+#undef SSE2NEON_AES_SBOX
+#undef SSE2NEON_AES_RSBOX
+
+#if defined(__aarch64__)
+#undef SSE2NEON_XT
+#undef SSE2NEON_MULTIPLY
+#endif
+
+#else /* __ARM_FEATURE_CRYPTO */
+// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
+// AESMC and then manually applying the real key as an xor operation. This
+// unfortunately means an additional xor op; the compiler should be able to
+// optimize this away for repeated calls however. See
+// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
+// for more details.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
+        vreinterpretq_u8_m128i(b));
+}
+
+// Perform one round of an AES decryption flow on data (state) in a using the
+// round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
+FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
+{
+    return vreinterpretq_m128i_u8(veorq_u8(
+        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+        vreinterpretq_u8_m128i(RoundKey)));
+}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
+                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+                         RoundKey);
+}
+
+// Perform the last round of an AES decryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
+{
+    return vreinterpretq_m128i_u8(
+        vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^
+        vreinterpretq_u8_m128i(RoundKey));
+}
+
+// Perform the InvMixColumns transformation on a and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
+{
+    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
+}
+
+// Assist in expanding the AES cipher key by computing steps towards generating
+// a round key for encryption cipher using data from a and an 8-bit round
+// constant specified in imm8, and store the result in dst."
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+    // AESE does ShiftRows and SubBytes on A
+    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
+
+    uint8x16_t dest = {
+        // Undo ShiftRows step from AESE and extract X1 and X3
+        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
+        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
+        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
+        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
+    };
+    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
+    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
+}
+#endif
+
+/* Others */
+
+// Perform a carry-less multiplication of two 64-bit integers, selected from a
+// and b according to imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
+FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
+{
+    uint64x2_t a = vreinterpretq_u64_m128i(_a);
+    uint64x2_t b = vreinterpretq_u64_m128i(_b);
+    switch (imm & 0x11) {
+    case 0x00:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
+    case 0x01:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
+    case 0x10:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
+    case 0x11:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
+    default:
+        abort();
+    }
+}
+
+FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
+}
+
+// Count the number of bits set to 1 in unsigned 32-bit integer a, and
+// return that count in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
+FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcount)
+    return __builtin_popcount(a);
+#else
+    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
+#endif
+#else
+    uint32_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+
+    vst1_u32(&count, count32x2_val);
+    return count;
+#endif
+}
+
+// Count the number of bits set to 1 in unsigned 64-bit integer a, and
+// return that count in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
+FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcountll)
+    return __builtin_popcountll(a);
+#else
+    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
+#endif
+#else
+    uint64_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+    uint64x1_t count64x1_val;
+
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+    count64x1_val = vpaddl_u32(count32x2_val);
+    vst1_u64(&count, count64x1_val);
+    return count;
+#endif
+}
+
+FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
+{
+    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+    // regardless of the value of the FZ bit.
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+// Return the current 64-bit value of the processor's time-stamp counter.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
+FORCE_INLINE uint64_t _rdtsc(void)
+{
+#if defined(__aarch64__)
+    uint64_t val;
+
+    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
+     * system counter is at least 56 bits wide; from Armv8.6, the counter
+     * must be 64 bits wide.  So the system counter could be less than 64
+     * bits wide and it is attributed with the flag 'cap_user_time_short'
+     * is true.
+     */
+    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
+
+    return val;
+#else
+    uint32_t pmccntr, pmuseren, pmcntenset;
+    // Read the user mode Performance Monitoring Unit (PMU)
+    // User Enable Register (PMUSERENR) access permissions.
+    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
+    if (pmuseren & 1) {  // Allows reading PMUSERENR for user mode code.
+        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
+        if (pmcntenset & 0x80000000UL) {  // Is it counting?
+            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
+            // The counter is set up to count every 64th cycle
+            return (uint64_t) (pmccntr) << 6;
+        }
+    }
+
+    // Fallback to syscall as we can't enable PMUSERENR in user mode.
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
+#endif
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma pop_macro("ALIGN_STRUCT")
+#pragma pop_macro("FORCE_INLINE")
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
+
+#endif
\ No newline at end of file
diff --git a/share/cmake/utils/CheckSupportARMNeon.cmake b/share/cmake/utils/CheckSupportARMNeon.cmake
new file mode 100644
index 0000000000..1123e75f52
--- /dev/null
+++ b/share/cmake/utils/CheckSupportARMNeon.cmake
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+include(CheckCXXSourceCompiles)
+
+set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
+set(CMAKE_REQUIRED_FLAGS "-march=armv8-a+fp+simd+crypto+crc")
+
+check_cxx_source_compiles ("
+    #include <arm_neon.h>
+    int main()
+    {
+        float32x4_t v = vdupq_n_f32(0);
+        return 0;
+    }"
+    HAVE_NEON)
+
+set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
+unset(_cmake_required_flags_old)
+
+mark_as_advanced(HAVE_NEON)
diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index f30bbb763c..5abd707cdb 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -4,6 +4,7 @@
 include(CheckCXXSourceCompiles)
 
 set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
+set(_cmake_required_libraries_old "${CMAKE_REQUIRED_LIBRARIES}")
 
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger
@@ -16,20 +17,30 @@ if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     endif()
 endif()
 
-check_cxx_source_compiles ("
-    #include <emmintrin.h>
-    int main ()
-    {
-        __m128d a, b;
-        double vals[2] = {0};
-        a = _mm_loadu_pd (vals);
-        b = _mm_add_pd (a,a);
-        _mm_storeu_pd (vals,b);
-        return (0);
-    }"
-    HAVE_SSE2)
+set(_SSE2_HEADER "#include <emmintrin.h>")
+if (HAVE_NEON)
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -march=armv8-a+fp+simd+crypto+crc")
+    set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+    set(_SSE2_HEADER "#include <sse2neon.h>")
+endif()
+
+set(_SSE2_TEST_SOURCE_CODE "
+${_SSE2_HEADER}
+int main ()
+{
+    __m128d a, b;
+    double vals[2] = {0};
+    a = _mm_loadu_pd (vals);
+    b = _mm_add_pd (a,a);
+    _mm_storeu_pd (vals,b);
+    return (0);
+}")
+
+check_cxx_source_compiles ("${_SSE2_TEST_SOURCE_CODE}" HAVE_SSE2)
 
 set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
+set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_old}")
 unset(_cmake_required_flags_old)
+unset(_cmake_required_libraries_old)
 
 mark_as_advanced(HAVE_SSE2)
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index b11722f847..dc84eddf70 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -8,6 +8,20 @@
 set(PLATFORM_COMPILE_OPTIONS "")
 set(PLATFORM_LINK_OPTIONS "")
 
+###############################################################################
+# Define if SSE2 can be used.
+# Check for SSE2 first since some compile flags need to be set on Apple ARM.
+
+include(CheckSupportSSE2)
+
+if(NOT HAVE_SSE2)
+    message(STATUS "Disabling SSE optimizations, as the target doesn't support them")
+    set(OCIO_USE_SSE OFF)
+endif(NOT HAVE_SSE2)
+
+###############################################################################
+# Compile flags
+
 if(USE_MSVC)
 
     set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/DUSE_MSVC")
@@ -40,7 +54,9 @@ elseif(USE_CLANG)
 
     # Use of 'register' specifier must be removed for C++17 support.
     set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-Wno-deprecated-register")
-
+    if (HAVE_SSE2 AND HAVE_NEON)
+        set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS};-march=armv8-a+fp+simd+crypto+crc")
+    endif()
 elseif(USE_GCC)
 
     set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-DUSE_GCC")
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 6c2693db2f..bcfd356d53 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -292,6 +292,13 @@ target_link_libraries(OpenColorIO
         MINIZIP::minizip-ng
 )
 
+if(OCIO_USE_SSE AND HAVE_NEON)
+    target_link_libraries(OpenColorIO
+        PRIVATE
+            "$<BUILD_INTERFACE:sse2neon>"
+    )
+endif()
+
 if(APPLE)
     target_link_libraries(OpenColorIO
         PRIVATE
@@ -327,6 +334,13 @@ if(OCIO_USE_SSE)
         PRIVATE
             USE_SSE
     )
+
+    if(HAVE_NEON)
+        target_compile_definitions(OpenColorIO
+            PRIVATE
+                USE_SSE2NEON
+        )
+    endif()
 endif()
 
 if(MSVC AND BUILD_TYPE_DEBUG AND BUILD_SHARED_LIBS)
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index a903120ec7..71b7faf6f9 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -8,11 +8,13 @@
 
 #ifdef USE_SSE
 
-
-#include <emmintrin.h>
+#ifndef USE_SSE2NEON
+    #include <emmintrin.h>
+#else
+    #include <sse2neon.h>
+#endif
 #include <stdio.h>
 
-
 #include <OpenColorIO/OpenColorIO.h>
 
 
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index 431d570f4e..3d6cbc7506 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -26,6 +26,13 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
             xxHash
     )
 
+    if(OCIO_USE_SSE AND HAVE_NEON)
+        target_link_libraries(${TEST_BINARY}
+            PRIVATE
+                sse2neon
+        )
+    endif()
+
     if(APPLE)
         # Frameworks needed to access the ICC monitor profile. 
         target_link_libraries(${TEST_BINARY}
@@ -43,12 +50,21 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
                 "${CMAKE_BINARY_DIR}/generated_include"
         )
     endif(PRIVATE_INCLUDES)
+
     if(OCIO_USE_SSE)
         target_compile_definitions(${TEST_BINARY}
             PRIVATE
                 USE_SSE
         )
+
+        if(HAVE_NEON)
+            target_compile_definitions(${TEST_BINARY}
+                PRIVATE
+                    USE_SSE2NEON
+            )
+        endif()
     endif(OCIO_USE_SSE)
+
     if(WIN32)
         # A windows application linking to eXpat static libraries must
         # have the global macro XML_STATIC defined
@@ -66,6 +82,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
             )
         endif()
     endif(WIN32)
+
     set_target_properties(${TEST_BINARY} PROPERTIES
         COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
         LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}"

From 693769c1093b91d645639bbf0ef76b4322c3a069 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 9 Feb 2023 10:48:28 -0500
Subject: [PATCH 55/81] Overwriting sse2neon implementation for SSE2 _mm_max_ps
 and _mm_min_ps to fix issues with NaN handling. Fixing issues in the unit
 tests for some tests. For some tests, the neon implementation is matching the
 C++ implementation instead of the SSE2 implementation.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 src/OpenColorIO/SSE.h                | 29 ++++++++++++++++++++++++++++
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 20 ++++++++++++++-----
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 71b7faf6f9..567c5a1c64 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -22,6 +22,35 @@
 namespace OCIO_NAMESPACE
 {
 
+// Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since
+// it is redefining two of the functions from sse2neon.
+#ifdef USE_SSE2NEON
+    // Overwrite the translation of _mm_max_ps and _mm_min_ps.
+    // Using vmaxnmq_f32 and vminnmq_f32 instead.
+
+    // Compare packed single-precision (32-bit) floating-point elements in a and b,
+    // and store packed maximum values in dst. dst does not follow the IEEE Standard
+    // for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
+    // signed-zero values.
+    // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
+    static inline __m128 _mm_max_ps(__m128 a, __m128 b)
+    {
+        return vreinterpretq_m128_f32(
+            vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    }
+
+    // Compare packed single-precision (32-bit) floating-point elements in a and b,
+    // and store packed minimum values in dst. dst does not follow the IEEE Standard
+    // for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
+    // signed-zero values.
+    // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
+    static inline __m128 _mm_min_ps(__m128 a, __m128 b)
+    {
+        return vreinterpretq_m128_f32(
+            vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    }
+#endif
+
 // Macros for alignment declarations
 #define OCIO_SIMD_BYTES 16
 #if defined( _MSC_VER )
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 79649a23ae..7ac48ed1f6 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -130,7 +130,9 @@ void TestAntiLog(float logBase)
         OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f));
     }
 #ifdef USE_SSE
-    OCIO_CHECK_EQUAL(rgba[8], inf);
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_EQUAL(rgba[8], inf);
+    #endif
 #else
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
 #endif
@@ -138,7 +140,9 @@ void TestAntiLog(float logBase)
     OCIO_CHECK_CLOSE(rgba[12], 1.0f, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 #ifdef USE_SSE
-    OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
+    #endif
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
@@ -269,7 +273,9 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     const float res0 = ComputeLog2LinEval(0.0f, redP);
 
 #ifdef USE_SSE
-    OCIO_CHECK_EQUAL(rgba[8], inf);
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_EQUAL(rgba[8], inf);
+    #endif
 #else
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
 #endif
@@ -280,7 +286,9 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
 #ifdef USE_SSE
-    OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
+    #endif
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
@@ -579,7 +587,9 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO_CHECK_CLOSE(rgba[6],  1.16f, 10.0f * error);
     OCIO_CHECK_EQUAL(rgba[8], -inf);
 #ifdef USE_SSE
-    OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
+    #ifndef USE_SSE2NEON
+        OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
+    #endif
 #else
     OCIO_CHECK_EQUAL(rgba[9], inf);
 #endif

From d0b6c0dd0c1d5587c05bfa7a2105eb303efa17fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 10 Feb 2023 14:36:26 -0500
Subject: [PATCH 56/81] Changed how we handle sse2neon to match other
 dependencies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                                |   10 +-
 ext/sse2neon/CMakeLists.txt                   |    9 -
 ext/sse2neon/src/include/sse2neon.h           | 9079 -----------------
 share/cmake/modules/Findsse2neon.cmake        |   58 +
 .../modules/install/Installsse2neon.cmake     |   36 +
 src/OpenColorIO/CMakeLists.txt                |    2 +-
 6 files changed, 104 insertions(+), 9090 deletions(-)
 delete mode 100644 ext/sse2neon/CMakeLists.txt
 delete mode 100644 ext/sse2neon/src/include/sse2neon.h
 create mode 100644 share/cmake/modules/Findsse2neon.cmake
 create mode 100644 share/cmake/modules/install/Installsse2neon.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a05df702b8..abe881c627 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -195,7 +195,15 @@ include(CheckSupportARMNeon)
 # Add sse2neon to the build since CompilerFlags needs to know if SSE2 is supported.
 
 if(HAVE_NEON)
-    add_subdirectory(ext/sse2neon)
+    # Install sse2nenon. Please note that sse2neon is downloaded during the configure step as it is
+    # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
+
+    # Could use ocio_handle_dependency once this is rebased from main (once the CMake min and 
+    # recommended version branch is merged).
+    find_package(sse2neon QUIET)
+    if (NOT sse2neon_FOUND)
+        include(Installsse2neon)
+    endif()
 endif()
 
 
diff --git a/ext/sse2neon/CMakeLists.txt b/ext/sse2neon/CMakeLists.txt
deleted file mode 100644
index 7fd5fcb302..0000000000
--- a/ext/sse2neon/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright Contributors to the OpenColorIO Project.
-
-# sse2neon (modified)
-# https://github.com/DLTcollab/sse2neon
-add_library(sse2neon INTERFACE IMPORTED GLOBAL)
-set_target_properties(sse2neon PROPERTIES
-    INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/src/include"
-)
diff --git a/ext/sse2neon/src/include/sse2neon.h b/ext/sse2neon/src/include/sse2neon.h
deleted file mode 100644
index 164c6c3387..0000000000
--- a/ext/sse2neon/src/include/sse2neon.h
+++ /dev/null
@@ -1,9079 +0,0 @@
-#ifndef SSE2NEON_H
-#define SSE2NEON_H
-
-// This header file provides a simple API translation layer
-// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
-//
-// Contributors to this work are:
-//   John W. Ratcliff <jratcliffscarab@gmail.com>
-//   Brandon Rowlett <browlett@nvidia.com>
-//   Ken Fast <kfast@gdeb.com>
-//   Eric van Beurden <evanbeurden@nvidia.com>
-//   Alexander Potylitsin <apotylitsin@nvidia.com>
-//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
-//   Jim Huang <jserv@ccns.ncku.edu.tw>
-//   Mark Cheng <marktwtn@gmail.com>
-//   Malcolm James MacLeod <malcolm@gulden.com>
-//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
-//   Sebastian Pop <spop@amazon.com>
-//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
-//   Danila Kutenin <danilak@google.com>
-//   François Turban (JishinMaster) <francois.turban@gmail.com>
-//   Pei-Hsuan Hung <afcidk@gmail.com>
-//   Yang-Hao Yuan <yuanyanghau@gmail.com>
-//   Syoyo Fujita <syoyo@lighttransport.com>
-//   Brecht Van Lommel <brecht@blender.org>
-//   Jonathan Hue <jhue@adobe.com>
-//   Cuda Chen <clh960524@gmail.com>
-//   Aymen Qader <aymen.qader@arm.com>
-
-/*
- * sse2neon is freely redistributable under the MIT License.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/* Tunable configurations */
-
-/* Enable precise implementation of math operations
- * This would slow down the computation a bit, but gives consistent result with
- * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
- */
-/* _mm_min|max_ps|ss|pd|sd */
-#ifndef SSE2NEON_PRECISE_MINMAX
-#define SSE2NEON_PRECISE_MINMAX (0)
-#endif
-/* _mm_rcp_ps and _mm_div_ps */
-#ifndef SSE2NEON_PRECISE_DIV
-#define SSE2NEON_PRECISE_DIV (0)
-#endif
-/* _mm_sqrt_ps and _mm_rsqrt_ps */
-#ifndef SSE2NEON_PRECISE_SQRT
-#define SSE2NEON_PRECISE_SQRT (0)
-#endif
-/* _mm_dp_pd */
-#ifndef SSE2NEON_PRECISE_DP
-#define SSE2NEON_PRECISE_DP (0)
-#endif
-
-/* compiler specific definitions */
-#if defined(__GNUC__) || defined(__clang__)
-#pragma push_macro("FORCE_INLINE")
-#pragma push_macro("ALIGN_STRUCT")
-#define FORCE_INLINE static inline __attribute__((always_inline))
-#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
-#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
-#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
-#else /* non-GNU / non-clang compilers */
-#warning "Macro name collisions may happen with unsupported compiler."
-#ifndef FORCE_INLINE
-#define FORCE_INLINE static inline
-#endif
-#ifndef ALIGN_STRUCT
-#define ALIGN_STRUCT(x) __declspec(align(x))
-#endif
-#define _sse2neon_likely(x) (x)
-#define _sse2neon_unlikely(x) (x)
-#endif
-
-/* C language does not allow initializing a variable with a function call. */
-#ifdef __cplusplus
-#define _sse2neon_const static const
-#else
-#define _sse2neon_const const
-#endif
-
-#include <stdint.h>
-#include <stdlib.h>
-
-#if defined(_WIN32)
-/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
- * from both MinGW-w64 and MSVC.
- */
-#define SSE2NEON_ALLOC_DEFINED
-#endif
-
-/* If using MSVC */
-#ifdef _MSC_VER
-#include <intrin.h>
-#if (defined(_M_AMD64) || defined(__x86_64__)) || \
-    (defined(_M_ARM) || defined(__arm__))
-#define SSE2NEON_HAS_BITSCAN64
-#endif
-#endif
-
-/* Compiler barrier */
-#define SSE2NEON_BARRIER()                     \
-    do {                                       \
-        __asm__ __volatile__("" ::: "memory"); \
-        (void) 0;                              \
-    } while (0)
-
-/* Memory barriers
- * __atomic_thread_fence does not include a compiler barrier; instead,
- * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
- * semantics.
- */
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
-#include <stdatomic.h>
-#endif
-
-FORCE_INLINE void _sse2neon_smp_mb(void)
-{
-    SSE2NEON_BARRIER();
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
-    !defined(__STDC_NO_ATOMICS__)
-    atomic_thread_fence(memory_order_seq_cst);
-#elif defined(__GNUC__) || defined(__clang__)
-    __atomic_thread_fence(__ATOMIC_SEQ_CST);
-#else
-    /* FIXME: MSVC support */
-#endif
-}
-
-/* Architecture-specific build options */
-/* FIXME: #pragma GCC push_options is only available on GCC */
-#if defined(__GNUC__)
-#if defined(__arm__) && __ARM_ARCH == 7
-/* According to ARM C Language Extensions Architecture specification,
- * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
- * architecture supported.
- */
-#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
-#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
-#endif
-#if !defined(__clang__)
-#pragma GCC push_options
-#pragma GCC target("fpu=neon")
-#endif
-#elif defined(__aarch64__)
-#if !defined(__clang__)
-#pragma GCC push_options
-#pragma GCC target("+simd")
-#endif
-#elif __ARM_ARCH == 8
-#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
-#error \
-    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
-#endif
-#if !defined(__clang__)
-#pragma GCC push_options
-#endif
-#else
-#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
-#endif
-#endif
-
-#include <arm_neon.h>
-#if !defined(__aarch64__) && (__ARM_ARCH == 8)
-#if defined __has_include && __has_include(<arm_acle.h>)
-#include <arm_acle.h>
-#endif
-#endif
-
-/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD
- * and other Arm microarchtectures use.
- * From sysctl -a on Apple M1:
- * hw.cachelinesize: 128
- */
-#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
-#define SSE2NEON_CACHELINE_SIZE 128
-#else
-#define SSE2NEON_CACHELINE_SIZE 64
-#endif
-
-/* Rounding functions require either Aarch64 instructions or libm failback */
-#if !defined(__aarch64__)
-#include <math.h>
-#endif
-
-/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
- * or even not accessible in user mode.
- * To write or access to these registers in user mode,
- * we have to perform syscall instead.
- */
-#if !defined(__aarch64__)
-#include <sys/time.h>
-#endif
-
-/* "__has_builtin" can be used to query support for built-in functions
- * provided by gcc/clang and other compilers that support it.
- */
-#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
-/* Compatibility with gcc <= 9 */
-#if defined(__GNUC__) && (__GNUC__ <= 9)
-#define __has_builtin(x) HAS##x
-#define HAS__builtin_popcount 1
-#define HAS__builtin_popcountll 1
-
-// __builtin_shuffle introduced in GCC 4.7.0
-#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
-#define HAS__builtin_shuffle 1
-#else
-#define HAS__builtin_shuffle 0
-#endif
-
-#define HAS__builtin_shufflevector 0
-#define HAS__builtin_nontemporal_store 0
-#else
-#define __has_builtin(x) 0
-#endif
-#endif
-
-/**
- * MACRO for shuffle parameter for _mm_shuffle_ps().
- * Argument fp3 is a digit[0123] that represents the fp from argument "b"
- * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
- * for fp2 in result. fp1 is a digit[0123] that represents the fp from
- * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
- * fp0 is the same for fp0 of result.
- */
-#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
-    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
-
-#if __has_builtin(__builtin_shufflevector)
-#define _sse2neon_shuffle(type, a, b, ...) \
-    __builtin_shufflevector(a, b, __VA_ARGS__)
-#elif __has_builtin(__builtin_shuffle)
-#define _sse2neon_shuffle(type, a, b, ...) \
-    __extension__({                        \
-        type tmp = {__VA_ARGS__};          \
-        __builtin_shuffle(a, b, tmp);      \
-    })
-#endif
-
-#ifdef _sse2neon_shuffle
-#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
-#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
-#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
-#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
-#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
-#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
-#endif
-
-/* Rounding mode macros. */
-#define _MM_FROUND_TO_NEAREST_INT 0x00
-#define _MM_FROUND_TO_NEG_INF 0x01
-#define _MM_FROUND_TO_POS_INF 0x02
-#define _MM_FROUND_TO_ZERO 0x03
-#define _MM_FROUND_CUR_DIRECTION 0x04
-#define _MM_FROUND_NO_EXC 0x08
-#define _MM_FROUND_RAISE_EXC 0x00
-#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
-#define _MM_ROUND_NEAREST 0x0000
-#define _MM_ROUND_DOWN 0x2000
-#define _MM_ROUND_UP 0x4000
-#define _MM_ROUND_TOWARD_ZERO 0x6000
-/* Flush zero mode macros. */
-#define _MM_FLUSH_ZERO_MASK 0x8000
-#define _MM_FLUSH_ZERO_ON 0x8000
-#define _MM_FLUSH_ZERO_OFF 0x0000
-/* Denormals are zeros mode macros. */
-#define _MM_DENORMALS_ZERO_MASK 0x0040
-#define _MM_DENORMALS_ZERO_ON 0x0040
-#define _MM_DENORMALS_ZERO_OFF 0x0000
-
-/* indicate immediate constant argument in a given range */
-#define __constrange(a, b) const
-
-/* A few intrinsics accept traditional data types like ints or floats, but
- * most operate on data types that are specific to SSE.
- * If a vector type ends in d, it contains doubles, and if it does not have
- * a suffix, it contains floats. An integer vector type can contain any type
- * of integer, from chars to shorts to unsigned long longs.
- */
-typedef int64x1_t __m64;
-typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
-// On ARM 32-bit architecture, the float64x2_t is not supported.
-// The data type __m128d should be represented in a different way for related
-// intrinsic conversion.
-#if defined(__aarch64__)
-typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
-#else
-typedef float32x4_t __m128d;
-#endif
-typedef int64x2_t __m128i; /* 128-bit vector containing integers */
-
-// __int64 is defined in the Intrinsics Guide which maps to different datatype
-// in different data model
-#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
-#if (defined(__x86_64__) || defined(__i386__))
-#define __int64 long long
-#else
-#define __int64 int64_t
-#endif
-#endif
-
-/* type-safe casting between types */
-
-#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
-#define vreinterpretq_m128_f32(x) (x)
-#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
-
-#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
-#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
-#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
-#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
-
-#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
-#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
-#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
-
-#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
-#define vreinterpretq_f32_m128(x) (x)
-#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
-
-#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
-#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
-#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
-#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
-
-#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
-#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
-#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
-#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
-
-#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
-#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
-#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
-#define vreinterpretq_m128i_s64(x) (x)
-
-#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
-#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
-#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
-#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
-
-#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
-#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
-
-#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
-#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
-#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
-#define vreinterpretq_s64_m128i(x) (x)
-
-#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
-#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
-#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
-#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
-
-#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
-#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
-#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
-#define vreinterpret_m64_s64(x) (x)
-
-#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
-#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
-#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
-#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
-
-#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
-#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
-#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
-
-#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
-#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
-#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
-#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
-
-#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
-#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
-#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
-#define vreinterpret_s64_m64(x) (x)
-
-#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
-
-#if defined(__aarch64__)
-#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
-#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
-
-#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
-
-#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
-#define vreinterpretq_m128d_f64(x) (x)
-
-#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
-
-#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
-#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
-
-#define vreinterpretq_f64_m128d(x) (x)
-#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
-#else
-#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
-
-#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
-#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
-
-#define vreinterpretq_m128d_f32(x) (x)
-
-#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
-
-#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
-#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
-
-#define vreinterpretq_f32_m128d(x) (x)
-#endif
-
-// A struct is defined in this header file called 'SIMDVec' which can be used
-// by applications which attempt to access the contents of an __m128 struct
-// directly.  It is important to note that accessing the __m128 struct directly
-// is bad coding practice by Microsoft: @see:
-// https://learn.microsoft.com/en-us/cpp/cpp/m128
-//
-// However, some legacy source code may try to access the contents of an __m128
-// struct directly so the developer can use the SIMDVec as an alias for it.  Any
-// casting must be done manually by the developer, as you cannot cast or
-// otherwise alias the base NEON data type for intrinsic operations.
-//
-// union intended to allow direct access to an __m128 variable using the names
-// that the MSVC compiler provides.  This union should really only be used when
-// trying to access the members of the vector as integer values.  GCC/clang
-// allow native access to the float members through a simple array access
-// operator (in C since 4.6, in C++ since 4.8).
-//
-// Ideally direct accesses to SIMD vectors should not be used since it can cause
-// a performance hit.  If it really is needed however, the original __m128
-// variable can be aliased with a pointer to this union and used to access
-// individual components.  The use of this union should be hidden behind a macro
-// that is used throughout the codebase to access the members instead of always
-// declaring this type of variable.
-typedef union ALIGN_STRUCT(16) SIMDVec {
-    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
-    int8_t m128_i8[16];    // as signed 8-bit integers.
-    int16_t m128_i16[8];   // as signed 16-bit integers.
-    int32_t m128_i32[4];   // as signed 32-bit integers.
-    int64_t m128_i64[2];   // as signed 64-bit integers.
-    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
-    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
-    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
-    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
-} SIMDVec;
-
-// casting using SIMDVec
-#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
-#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
-#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
-
-/* SSE macros */
-#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
-#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
-#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
-#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
-
-// Function declaration
-// SSE
-FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
-FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
-FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
-FORCE_INLINE __m128 _mm_set_ps1(float);
-FORCE_INLINE __m128 _mm_setzero_ps(void);
-// SSE2
-FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
-FORCE_INLINE __m128i _mm_castps_si128(__m128);
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
-FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
-FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
-FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
-FORCE_INLINE __m128d _mm_set_pd(double, double);
-FORCE_INLINE __m128i _mm_set1_epi32(int);
-FORCE_INLINE __m128i _mm_setzero_si128();
-// SSE4.1
-FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
-FORCE_INLINE __m128 _mm_ceil_ps(__m128);
-FORCE_INLINE __m128d _mm_floor_pd(__m128d);
-FORCE_INLINE __m128 _mm_floor_ps(__m128);
-FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
-FORCE_INLINE __m128 _mm_round_ps(__m128, int);
-// SSE4.2
-FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
-
-/* Backwards compatibility for compilers with lack of specific type support */
-
-// Older gcc does not define vld1q_u8_x4 type
-#if defined(__GNUC__) && !defined(__clang__) &&                        \
-    ((__GNUC__ <= 12 && defined(__arm__)) ||                           \
-     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
-     (__GNUC__ <= 9 && defined(__aarch64__)))
-FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
-{
-    uint8x16x4_t ret;
-    ret.val[0] = vld1q_u8(p + 0);
-    ret.val[1] = vld1q_u8(p + 16);
-    ret.val[2] = vld1q_u8(p + 32);
-    ret.val[3] = vld1q_u8(p + 48);
-    return ret;
-}
-#else
-// Wraps vld1q_u8_x4
-FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
-{
-    return vld1q_u8_x4(p);
-}
-#endif
-
-#if !defined(__aarch64__)
-/* emulate vaddv u8 variant */
-FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
-{
-    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
-    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
-}
-#else
-// Wraps vaddv_u8
-FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
-{
-    return vaddv_u8(v8);
-}
-#endif
-
-#if !defined(__aarch64__)
-/* emulate vaddvq u8 variant */
-FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
-{
-    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
-    uint8_t res = 0;
-    for (int i = 0; i < 8; ++i)
-        res += tmp[i];
-    return res;
-}
-#else
-// Wraps vaddvq_u8
-FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
-{
-    return vaddvq_u8(a);
-}
-#endif
-
-#if !defined(__aarch64__)
-/* emulate vaddvq u16 variant */
-FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
-{
-    uint32x4_t m = vpaddlq_u16(a);
-    uint64x2_t n = vpaddlq_u32(m);
-    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
-
-    return vget_lane_u32((uint32x2_t) o, 0);
-}
-#else
-// Wraps vaddvq_u16
-FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
-{
-    return vaddvq_u16(a);
-}
-#endif
-
-/* Function Naming Conventions
- * The naming convention of SSE intrinsics is straightforward. A generic SSE
- * intrinsic function is given as follows:
- *   _mm_<name>_<data_type>
- *
- * The parts of this format are given as follows:
- * 1. <name> describes the operation performed by the intrinsic
- * 2. <data_type> identifies the data type of the function's primary arguments
- *
- * This last part, <data_type>, is a little complicated. It identifies the
- * content of the input values, and can be set to any of the following values:
- * + ps - vectors contain floats (ps stands for packed single-precision)
- * + pd - vectors cantain doubles (pd stands for packed double-precision)
- * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- *                            signed integers
- * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- *                            unsigned integers
- * + si128 - unspecified 128-bit vector or 256-bit vector
- * + m128/m128i/m128d - identifies input vector types when they are different
- *                      than the type of the returned vector
- *
- * For example, _mm_setzero_ps. The _mm implies that the function returns
- * a 128-bit vector. The _ps at the end implies that the argument vectors
- * contain floats.
- *
- * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
- *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
- *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
- *   // Set packed 8-bit integers
- *   // 128 bits, 16 chars, per 8 bits
- *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
- *                                  4, 5, 12, 13, 6, 7, 14, 15);
- *   // Shuffle packed 8-bit integers
- *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
- */
-
-/* Constants for use with _mm_prefetch. */
-enum _mm_hint {
-    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
-    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
-    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
-    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
-};
-
-// The bit field mapping to the FPCR(floating-point control register)
-typedef struct {
-    uint16_t res0;
-    uint8_t res1 : 6;
-    uint8_t bit22 : 1;
-    uint8_t bit23 : 1;
-    uint8_t bit24 : 1;
-    uint8_t res2 : 7;
-#if defined(__aarch64__)
-    uint32_t res3;
-#endif
-} fpcr_bitfield;
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of b and places it into the high end of the result.
-FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
-{
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in high
-// end of result takes the higher two 32 bit values from b and swaps them and
-// places in low end of result.
-FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
-{
-    float32x2_t a21 = vget_high_f32(
-        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b03 = vget_low_f32(
-        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
-{
-    float32x2_t a03 = vget_low_f32(
-        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b21 = vget_high_f32(
-        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
-}
-
-// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
-// high
-FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
-{
-    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
-{
-    float32x2_t a22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
-{
-    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
-{
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32x2_t a22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
-{
-    float32x2_t a33 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
-{
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
-}
-
-// Kahan summation for accurate summation of floating-point numbers.
-// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
-FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
-{
-    y -= *c;
-    float t = *sum + y;
-    *c = (t - *sum) - y;
-    *sum = t;
-}
-
-#if defined(__ARM_FEATURE_CRYPTO) && \
-    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
-// Wraps vmull_p64
-FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
-    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
-    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
-    return vreinterpretq_u64_p128(vmull_p64(a, b));
-}
-#else  // ARMv7 polyfill
-// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
-//
-// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
-// 64-bit->128-bit polynomial multiply.
-//
-// It needs some work and is somewhat slow, but it is still faster than all
-// known scalar methods.
-//
-// Algorithm adapted to C from
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
-// from "Fast Software Polynomial Multiplication on ARM Processors Using the
-// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
-// (https://hal.inria.fr/hal-01506572)
-static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
-    poly8x8_t a = vreinterpret_p8_u64(_a);
-    poly8x8_t b = vreinterpret_p8_u64(_b);
-
-    // Masks
-    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
-                                    vcreate_u8(0x00000000ffffffff));
-    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
-                                    vcreate_u8(0x0000000000000000));
-
-    // Do the multiplies, rotating with vext to get all combinations
-    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
-    uint8x16_t e =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
-    uint8x16_t f =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
-    uint8x16_t g =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
-    uint8x16_t h =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
-    uint8x16_t i =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
-    uint8x16_t j =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
-    uint8x16_t k =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
-
-    // Add cross products
-    uint8x16_t l = veorq_u8(e, f);  // L = E + F
-    uint8x16_t m = veorq_u8(g, h);  // M = G + H
-    uint8x16_t n = veorq_u8(i, j);  // N = I + J
-
-    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
-    // instructions.
-#if defined(__aarch64__)
-    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
-        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
-        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
-        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
-        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-#else
-    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
-    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
-    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
-    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
-#endif
-    // t0 = (L) (P0 + P1) << 8
-    // t1 = (M) (P2 + P3) << 16
-    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
-    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
-    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
-
-    // t2 = (N) (P4 + P5) << 24
-    // t3 = (K) (P6 + P7) << 32
-    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
-    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
-    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
-
-    // De-interleave
-#if defined(__aarch64__)
-    uint8x16_t t0 = vreinterpretq_u8_u64(
-        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-    uint8x16_t t1 = vreinterpretq_u8_u64(
-        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-    uint8x16_t t2 = vreinterpretq_u8_u64(
-        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-    uint8x16_t t3 = vreinterpretq_u8_u64(
-        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-#else
-    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
-    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
-    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
-    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
-#endif
-    // Shift the cross products
-    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
-    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
-    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
-    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
-
-    // Accumulate the products
-    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
-    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
-    uint8x16_t mix = veorq_u8(d, cross1);
-    uint8x16_t r = veorq_u8(mix, cross2);
-    return vreinterpretq_u64_u8(r);
-}
-#endif  // ARMv7 polyfill
-
-// C equivalent:
-//   __m128i _mm_shuffle_epi32_default(__m128i a,
-//                                     __constrange(0, 255) int imm) {
-//       __m128i ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
-//       return ret;
-//   }
-#define _mm_shuffle_epi32_default(a, imm)                                   \
-    __extension__({                                                         \
-        int32x4_t ret;                                                      \
-        ret = vmovq_n_s32(                                                  \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
-            ret, 1);                                                        \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
-            ret, 2);                                                        \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
-            ret, 3);                                                        \
-        vreinterpretq_m128i_s32(ret);                                       \
-    })
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of a and places it into the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
-{
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in low end
-// of result takes the higher two 32 bit values from a and swaps them and places
-// in high end of result.
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
-}
-
-// rotates the least significant 32 bits into the most significant 32 bits, and
-// shifts the rest down
-FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
-{
-    return vreinterpretq_m128i_s32(
-        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
-}
-
-// rotates the most significant 32 bits into the least significant 32 bits, and
-// shifts the rest up
-FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
-{
-    return vreinterpretq_m128i_s32(
-        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
-}
-
-// gets the lower 64 bits of a, and places it in the upper 64 bits
-// gets the lower 64 bits of a and places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
-{
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
-// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
-// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
-// places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
-{
-    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
-{
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
-{
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
-}
-
-// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
-// int imm)
-#if defined(__aarch64__)
-#define _mm_shuffle_epi32_splat(a, imm)                          \
-    __extension__({                                              \
-        vreinterpretq_m128i_s32(                                 \
-            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
-    })
-#else
-#define _mm_shuffle_epi32_splat(a, imm)                                      \
-    __extension__({                                                          \
-        vreinterpretq_m128i_s32(                                             \
-            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
-    })
-#endif
-
-// NEON does not support a general purpose permute intrinsic.
-// Shuffle single-precision (32-bit) floating-point elements in a using the
-// control in imm8, and store the results in dst.
-//
-// C equivalent:
-//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
-//                                 __constrange(0, 255) int imm) {
-//       __m128 ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
-//       return ret;
-//   }
-//
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
-#define _mm_shuffle_ps_default(a, b, imm)                                  \
-    __extension__({                                                        \
-        float32x4_t ret;                                                   \
-        ret = vmovq_n_f32(                                                 \
-            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
-            ret, 1);                                                       \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
-            ret, 2);                                                       \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
-            ret, 3);                                                       \
-        vreinterpretq_m128_f32(ret);                                       \
-    })
-
-// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
-// Store the results in the low 64 bits of dst, with the high 64 bits being
-// copied from from a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
-#define _mm_shufflelo_epi16_function(a, imm)                                  \
-    __extension__({                                                           \
-        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
-        int16x4_t lowBits = vget_low_s16(ret);                                \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
-                             1);                                              \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
-                             2);                                              \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
-                             3);                                              \
-        vreinterpretq_m128i_s16(ret);                                         \
-    })
-
-// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
-// Store the results in the high 64 bits of dst, with the low 64 bits being
-// copied from from a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
-#define _mm_shufflehi_epi16_function(a, imm)                                   \
-    __extension__({                                                            \
-        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
-        int16x4_t highBits = vget_high_s16(ret);                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
-                             5);                                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
-                             6);                                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
-                             7);                                               \
-        vreinterpretq_m128i_s16(ret);                                          \
-    })
-
-/* MMX */
-
-//_mm_empty is a no-op on arm
-FORCE_INLINE void _mm_empty(void) {}
-
-/* SSE */
-
-// Add packed single-precision (32-bit) floating-point elements in a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
-FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Add the lower single-precision (32-bit) floating-point element in a and b,
-// store the result in the lower element of dst, and copy the upper 3 packed
-// elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
-FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
-{
-    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
-    // the upper values in the result must be the remnants of <a>.
-    return vreinterpretq_m128_f32(vaddq_f32(a, value));
-}
-
-// Compute the bitwise AND of packed single-precision (32-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
-FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
-// elements in a and then AND with b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
-FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        vbicq_s32(vreinterpretq_s32_m128(b),
-                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
-}
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
-FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u16(
-        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
-FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u8(
-        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for equality, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
-FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for equality, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
-FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
-FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for greater-than-or-equal, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
-FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
-FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for greater-than, store the result in the lower element of dst, and copy
-// the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
-FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
-FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for less-than-or-equal, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
-FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmple_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
-FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for less-than, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
-FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
-FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-equal, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
-FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
-FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-greater-than-or-equal, store the result in the lower element of
-// dst, and copy the upper 3 packed elements from a to the upper elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
-FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
-FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-greater-than, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
-FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
-FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-less-than-or-equal, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
-FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
-FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-less-than, store the result in the lower element of dst, and copy
-// the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
-FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// to see if neither is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
-//
-// See also:
-// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
-// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
-FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
-{
-    // Note: NEON does not have ordered compare builtin
-    // Need to compare a eq a and b eq b to check for NaN
-    // Do AND of results to get final
-    uint32x4_t ceqaa =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t ceqbb =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b to see if neither is NaN, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
-FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// to see if either is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
-FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
-{
-    uint32x4_t f32a =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t f32b =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b to see if either is NaN, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
-FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for equality, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
-FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_eq_b =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_eq_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for greater-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
-FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_ge_b =
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_ge_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for greater-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
-FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_gt_b =
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_gt_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for less-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
-FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_le_b =
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_le_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for less-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
-FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_lt_b =
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_lt_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for not-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
-FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
-{
-    return !_mm_comieq_ss(a, b);
-}
-
-// Convert packed signed 32-bit integers in b to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, and copy the upper 2 packed elements from a to the upper elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
-FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
-                     vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
-FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vreinterpret_m64_s32(
-        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
-#else
-    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
-        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
-#endif
-}
-
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
-FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
-{
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
-FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
-                          0);
-#else
-    float32_t data = vgetq_lane_f32(
-        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
-    return (int32_t) data;
-#endif
-}
-
-// Convert packed 16-bit integers in a to packed single-precision (32-bit)
-// floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
-FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(
-        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
-}
-
-// Convert packed 32-bit integers in b to packed single-precision (32-bit)
-// floating-point elements, store the results in the lower 2 elements of dst,
-// and copy the upper 2 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
-FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
-                     vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert packed signed 32-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, then convert the packed signed 32-bit integers in b to
-// single-precision (32-bit) floating-point element, and store the results in
-// the upper 2 elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
-FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(
-        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
-}
-
-// Convert the lower packed 8-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
-FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(
-        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 16-bit integers, and store the results in dst. Note: this intrinsic
-// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
-// 0x7FFFFFFF.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
-FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
-{
-    return vreinterpret_m64_s16(
-        vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
-#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 8-bit integers, and store the results in lower 4 elements of dst.
-// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
-// between 0x7F and 0x7FFFFFFF.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
-FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
-{
-    return vreinterpret_m64_s8(vqmovn_s16(
-        vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
-}
-
-// Convert packed unsigned 16-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
-FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(
-        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
-}
-
-// Convert the lower packed unsigned 8-bit integers in a to packed
-// single-precision (32-bit) floating-point elements, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
-FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_u32(
-        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
-}
-
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
-#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
-
-// Convert the signed 64-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
-FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
-{
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
-}
-
-// Copy the lower single-precision (32-bit) floating-point element of a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
-FORCE_INLINE float _mm_cvtss_f32(__m128 a)
-{
-    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
-#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
-FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
-#else
-    float32_t data = vgetq_lane_f32(
-        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
-    return (int64_t) data;
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
-FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
-{
-    return vreinterpret_m64_s32(
-        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
-FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
-{
-    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
-#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
-#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
-FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
-{
-    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-}
-
-// Divide packed single-precision (32-bit) floating-point elements in a by
-// packed elements in b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
-FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
-    return vreinterpretq_m128_f32(
-        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
-#if SSE2NEON_PRECISE_DIV
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
-#endif
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
-#endif
-}
-
-// Divide the lower single-precision (32-bit) floating-point element in a by the
-// lower single-precision (32-bit) floating-point element in b, store the result
-// in the lower element of dst, and copy the upper 3 packed elements from a to
-// the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
-FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-{
-    float32_t value =
-        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Extract a 16-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
-#define _mm_extract_pi16(a, imm) \
-    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
-
-// Free aligned memory that was allocated with _mm_malloc.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
-#if !defined(SSE2NEON_ALLOC_DEFINED)
-FORCE_INLINE void _mm_free(void *addr)
-{
-    free(addr);
-}
-#endif
-
-// Macro: Get the flush zero bits from the MXCSR control and status register.
-// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
-// _MM_FLUSH_ZERO_OFF
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
-FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
-}
-
-// Macro: Get the rounding mode bits from the MXCSR control and status register.
-// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
-// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
-FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    if (r.field.bit22) {
-        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
-    } else {
-        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
-    }
-}
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
-#define _mm_insert_pi16(a, b, imm)                               \
-    __extension__({                                              \
-        vreinterpret_m64_s16(                                    \
-            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
-    })
-
-// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
-FORCE_INLINE __m128 _mm_load_ps(const float *p)
-{
-    return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Load a single-precision (32-bit) floating-point element from memory into all
-// elements of dst.
-//
-//   dst[31:0] := MEM[mem_addr+31:mem_addr]
-//   dst[63:32] := MEM[mem_addr+31:mem_addr]
-//   dst[95:64] := MEM[mem_addr+31:mem_addr]
-//   dst[127:96] := MEM[mem_addr+31:mem_addr]
-//
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
-#define _mm_load_ps1 _mm_load1_ps
-
-// Load a single-precision (32-bit) floating-point element from memory into the
-// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
-// aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
-FORCE_INLINE __m128 _mm_load_ss(const float *p)
-{
-    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
-}
-
-// Load a single-precision (32-bit) floating-point element from memory into all
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
-FORCE_INLINE __m128 _mm_load1_ps(const float *p)
-{
-    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
-}
-
-// Load 2 single-precision (32-bit) floating-point elements from memory into the
-// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
-// mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
-FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
-}
-
-// Load 2 single-precision (32-bit) floating-point elements from memory into the
-// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
-// mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
-FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
-}
-
-// Load 4 single-precision (32-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
-FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
-{
-    float32x4_t v = vrev64q_f32(vld1q_f32(p));
-    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
-}
-
-// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from memory into dst. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
-FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
-{
-    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
-    // equivalent for neon
-    return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Load unaligned 16-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
-FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
-{
-    return vreinterpretq_m128i_s16(
-        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
-}
-
-// Load unaligned 64-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
-FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
-{
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
-}
-
-// Allocate size bytes of memory, aligned to the alignment specified in align,
-// and return a pointer to the allocated memory. _mm_free should be used to free
-// memory that is allocated with _mm_malloc.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
-#if !defined(SSE2NEON_ALLOC_DEFINED)
-FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
-{
-    void *ptr;
-    if (align == 1)
-        return malloc(size);
-    if (align == 2 || (sizeof(void *) == 8 && align == 4))
-        align = sizeof(void *);
-    if (!posix_memalign(&ptr, align, size))
-        return ptr;
-    return NULL;
-}
-#endif
-
-// Conditionally store 8-bit integer elements from a into memory using mask
-// (elements are not stored when the highest bit is not set in the corresponding
-// element) and a non-temporal memory hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
-FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
-{
-    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
-    __m128 b = _mm_load_ps((const float *) mem_addr);
-    int8x8_t masked =
-        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
-                vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
-    vst1_s8((int8_t *) mem_addr, masked);
-}
-
-// Conditionally store 8-bit integer elements from a into memory using mask
-// (elements are not stored when the highest bit is not set in the corresponding
-// element) and a non-temporal memory hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
-#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
-FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s16(
-        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b,
-// and store packed maximum values in dst. dst does not follow the IEEE Standard
-// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
-// signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
-FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
-{
-#if SSE2NEON_PRECISE_MINMAX
-    float32x4_t _a = vreinterpretq_f32_m128(a);
-    float32x4_t _b = vreinterpretq_f32_m128(b);
-    return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128_f32(
-        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
-FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u8(
-        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b, store the maximum value in the lower element of dst, and copy the upper 3
-// packed elements from a to the upper element of dst. dst does not follow the
-// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
-// inputs are NaN or signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
-FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
-FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s16(
-        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b,
-// and store packed minimum values in dst. dst does not follow the IEEE Standard
-// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
-// signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
-FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
-{
-#if SSE2NEON_PRECISE_MINMAX
-    float32x4_t _a = vreinterpretq_f32_m128(a);
-    float32x4_t _b = vreinterpretq_f32_m128(b);
-    return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128_f32(
-        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
-FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u8(
-        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b, store the minimum value in the lower element of dst, and copy the upper 3
-// packed elements from a to the upper element of dst. dst does not follow the
-// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
-// inputs are NaN or signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
-FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Move the lower single-precision (32-bit) floating-point element from b to the
-// lower element of dst, and copy the upper 3 packed elements from a to the
-// upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
-FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
-                       vreinterpretq_f32_m128(a), 0));
-}
-
-// Move the upper 2 single-precision (32-bit) floating-point elements from b to
-// the lower 2 elements of dst, and copy the upper 2 elements from a to the
-// upper 2 elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
-FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
-{
-#if defined(aarch64__)
-    return vreinterpretq_m128_u64(
-        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
-#else
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
-#endif
-}
-
-// Move the lower 2 single-precision (32-bit) floating-point elements from b to
-// the upper 2 elements of dst, and copy the lower 2 elements from a to the
-// lower 2 elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
-FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-// Create mask from the most significant bit of each 8-bit element in a, and
-// store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
-FORCE_INLINE int _mm_movemask_pi8(__m64 a)
-{
-    uint8x8_t input = vreinterpret_u8_m64(a);
-#if defined(__aarch64__)
-    static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-    uint8x8_t tmp = vshr_n_u8(input, 7);
-    return vaddv_u8(vshl_u8(tmp, shift));
-#else
-    // Refer the implementation of `_mm_movemask_epi8`
-    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
-    uint32x2_t paired16 =
-        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
-    uint8x8_t paired32 =
-        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
-    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
-#endif
-}
-
-// Set each bit of mask dst based on the most significant bit of the
-// corresponding packed single-precision (32-bit) floating-point element in a.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
-FORCE_INLINE int _mm_movemask_ps(__m128 a)
-{
-    uint32x4_t input = vreinterpretq_u32_m128(a);
-#if defined(__aarch64__)
-    static const int32x4_t shift = {0, 1, 2, 3};
-    uint32x4_t tmp = vshrq_n_u32(input, 31);
-    return vaddvq_u32(vshlq_u32(tmp, shift));
-#else
-    // Uses the exact same method as _mm_movemask_epi8, see that for details.
-    // Shift out everything but the sign bits with a 32-bit unsigned shift
-    // right.
-    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
-    // Merge the two pairs together with a 64-bit unsigned shift right + add.
-    uint8x16_t paired =
-        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
-    // Extract the result.
-    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
-#endif
-}
-
-// Multiply packed single-precision (32-bit) floating-point elements in a and b,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Multiply the lower single-precision (32-bit) floating-point element in a and
-// b, store the result in the lower element of dst, and copy the upper 3 packed
-// elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
-FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_mul_ps(a, b));
-}
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
-FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u16(vshrn_n_u32(
-        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
-}
-
-// Compute the bitwise OR of packed single-precision (32-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
-FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
-#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
-#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
-
-// Extract a 16-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
-#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
-#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
-#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
-#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
-#define _m_pminsw(a, b) _mm_min_pi16(a, b)
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
-#define _m_pminub(a, b) _mm_min_pu8(a, b)
-
-// Create mask from the most significant bit of each 8-bit element in a, and
-// store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
-#define _m_pmovmskb(a) _mm_movemask_pi8(a)
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
-#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
-
-// Fetch the line of data from memory that contains address p to a location in
-// the cache heirarchy specified by the locality hint i.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
-FORCE_INLINE void _mm_prefetch(char const *p, int i)
-{
-    switch (i) {
-    case _MM_HINT_NTA:
-        __builtin_prefetch(p, 0, 0);
-        break;
-    case _MM_HINT_T0:
-        __builtin_prefetch(p, 0, 3);
-        break;
-    case _MM_HINT_T1:
-        __builtin_prefetch(p, 0, 2);
-        break;
-    case _MM_HINT_T2:
-        __builtin_prefetch(p, 0, 1);
-        break;
-    }
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
-#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
-
-// Shuffle 16-bit integers in a using the control in imm8, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
-#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
-
-// Compute the approximate reciprocal of packed single-precision (32-bit)
-// floating-point elements in a, and store the results in dst. The maximum
-// relative error for this approximation is less than 1.5*2^-12.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
-FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
-{
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-#if SSE2NEON_PRECISE_DIV
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-#endif
-    return vreinterpretq_m128_f32(recip);
-}
-
-// Compute the approximate reciprocal of the lower single-precision (32-bit)
-// floating-point element in a, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst. The
-// maximum relative error for this approximation is less than 1.5*2^-12.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
-FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
-{
-    return _mm_move_ss(a, _mm_rcp_ps(a));
-}
-
-// Compute the approximate reciprocal square root of packed single-precision
-// (32-bit) floating-point elements in a, and store the results in dst. The
-// maximum relative error for this approximation is less than 1.5*2^-12.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
-FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
-{
-    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-#if SSE2NEON_PRECISE_SQRT
-    // Additional Netwon-Raphson iteration for accuracy
-    out = vmulq_f32(
-        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
-    out = vmulq_f32(
-        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
-#endif
-    return vreinterpretq_m128_f32(out);
-}
-
-// Compute the approximate reciprocal square root of the lower single-precision
-// (32-bit) floating-point element in a, store the result in the lower element
-// of dst, and copy the upper 3 packed elements from a to the upper elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
-FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
-{
-    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
-FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
-{
-    uint64x1_t t = vpaddl_u32(vpaddl_u16(
-        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
-    return vreinterpret_m64_u16(
-        vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
-}
-
-// Macro: Set the flush zero bits of the MXCSR control and status register to
-// the value in unsigned 32-bit integer a. The flush zero may contain any of the
-// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
-FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
-{
-    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
-    // regardless of the value of the FZ bit.
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
-}
-
-// Set packed single-precision (32-bit) floating-point elements in dst with the
-// supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
-FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
-{
-    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Broadcast single-precision (32-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
-FORCE_INLINE __m128 _mm_set_ps1(float _w)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Macro: Set the rounding mode bits of the MXCSR control and status register to
-// the value in unsigned 32-bit integer a. The rounding mode may contain any of
-// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
-// _MM_ROUND_TOWARD_ZERO
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
-FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    switch (rounding) {
-    case _MM_ROUND_TOWARD_ZERO:
-        r.field.bit22 = 1;
-        r.field.bit23 = 1;
-        break;
-    case _MM_ROUND_DOWN:
-        r.field.bit22 = 0;
-        r.field.bit23 = 1;
-        break;
-    case _MM_ROUND_UP:
-        r.field.bit22 = 1;
-        r.field.bit23 = 0;
-        break;
-    default:  //_MM_ROUND_NEAREST
-        r.field.bit22 = 0;
-        r.field.bit23 = 0;
-    }
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
-}
-
-// Copy single-precision (32-bit) floating-point element a to the lower element
-// of dst, and zero the upper 3 elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
-FORCE_INLINE __m128 _mm_set_ss(float a)
-{
-    return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
-}
-
-// Broadcast single-precision (32-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
-FORCE_INLINE __m128 _mm_set1_ps(float _w)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Set the MXCSR control and status register with the value in unsigned 32-bit
-// integer a.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
-// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
-FORCE_INLINE void _mm_setcsr(unsigned int a)
-{
-    _MM_SET_ROUNDING_MODE(a);
-}
-
-// Get the unsigned 32-bit value of the MXCSR control and status register.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
-// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
-FORCE_INLINE unsigned int _mm_getcsr()
-{
-    return _MM_GET_ROUNDING_MODE();
-}
-
-// Set packed single-precision (32-bit) floating-point elements in dst with the
-// supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
-FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
-{
-    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Return vector of type __m128 with all elements set to zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
-FORCE_INLINE __m128 _mm_setzero_ps(void)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(0));
-}
-
-// Shuffle 16-bit integers in a using the control in imm8, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_pi16(a, imm)                                           \
-    __extension__({                                                        \
-        vreinterpret_m64_s16(vshuffle_s16(                                 \
-            vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
-            ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)));  \
-    })
-#else
-#define _mm_shuffle_pi16(a, imm)                                               \
-    __extension__({                                                            \
-        int16x4_t ret;                                                         \
-        ret =                                                                  \
-            vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
-        ret = vset_lane_s16(                                                   \
-            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret,   \
-            1);                                                                \
-        ret = vset_lane_s16(                                                   \
-            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret,   \
-            2);                                                                \
-        ret = vset_lane_s16(                                                   \
-            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret,   \
-            3);                                                                \
-        vreinterpret_m64_s16(ret);                                             \
-    })
-#endif
-
-// Perform a serializing operation on all store-to-memory instructions that were
-// issued prior to this instruction. Guarantees that every store instruction
-// that precedes, in program order, is globally visible before any store
-// instruction which follows the fence in program order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
-FORCE_INLINE void _mm_sfence(void)
-{
-    _sse2neon_smp_mb();
-}
-
-// Perform a serializing operation on all load-from-memory and store-to-memory
-// instructions that were issued prior to this instruction. Guarantees that
-// every memory access that precedes, in program order, the memory fence
-// instruction is globally visible before any memory instruction which follows
-// the fence in program order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
-FORCE_INLINE void _mm_mfence(void)
-{
-    _sse2neon_smp_mb();
-}
-
-// Perform a serializing operation on all load-from-memory instructions that
-// were issued prior to this instruction. Guarantees that every load instruction
-// that precedes, in program order, is globally visible before any load
-// instruction which follows the fence in program order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
-FORCE_INLINE void _mm_lfence(void)
-{
-    _sse2neon_smp_mb();
-}
-
-// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
-// int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_ps(a, b, imm)                                              \
-    __extension__({                                                            \
-        float32x4_t _input1 = vreinterpretq_f32_m128(a);                       \
-        float32x4_t _input2 = vreinterpretq_f32_m128(b);                       \
-        float32x4_t _shuf =                                                    \
-            vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
-                          (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
-        vreinterpretq_m128_f32(_shuf);                                         \
-    })
-#else  // generic
-#define _mm_shuffle_ps(a, b, imm)                          \
-    __extension__({                                        \
-        __m128 ret;                                        \
-        switch (imm) {                                     \
-        case _MM_SHUFFLE(1, 0, 3, 2):                      \
-            ret = _mm_shuffle_ps_1032((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 3, 0, 1):                      \
-            ret = _mm_shuffle_ps_2301((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 3, 2, 1):                      \
-            ret = _mm_shuffle_ps_0321((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 1, 0, 3):                      \
-            ret = _mm_shuffle_ps_2103((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(1, 0, 1, 0):                      \
-            ret = _mm_movelh_ps((a), (b));                 \
-            break;                                         \
-        case _MM_SHUFFLE(1, 0, 0, 1):                      \
-            ret = _mm_shuffle_ps_1001((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 1, 0, 1):                      \
-            ret = _mm_shuffle_ps_0101((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(3, 2, 1, 0):                      \
-            ret = _mm_shuffle_ps_3210((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 0, 1, 1):                      \
-            ret = _mm_shuffle_ps_0011((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 0, 2, 2):                      \
-            ret = _mm_shuffle_ps_0022((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 2, 0, 0):                      \
-            ret = _mm_shuffle_ps_2200((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(3, 2, 0, 2):                      \
-            ret = _mm_shuffle_ps_3202((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(3, 2, 3, 2):                      \
-            ret = _mm_movehl_ps((b), (a));                 \
-            break;                                         \
-        case _MM_SHUFFLE(1, 1, 3, 3):                      \
-            ret = _mm_shuffle_ps_1133((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 0, 1, 0):                      \
-            ret = _mm_shuffle_ps_2010((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 0, 0, 1):                      \
-            ret = _mm_shuffle_ps_2001((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 0, 3, 2):                      \
-            ret = _mm_shuffle_ps_2032((a), (b));           \
-            break;                                         \
-        default:                                           \
-            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
-            break;                                         \
-        }                                                  \
-        ret;                                               \
-    })
-#endif
-
-// Compute the square root of packed single-precision (32-bit) floating-point
-// elements in a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
-FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
-{
-#if SSE2NEON_PRECISE_SQRT
-    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-
-    // Test for vrsqrteq_f32(0) -> positive infinity case.
-    // Change to zero, so that s * 1/sqrt(s) result is zero too.
-    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
-    const uint32x4_t div_by_zero =
-        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
-    recip = vreinterpretq_f32_u32(
-        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
-
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-        recip);
-    recip = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-        recip);
-
-    // sqrt(s) = s * 1/sqrt(s)
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
-#elif defined(__aarch64__)
-    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
-#else
-    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-    float32x4_t sq = vrecpeq_f32(recipsq);
-    return vreinterpretq_m128_f32(sq);
-#endif
-}
-
-// Compute the square root of the lower single-precision (32-bit) floating-point
-// element in a, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
-FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
-{
-    float32_t value =
-        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-// or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
-FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
-{
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Store the lower single-precision (32-bit) floating-point element from a into
-// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
-FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
-{
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    vst1q_f32(p, vdupq_n_f32(a0));
-}
-
-// Store the lower single-precision (32-bit) floating-point element from a into
-// memory. mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
-FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
-{
-    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
-}
-
-// Store the lower single-precision (32-bit) floating-point element from a into
-// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
-#define _mm_store1_ps _mm_store_ps1
-
-// Store the upper 2 single-precision (32-bit) floating-point elements from a
-// into memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
-FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
-{
-    *p = vreinterpret_m64_f32(vget_high_f32(a));
-}
-
-// Store the lower 2 single-precision (32-bit) floating-point elements from a
-// into memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
-FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
-{
-    *p = vreinterpret_m64_f32(vget_low_f32(a));
-}
-
-// Store 4 single-precision (32-bit) floating-point elements from a into memory
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
-FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
-{
-    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
-    float32x4_t rev = vextq_f32(tmp, tmp, 2);
-    vst1q_f32(p, rev);
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from a into memory. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
-FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
-{
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores 16-bits of integer data a at the address p.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
-FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
-{
-    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
-}
-
-// Stores 64-bits of integer data a at the address p.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
-FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
-{
-    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
-}
-
-// Store 64-bits of integer data from a into memory using a non-temporal memory
-// hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
-FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
-{
-    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
-// point elements) from a into memory using a non-temporal memory hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
-FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, (float32x4_t *) p);
-#else
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-#endif
-}
-
-// Subtract packed single-precision (32-bit) floating-point elements in b from
-// packed single-precision (32-bit) floating-point elements in a, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
-FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Subtract the lower single-precision (32-bit) floating-point element in b from
-// the lower single-precision (32-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper 3 packed elements from
-// a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
-FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_sub_ps(a, b));
-}
-
-// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
-// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
-// transposed matrix in these vectors (row0 now contains column 0, etc.).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
-#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
-    do {                                                  \
-        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
-        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
-        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
-                            vget_low_f32(ROW23.val[0]));  \
-        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
-                            vget_low_f32(ROW23.val[1]));  \
-        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
-                            vget_high_f32(ROW23.val[0])); \
-        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
-                            vget_high_f32(ROW23.val[1])); \
-    } while (0)
-
-// according to the documentation, these intrinsics behave the same as the
-// non-'u' versions.  We'll just alias them here.
-#define _mm_ucomieq_ss _mm_comieq_ss
-#define _mm_ucomige_ss _mm_comige_ss
-#define _mm_ucomigt_ss _mm_comigt_ss
-#define _mm_ucomile_ss _mm_comile_ss
-#define _mm_ucomilt_ss _mm_comilt_ss
-#define _mm_ucomineq_ss _mm_comineq_ss
-
-// Return vector of type __m128i with undefined elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
-FORCE_INLINE __m128i _mm_undefined_si128(void)
-{
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-    __m128i a;
-    return a;
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-// Return vector of type __m128 with undefined elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
-FORCE_INLINE __m128 _mm_undefined_ps(void)
-{
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-    __m128 a;
-    return a;
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-// Unpack and interleave single-precision (32-bit) floating-point elements from
-// the high half a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
-FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
-    float32x2x2_t result = vzip_f32(a1, b1);
-    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave single-precision (32-bit) floating-point elements from
-// the low half of a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
-FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
-    float32x2x2_t result = vzip_f32(a1, b1);
-    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
-FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-/* SSE2 */
-
-// Add packed 16-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
-FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Add packed 32-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
-FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Add packed 64-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
-FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s64(
-        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-}
-
-// Add packed 8-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
-FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Add packed double-precision (64-bit) floating-point elements in a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
-FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1] + db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Add the lower double-precision (64-bit) floating-point element in a and b,
-// store the result in the lower element of dst, and copy the upper element from
-// a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
-FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_add_pd(a, b));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Add 64-bit integers a and b, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
-FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s64(
-        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
-}
-
-// Add packed signed 16-bit integers in a and b using saturation, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
-FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Add packed signed 8-bit integers in a and b using saturation, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
-FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Add packed unsigned 16-bit integers in a and b using saturation, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
-FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Add packed unsigned 8-bit integers in a and b using saturation, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
-FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Compute the bitwise AND of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
-FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_s64(
-        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
-FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
-// elements in a and then AND with b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
-FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
-{
-    // *NOTE* argument swap
-    return vreinterpretq_m128d_s64(
-        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
-}
-
-// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
-// AND with b, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
-FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vbicq_s32(vreinterpretq_s32_m128i(b),
-                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
-}
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
-FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
-{
-    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
-                                 vreinterpretq_u16_m128i(b));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
-FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Shift a left by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
-#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
-
-// Shift a right by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
-#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
-
-// Cast vector of type __m128d to type __m128. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
-FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
-{
-    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
-}
-
-// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
-FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
-{
-    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
-}
-
-// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
-FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
-{
-    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
-}
-
-// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
-FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
-{
-    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
-}
-
-// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
-FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
-#else
-    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
-#endif
-}
-
-// Cast vector of type __m128i to type __m128. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
-FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
-{
-    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
-}
-
-// Invalidate and flush the cache line that contains p from all levels of the
-// cache hierarchy.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
-#if defined(__APPLE__)
-#include <libkern/OSCacheControl.h>
-#endif
-FORCE_INLINE void _mm_clflush(void const *p)
-{
-    (void) p;
-
-    /* sys_icache_invalidate is supported since macOS 10.5.
-     * However, it does not work on non-jailbroken iOS devices, although the
-     * compilation is successful.
-     */
-#if defined(__APPLE__)
-    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
-#elif defined(__GNUC__) || defined(__clang__)
-    uintptr_t ptr = (uintptr_t) p;
-    __builtin___clear_cache((char *) ptr,
-                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
-#else
-    /* FIXME: MSVC support */
-#endif
-}
-
-// Compare packed 16-bit integers in a and b for equality, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
-FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed 32-bit integers in a and b for equality, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed 8-bit integers in a and b for equality, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
-FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for equality, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
-FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for equality, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
-FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
-FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for greater-than-or-equal, store the result in the lower element of dst,
-// and copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
-FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
-#else
-    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed signed 16-bit integers in a and b for greater-than, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
-FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed signed 32-bit integers in a and b for greater-than, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
-FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b for greater-than, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
-FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
-FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for greater-than, store the result in the lower element of dst, and copy
-// the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
-FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
-#else
-    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
-FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for less-than-or-equal, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
-FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmple_pd(a, b));
-#else
-    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed signed 16-bit integers in a and b for less-than, and store the
-// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
-// order of the operands switched.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
-FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed signed 32-bit integers in a and b for less-than, and store the
-// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
-// order of the operands switched.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
-FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b for less-than, and store the
-// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
-// order of the operands switched.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
-FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
-FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for less-than, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
-FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
-FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
-#else
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-equal, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
-FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
-FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-greater-than-or-equal, store the result in the lower element of
-// dst, and copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
-FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
-FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-greater-than, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
-FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
-FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-less-than-or-equal, store the result in the lower element of dst,
-// and copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
-FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
-FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-less-than, store the result in the lower element of dst, and copy
-// the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
-FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// to see if neither is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
-FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    // Excluding NaNs, any two floating point numbers can be compared.
-    uint64x2_t not_nan_a =
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
-    uint64x2_t not_nan_b =
-        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
-    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b to see if neither is NaN, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
-FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// to see if either is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
-FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    // Two NaNs are not equal in comparison operation.
-    uint64x2_t not_nan_a =
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
-    uint64x2_t not_nan_b =
-        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
-    return vreinterpretq_m128d_s32(
-        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b to see if either is NaN, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
-FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for greater-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
-FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 >= *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for greater-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
-FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 > *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for less-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
-FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 <= *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for less-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
-FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 < *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for equality, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
-FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
-#else
-    uint32x4_t a_not_nan =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
-    uint32x4_t b_not_nan =
-        vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
-    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-    uint32x4_t a_eq_b =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
-    uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
-                                       vreinterpretq_u64_u32(a_eq_b));
-    return vgetq_lane_u64(and_results, 0) & 0x1;
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for not-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
-FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
-{
-    return !_mm_comieq_sd(a, b);
-}
-
-// Convert packed signed 32-bit integers in a to packed double-precision
-// (64-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
-FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
-#else
-    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
-    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Convert packed signed 32-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
-FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
-FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
-{
-// vrnd32xq_f64 not supported on clang
-#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
-    float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
-    int64x2_t integers = vcvtq_s64_f64(rounded);
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
-#else
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double d0 = ((double *) &rnd)[0];
-    double d1 = ((double *) &rnd)[1];
-    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
-#endif
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
-FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
-{
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double d0 = ((double *) &rnd)[0];
-    double d1 = ((double *) &rnd)[1];
-    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
-    return vreinterpret_m64_s32(vld1_s32(data));
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed single-precision (32-bit) floating-point elements, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
-FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
-{
-#if defined(__aarch64__)
-    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
-    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
-#else
-    float a0 = (float) ((double *) &a)[0];
-    float a1 = (float) ((double *) &a)[1];
-    return _mm_set_ps(0, 0, a1, a0);
-#endif
-}
-
-// Convert packed signed 32-bit integers in a to packed double-precision
-// (64-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
-FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
-#else
-    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
-    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
-// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
-// does not support! It is supported on ARMv8-A however.
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
-{
-#if defined(__ARM_FEATURE_FRINT)
-    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
-#elif defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    switch (_MM_GET_ROUNDING_MODE()) {
-    case _MM_ROUND_NEAREST:
-        return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
-    case _MM_ROUND_DOWN:
-        return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
-    case _MM_ROUND_UP:
-        return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
-    default:  // _MM_ROUND_TOWARD_ZERO
-        return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
-    }
-#else
-    float *f = (float *) &a;
-    switch (_MM_GET_ROUNDING_MODE()) {
-    case _MM_ROUND_NEAREST: {
-        uint32x4_t signmask = vdupq_n_u32(0x80000000);
-        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
-                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
-        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
-            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
-        int32x4_t r_trunc = vcvtq_s32_f32(
-            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
-        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
-            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
-        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
-                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
-        float32x4_t delta = vsubq_f32(
-            vreinterpretq_f32_m128(a),
-            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
-        uint32x4_t is_delta_half =
-            vceqq_f32(delta, half); /* delta == +/- 0.5 */
-        return vreinterpretq_m128i_s32(
-            vbslq_s32(is_delta_half, r_even, r_normal));
-    }
-    case _MM_ROUND_DOWN:
-        return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
-                             floorf(f[0]));
-    case _MM_ROUND_UP:
-        return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
-                             ceilf(f[0]));
-    default:  // _MM_ROUND_TOWARD_ZERO
-        return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
-                             (int32_t) f[0]);
-    }
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed double-precision (64-bit) floating-point elements, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
-FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
-#else
-    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Copy the lower double-precision (64-bit) floating-point element of a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
-FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
-{
-#if defined(__aarch64__)
-    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
-#else
-    return ((double *) &a)[0];
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
-FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
-{
-#if defined(__aarch64__)
-    return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
-    return (int32_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
-FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
-{
-#if defined(__aarch64__)
-    return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
-    return (int64_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
-#define _mm_cvtsd_si64x _mm_cvtsd_si64
-
-// Convert the lower double-precision (64-bit) floating-point element in b to a
-// single-precision (32-bit) floating-point element, store the result in the
-// lower element of dst, and copy the upper 3 packed elements from a to the
-// upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
-FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(vsetq_lane_f32(
-        vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
-        vreinterpretq_f32_m128(a), 0));
-#else
-    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
-                                                 vreinterpretq_f32_m128(a), 0));
-#endif
-}
-
-// Copy the lower 32-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
-FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
-{
-    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
-FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
-{
-    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
-#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
-// Convert the signed 32-bit integer b to a double-precision (64-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
-FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
-#else
-    double bf = (double) b;
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
-#endif
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
-#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
-// Copy 32-bit integer a to the lower elements of dst, and zero the upper
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
-FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
-{
-    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
-}
-
-// Convert the signed 64-bit integer b to a double-precision (64-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
-FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
-#else
-    double bf = (double) b;
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
-#endif
-}
-
-// Copy 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
-FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
-{
-    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
-}
-
-// Copy 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
-#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
-
-// Convert the signed 64-bit integer b to a double-precision (64-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
-#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
-
-// Convert the lower single-precision (32-bit) floating-point element in b to a
-// double-precision (64-bit) floating-point element, store the result in the
-// lower element of dst, and copy the upper element from a to the upper element
-// of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
-FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
-{
-    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
-#else
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
-#endif
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
-FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
-{
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
-    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
-FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
-{
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
-    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
-    return vreinterpret_m64_s32(vld1_s32(data));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
-FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
-{
-    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
-FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
-{
-    double ret = *((double *) &a);
-    return (int32_t) ret;
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
-FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
-    double ret = *((double *) &a);
-    return (int64_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
-#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
-
-// Divide packed double-precision (64-bit) floating-point elements in a by
-// packed elements in b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
-FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] / db[0];
-    c[1] = da[1] / db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Divide the lower double-precision (64-bit) floating-point element in a by the
-// lower double-precision (64-bit) floating-point element in b, store the result
-// in the lower element of dst, and copy the upper element from a to the upper
-// element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
-FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    float64x2_t tmp =
-        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
-#else
-    return _mm_move_sd(a, _mm_div_pd(a, b));
-#endif
-}
-
-// Extract a 16-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
-// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
-#define _mm_extract_epi16(a, imm) \
-    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
-// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
-//                                       __constrange(0,8) int imm)
-#define _mm_insert_epi16(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s16(                                     \
-            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
-    })
-
-// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
-FORCE_INLINE __m128d _mm_load_pd(const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_f64(p));
-#else
-    const float *fp = (const float *) p;
-    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
-    return vreinterpretq_m128d_f32(vld1q_f32(data));
-#endif
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
-#define _mm_load_pd1 _mm_load1_pd
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower of dst, and zero the upper element. mem_addr does not need to be
-// aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
-FORCE_INLINE __m128d _mm_load_sd(const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
-#else
-    const float *fp = (const float *) p;
-    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
-    return vreinterpretq_m128d_f32(vld1q_f32(data));
-#endif
-}
-
-// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
-// on a 16-byte boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
-FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
-{
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
-FORCE_INLINE __m128d _mm_load1_pd(const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
-#else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
-#endif
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// upper element of dst, and copy the lower element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
-FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
-#else
-    return vreinterpretq_m128d_f32(vcombine_f32(
-        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
-#endif
-}
-
-// Load 64-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
-FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
-{
-    /* Load the lower 64 bits of the value pointed to by p into the
-     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
-     */
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower element of dst, and copy the upper element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
-FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
-#else
-    return vreinterpretq_m128d_f32(
-        vcombine_f32(vld1_f32((const float *) p),
-                     vget_high_f32(vreinterpretq_f32_m128d(a))));
-#endif
-}
-
-// Load 2 double-precision (64-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
-FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
-{
-#if defined(__aarch64__)
-    float64x2_t v = vld1q_f64(p);
-    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
-#else
-    int64x2_t v = vld1q_s64((const int64_t *) p);
-    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
-#endif
-}
-
-// Loads two double-precision from unaligned memory, floating-point values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
-FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
-{
-    return _mm_load_pd(p);
-}
-
-// Load 128-bits of integer data from memory into dst. mem_addr does not need to
-// be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
-FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
-{
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-}
-
-// Load unaligned 32-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
-FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
-{
-    return vreinterpretq_m128i_s32(
-        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
-// 32-bit integers, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
-FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
-{
-    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-                              vget_low_s16(vreinterpretq_s16_m128i(b)));
-#if defined(__aarch64__)
-    int32x4_t high =
-        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
-
-    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
-#else
-    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
-                               vget_high_s16(vreinterpretq_s16_m128i(b)));
-
-    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
-    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
-
-    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
-#endif
-}
-
-// Conditionally store 8-bit integer elements from a into memory using mask
-// (elements are not stored when the highest bit is not set in the corresponding
-// element) and a non-temporal memory hint. mem_addr does not need to be aligned
-// on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
-FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
-{
-    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
-    __m128 b = _mm_load_ps((const float *) mem_addr);
-    int8x16_t masked =
-        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
-                 vreinterpretq_s8_m128(b));
-    vst1q_s8((int8_t *) mem_addr, masked);
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
-FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
-FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b,
-// and store packed maximum values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
-FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-#if SSE2NEON_PRECISE_MINMAX
-    float64x2_t _a = vreinterpretq_f64_m128d(a);
-    float64x2_t _b = vreinterpretq_f64_m128d(b);
-    return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128d_f64(
-        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#endif
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b, store the maximum value in the lower element of dst, and copy the upper
-// element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
-FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_max_pd(a, b));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
-    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
-#endif
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
-FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
-FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b,
-// and store packed minimum values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
-FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-#if SSE2NEON_PRECISE_MINMAX
-    float64x2_t _a = vreinterpretq_f64_m128d(a);
-    float64x2_t _b = vreinterpretq_f64_m128d(b);
-    return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128d_f64(
-        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#endif
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b, store the minimum value in the lower element of dst, and copy the upper
-// element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
-FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_min_pd(a, b));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
-    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
-#endif
-}
-
-// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
-// upper element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
-FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
-{
-    return vreinterpretq_m128i_s64(
-        vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
-}
-
-// Move the lower double-precision (64-bit) floating-point element from b to the
-// lower element of dst, and copy the upper element from a to the upper element
-// of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
-FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_f32(
-        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
-                     vget_high_f32(vreinterpretq_f32_m128d(a))));
-}
-
-// Create mask from the most significant bit of each 8-bit element in a, and
-// store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
-FORCE_INLINE int _mm_movemask_epi8(__m128i a)
-{
-    // Use increasingly wide shifts+adds to collect the sign bits
-    // together.
-    // Since the widening shifts would be rather confusing to follow in little
-    // endian, everything will be illustrated in big endian order instead. This
-    // has a different result - the bits would actually be reversed on a big
-    // endian machine.
-
-    // Starting input (only half the elements are shown):
-    // 89 ff 1d c0 00 10 99 33
-    uint8x16_t input = vreinterpretq_u8_m128i(a);
-
-    // Shift out everything but the sign bits with an unsigned shift right.
-    //
-    // Bytes of the vector::
-    // 89 ff 1d c0 00 10 99 33
-    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
-    //  |  |  |  |  |  |  |  |
-    // 01 01 00 01 00 00 01 00
-    //
-    // Bits of first important lane(s):
-    // 10001001 (89)
-    // \______
-    //        |
-    // 00000001 (01)
-    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
-
-    // Merge the even lanes together with a 16-bit unsigned shift right + add.
-    // 'xx' represents garbage data which will be ignored in the final result.
-    // In the important bytes, the add functions like a binary OR.
-    //
-    // 01 01 00 01 00 00 01 00
-    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
-    //    \|    \|    \|    \|
-    // xx 03 xx 01 xx 00 xx 02
-    //
-    // 00000001 00000001 (01 01)
-    //        \_______ |
-    //                \|
-    // xxxxxxxx xxxxxx11 (xx 03)
-    uint32x4_t paired16 =
-        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
-
-    // Repeat with a wider 32-bit shift + add.
-    // xx 03 xx 01 xx 00 xx 02
-    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
-    //     14))
-    //          \|          \|
-    // xx xx xx 0d xx xx xx 02
-    //
-    // 00000011 00000001 (03 01)
-    //        \\_____ ||
-    //         '----.\||
-    // xxxxxxxx xxxx1101 (xx 0d)
-    uint64x2_t paired32 =
-        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
-
-    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
-    // lanes. xx xx xx 0d xx xx xx 02
-    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
-    //            28))
-    //                      \|
-    // xx xx xx xx xx xx xx d2
-    //
-    // 00001101 00000010 (0d 02)
-    //     \   \___ |  |
-    //      '---.  \|  |
-    // xxxxxxxx 11010010 (xx d2)
-    uint8x16_t paired64 =
-        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
-
-    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
-    // xx xx xx xx xx xx xx d2
-    //                      ||  return paired64[0]
-    //                      d2
-    // Note: Little endian would return the correct value 4b (01001011) instead.
-    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
-}
-
-// Set each bit of mask dst based on the most significant bit of the
-// corresponding packed double-precision (64-bit) floating-point element in a.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
-FORCE_INLINE int _mm_movemask_pd(__m128d a)
-{
-    uint64x2_t input = vreinterpretq_u64_m128d(a);
-    uint64x2_t high_bits = vshrq_n_u64(input, 63);
-    return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
-FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
-{
-    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
-}
-
-// Copy the 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
-FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
-{
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
-}
-
-// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
-// a and b, and store the unsigned 64-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
-FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
-{
-    // vmull_u32 upcasts instead of masking, so we downcast.
-    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
-    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
-    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
-}
-
-// Multiply packed double-precision (64-bit) floating-point elements in a and b,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
-FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] * db[0];
-    c[1] = da[1] * db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Multiply the lower double-precision (64-bit) floating-point element in a and
-// b, store the result in the lower element of dst, and copy the upper element
-// from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
-FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_mul_pd(a, b));
-}
-
-// Multiply the low unsigned 32-bit integers from a and b, and store the
-// unsigned 64-bit result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
-FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u64(vget_low_u64(
-        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
-}
-
-// Multiply the packed signed 16-bit integers in a and b, producing intermediate
-// 32-bit integers, and store the high 16 bits of the intermediate integers in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
-FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
-{
-    /* FIXME: issue with large values because of result saturation */
-    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
-    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
-    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
-    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
-    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
-    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
-    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
-    uint16x8x2_t r =
-        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
-    return vreinterpretq_m128i_u16(r.val[1]);
-}
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
-FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
-{
-    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
-    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
-    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
-#if defined(__aarch64__)
-    uint32x4_t ab7654 =
-        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
-    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
-                              vreinterpretq_u16_u32(ab7654));
-    return vreinterpretq_m128i_u16(r);
-#else
-    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
-    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
-    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
-    uint16x8x2_t r =
-        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
-    return vreinterpretq_m128i_u16(r.val[1]);
-#endif
-}
-
-// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
-// integers, and store the low 16 bits of the intermediate integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
-FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compute the bitwise OR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
-FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_s64(
-        vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
-// and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
-FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-// using signed saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
-FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
-                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
-}
-
-// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-// using signed saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
-FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
-                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
-}
-
-// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-// using unsigned saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
-FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
-                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
-}
-
-// Pause the processor. This is typically used in spin-wait loops and depending
-// on the x86 processor typical values are in the 40-100 cycle range. The
-// 'yield' instruction isn't a good fit because it's effectively a nop on most
-// Arm cores. Experience with several databases has shown has shown an 'isb' is
-// a reasonable approximation.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
-FORCE_INLINE void _mm_pause()
-{
-    __asm__ __volatile__("isb\n");
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce two
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of 64-bit elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
-FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
-{
-    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
-    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
-}
-
-// Set packed 16-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
-FORCE_INLINE __m128i _mm_set_epi16(short i7,
-                                   short i6,
-                                   short i5,
-                                   short i4,
-                                   short i3,
-                                   short i2,
-                                   short i1,
-                                   short i0)
-{
-    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
-    return vreinterpretq_m128i_s16(vld1q_s16(data));
-}
-
-// Set packed 32-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
-FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
-{
-    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Set packed 64-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
-FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
-{
-    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
-}
-
-// Set packed 64-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
-{
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
-}
-
-// Set packed 8-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
-FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
-                                  signed char b14,
-                                  signed char b13,
-                                  signed char b12,
-                                  signed char b11,
-                                  signed char b10,
-                                  signed char b9,
-                                  signed char b8,
-                                  signed char b7,
-                                  signed char b6,
-                                  signed char b5,
-                                  signed char b4,
-                                  signed char b3,
-                                  signed char b2,
-                                  signed char b1,
-                                  signed char b0)
-{
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
-    return (__m128i) vld1q_s8(data);
-}
-
-// Set packed double-precision (64-bit) floating-point elements in dst with the
-// supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
-FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
-{
-    double ALIGN_STRUCT(16) data[2] = {e0, e1};
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
-#else
-    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
-#endif
-}
-
-// Broadcast double-precision (64-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
-#define _mm_set_pd1 _mm_set1_pd
-
-// Copy double-precision (64-bit) floating-point element a to the lower element
-// of dst, and zero the upper element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
-FORCE_INLINE __m128d _mm_set_sd(double a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
-#else
-    return _mm_set_pd(0, a);
-#endif
-}
-
-// Broadcast 16-bit integer a to all all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
-FORCE_INLINE __m128i _mm_set1_epi16(short w)
-{
-    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
-}
-
-// Broadcast 32-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
-FORCE_INLINE __m128i _mm_set1_epi32(int _i)
-{
-    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
-}
-
-// Broadcast 64-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
-FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
-{
-    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
-}
-
-// Broadcast 64-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
-FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
-{
-    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
-}
-
-// Broadcast 8-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
-FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
-{
-    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
-}
-
-// Broadcast double-precision (64-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
-FORCE_INLINE __m128d _mm_set1_pd(double d)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vdupq_n_f64(d));
-#else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
-#endif
-}
-
-// Set packed 16-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
-FORCE_INLINE __m128i _mm_setr_epi16(short w0,
-                                    short w1,
-                                    short w2,
-                                    short w3,
-                                    short w4,
-                                    short w5,
-                                    short w6,
-                                    short w7)
-{
-    int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
-    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
-}
-
-// Set packed 32-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
-FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
-{
-    int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Set packed 64-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
-FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
-{
-    return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
-}
-
-// Set packed 8-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
-FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
-                                   signed char b1,
-                                   signed char b2,
-                                   signed char b3,
-                                   signed char b4,
-                                   signed char b5,
-                                   signed char b6,
-                                   signed char b7,
-                                   signed char b8,
-                                   signed char b9,
-                                   signed char b10,
-                                   signed char b11,
-                                   signed char b12,
-                                   signed char b13,
-                                   signed char b14,
-                                   signed char b15)
-{
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
-    return (__m128i) vld1q_s8(data);
-}
-
-// Set packed double-precision (64-bit) floating-point elements in dst with the
-// supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
-FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
-{
-    return _mm_set_pd(e0, e1);
-}
-
-// Return vector of type __m128d with all elements set to zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
-FORCE_INLINE __m128d _mm_setzero_pd(void)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vdupq_n_f64(0));
-#else
-    return vreinterpretq_m128d_f32(vdupq_n_f32(0));
-#endif
-}
-
-// Return vector of type __m128i with all elements set to zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
-FORCE_INLINE __m128i _mm_setzero_si128(void)
-{
-    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
-}
-
-// Shuffle 32-bit integers in a using the control in imm8, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
-// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
-//                                        __constrange(0,255) int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_epi32(a, imm)                                            \
-    __extension__({                                                          \
-        int32x4_t _input = vreinterpretq_s32_m128i(a);                       \
-        int32x4_t _shuf =                                                    \
-            vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
-                          ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
-        vreinterpretq_m128i_s32(_shuf);                                      \
-    })
-#else  // generic
-#define _mm_shuffle_epi32(a, imm)                        \
-    __extension__({                                      \
-        __m128i ret;                                     \
-        switch (imm) {                                   \
-        case _MM_SHUFFLE(1, 0, 3, 2):                    \
-            ret = _mm_shuffle_epi_1032((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 3, 0, 1):                    \
-            ret = _mm_shuffle_epi_2301((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 3, 2, 1):                    \
-            ret = _mm_shuffle_epi_0321((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 1, 0, 3):                    \
-            ret = _mm_shuffle_epi_2103((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(1, 0, 1, 0):                    \
-            ret = _mm_shuffle_epi_1010((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(1, 0, 0, 1):                    \
-            ret = _mm_shuffle_epi_1001((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 1, 0, 1):                    \
-            ret = _mm_shuffle_epi_0101((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 2, 1, 1):                    \
-            ret = _mm_shuffle_epi_2211((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 1, 2, 2):                    \
-            ret = _mm_shuffle_epi_0122((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(3, 3, 3, 2):                    \
-            ret = _mm_shuffle_epi_3332((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 0, 0, 0):                    \
-            ret = _mm_shuffle_epi32_splat((a), 0);       \
-            break;                                       \
-        case _MM_SHUFFLE(1, 1, 1, 1):                    \
-            ret = _mm_shuffle_epi32_splat((a), 1);       \
-            break;                                       \
-        case _MM_SHUFFLE(2, 2, 2, 2):                    \
-            ret = _mm_shuffle_epi32_splat((a), 2);       \
-            break;                                       \
-        case _MM_SHUFFLE(3, 3, 3, 3):                    \
-            ret = _mm_shuffle_epi32_splat((a), 3);       \
-            break;                                       \
-        default:                                         \
-            ret = _mm_shuffle_epi32_default((a), (imm)); \
-            break;                                       \
-        }                                                \
-        ret;                                             \
-    })
-#endif
-
-// Shuffle double-precision (64-bit) floating-point elements using the control
-// in imm8, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_pd(a, b, imm8)                                            \
-    vreinterpretq_m128d_s64(                                                  \
-        vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
-                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
-#else
-#define _mm_shuffle_pd(a, b, imm8)                                     \
-    _mm_castsi128_pd(_mm_set_epi64x(                                   \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
-#endif
-
-// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
-//                                          __constrange(0,255) int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shufflehi_epi16(a, imm)                                           \
-    __extension__({                                                           \
-        int16x8_t _input = vreinterpretq_s16_m128i(a);                        \
-        int16x8_t _shuf =                                                     \
-            vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
-                          (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
-                          (((imm) >> 6) & 0x3) + 4);                          \
-        vreinterpretq_m128i_s16(_shuf);                                       \
-    })
-#else  // generic
-#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
-#endif
-
-// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
-//                                          __constrange(0,255) int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shufflelo_epi16(a, imm)                                  \
-    __extension__({                                                  \
-        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
-        int16x8_t _shuf = vshuffleq_s16(                             \
-            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
-            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
-        vreinterpretq_m128i_s16(_shuf);                              \
-    })
-#else  // generic
-#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
-#endif
-
-// Shift packed 16-bit integers in a left by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
-FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~15))
-        return _mm_setzero_si128();
-
-    int16x8_t vc = vdupq_n_s16((int16_t) c);
-    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
-}
-
-// Shift packed 32-bit integers in a left by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
-FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~31))
-        return _mm_setzero_si128();
-
-    int32x4_t vc = vdupq_n_s32((int32_t) c);
-    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
-}
-
-// Shift packed 64-bit integers in a left by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
-FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~63))
-        return _mm_setzero_si128();
-
-    int64x2_t vc = vdupq_n_s64((int64_t) c);
-    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
-}
-
-// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
-FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~15))
-        return _mm_setzero_si128();
-    return vreinterpretq_m128i_s16(
-        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
-}
-
-// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
-FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~31))
-        return _mm_setzero_si128();
-    return vreinterpretq_m128i_s32(
-        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
-}
-
-// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
-FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~63))
-        return _mm_setzero_si128();
-    return vreinterpretq_m128i_s64(
-        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
-}
-
-// Shift a left by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
-#define _mm_slli_si128(a, imm)                                         \
-    __extension__({                                                    \
-        int8x16_t ret;                                                 \
-        if (_sse2neon_unlikely(imm == 0))                              \
-            ret = vreinterpretq_s8_m128i(a);                           \
-        else if (_sse2neon_unlikely((imm) & ~15))                      \
-            ret = vdupq_n_s8(0);                                       \
-        else                                                           \
-            ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a),   \
-                           ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
-        vreinterpretq_m128i_s8(ret);                                   \
-    })
-
-// Compute the square root of packed double-precision (64-bit) floating-point
-// elements in a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
-FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
-#else
-    double a0 = sqrt(((double *) &a)[0]);
-    double a1 = sqrt(((double *) &a)[1]);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Compute the square root of the lower double-precision (64-bit) floating-point
-// element in b, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
-FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_sqrt_pd(b));
-#else
-    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
-#endif
-}
-
-// Shift packed 16-bit integers in a right by count while shifting in sign bits,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
-FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
-{
-    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-    if (_sse2neon_unlikely(c & ~15))
-        return _mm_cmplt_epi16(a, _mm_setzero_si128());
-    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
-}
-
-// Shift packed 32-bit integers in a right by count while shifting in sign bits,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
-FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
-{
-    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-    if (_sse2neon_unlikely(c & ~31))
-        return _mm_cmplt_epi32(a, _mm_setzero_si128());
-    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
-}
-
-// Shift packed 16-bit integers in a right by imm8 while shifting in sign
-// bits, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
-FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
-{
-    const int count = (imm & ~15) ? 15 : imm;
-    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
-}
-
-// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
-// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srai_epi32(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) == 0)) {                                \
-            ret = a;                                                         \
-        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) {              \
-            ret = vreinterpretq_m128i_s32(                                   \
-                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_s32(                                   \
-                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));                \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift packed 16-bit integers in a right by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
-FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~15))
-        return _mm_setzero_si128();
-
-    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
-    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
-}
-
-// Shift packed 32-bit integers in a right by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
-FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~31))
-        return _mm_setzero_si128();
-
-    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
-    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
-}
-
-// Shift packed 64-bit integers in a right by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
-FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~63))
-        return _mm_setzero_si128();
-
-    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
-    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
-}
-
-// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) & ~15)) {                               \
-            ret = _mm_setzero_si128();                                       \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_u16(                                   \
-                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
-// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_epi32(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) & ~31)) {                               \
-            ret = _mm_setzero_si128();                                       \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_u32(                                   \
-                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
-#define _mm_srli_epi64(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) & ~63)) {                               \
-            ret = _mm_setzero_si128();                                       \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_u64(                                   \
-                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift a right by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
-#define _mm_srli_si128(a, imm)                                       \
-    __extension__({                                                  \
-        int8x16_t ret;                                               \
-        if (_sse2neon_unlikely((imm) & ~15))                         \
-            ret = vdupq_n_s8(0);                                     \
-        else                                                         \
-            ret = vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), \
-                           (imm > 15 ? 0 : imm));                    \
-        vreinterpretq_m128i_s8(ret);                                 \
-    })
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-// or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
-FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
-#else
-    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
-#endif
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
-FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
-    vst1q_f64((float64_t *) mem_addr,
-              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
-#else
-    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
-    vst1q_f32((float32_t *) mem_addr,
-              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
-#endif
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// memory. mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
-FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
-#else
-    vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
-#endif
-}
-
-// Store 128-bits of integer data from a into memory. mem_addr must be aligned
-// on a 16-byte boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
-FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
-{
-    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
-#define _mm_store1_pd _mm_store_pd1
-
-// Store the upper double-precision (64-bit) floating-point element from a into
-// memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
-FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
-#else
-    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
-#endif
-}
-
-// Store 64-bit integer from the first element of a into memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
-FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
-{
-    vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
-FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
-#else
-    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
-#endif
-}
-
-// Store 2 double-precision (64-bit) floating-point elements from a into memory
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
-FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
-{
-    float32x4_t f = vreinterpretq_f32_m128d(a);
-    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
-}
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
-FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
-{
-    _mm_store_pd(mem_addr, a);
-}
-
-// Store 128-bits of integer data from a into memory. mem_addr does not need to
-// be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
-FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
-{
-    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-}
-
-// Store 32-bit integer from the first element of a into memory. mem_addr does
-// not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
-FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
-{
-    vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
-}
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory using a non-temporal memory hint. mem_addr must
-// be aligned on a 16-byte boundary or a general-protection exception may be
-// generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
-FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, (float32x4_t *) p);
-#elif defined(__aarch64__)
-    vst1q_f64(p, vreinterpretq_f64_m128d(a));
-#else
-    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
-#endif
-}
-
-// Store 128-bits of integer data from a into memory using a non-temporal memory
-// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
-// exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
-FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, p);
-#else
-    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
-#endif
-}
-
-// Store 32-bit integer a into memory using a non-temporal hint to minimize
-// cache pollution. If the cache line containing address mem_addr is already in
-// the cache, the cache will be updated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
-FORCE_INLINE void _mm_stream_si32(int *p, int a)
-{
-    vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
-}
-
-// Store 64-bit integer a into memory using a non-temporal hint to minimize
-// cache pollution. If the cache line containing address mem_addr is already in
-// the cache, the cache will be updated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
-FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
-{
-    vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
-}
-
-// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
-FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
-FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
-FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s64(
-        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-}
-
-// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
-FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Subtract packed double-precision (64-bit) floating-point elements in b from
-// packed double-precision (64-bit) floating-point elements in a, and store the
-// results in dst.
-//  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
-FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] - db[0];
-    c[1] = da[1] - db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Subtract the lower double-precision (64-bit) floating-point element in b from
-// the lower double-precision (64-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
-FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_sub_pd(a, b));
-}
-
-// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
-FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s64(
-        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
-}
-
-// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
-// using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
-FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
-// using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
-FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
-// integers in a using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
-FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
-// integers in a using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
-FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-#define _mm_ucomieq_sd _mm_comieq_sd
-#define _mm_ucomige_sd _mm_comige_sd
-#define _mm_ucomigt_sd _mm_comigt_sd
-#define _mm_ucomile_sd _mm_comile_sd
-#define _mm_ucomilt_sd _mm_comilt_sd
-#define _mm_ucomineq_sd _mm_comineq_sd
-
-// Return vector of type __m128d with undefined elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
-FORCE_INLINE __m128d _mm_undefined_pd(void)
-{
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-    __m128d a;
-    return a;
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-// Unpack and interleave 16-bit integers from the high half of a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
-FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
-    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
-    int16x4x2_t result = vzip_s16(a1, b1);
-    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 32-bit integers from the high half of a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
-FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-#else
-    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
-    int32x2x2_t result = vzip_s32(a1, b1);
-    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 64-bit integers from the high half of a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
-FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s64(
-        vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
-    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
-    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
-#endif
-}
-
-// Unpack and interleave 8-bit integers from the high half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
-FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(
-        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
-    int8x8_t a1 =
-        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
-    int8x8_t b1 =
-        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
-    int8x8x2_t result = vzip_s8(a1, b1);
-    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave double-precision (64-bit) floating-point elements from
-// the high half of a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
-FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    return vreinterpretq_m128d_s64(
-        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
-                     vget_high_s64(vreinterpretq_s64_m128d(b))));
-#endif
-}
-
-// Unpack and interleave 16-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
-FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
-    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
-    int16x4x2_t result = vzip_s16(a1, b1);
-    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 32-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
-FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-#else
-    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
-    int32x2x2_t result = vzip_s32(a1, b1);
-    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 64-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
-FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s64(
-        vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
-    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
-    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
-#endif
-}
-
-// Unpack and interleave 8-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
-FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(
-        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
-    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
-    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
-    int8x8x2_t result = vzip_s8(a1, b1);
-    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave double-precision (64-bit) floating-point elements from
-// the low half of a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
-FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    return vreinterpretq_m128d_s64(
-        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
-                     vget_low_s64(vreinterpretq_s64_m128d(b))));
-#endif
-}
-
-// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
-FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_s64(
-        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
-// and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
-FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-/* SSE3 */
-
-// Alternatively add and subtract packed double-precision (64-bit)
-// floating-point elements in a to/from packed elements in b, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
-FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
-{
-    _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
-                                             vreinterpretq_f64_m128d(b),
-                                             vreinterpretq_f64_m128d(mask)));
-#else
-    return _mm_add_pd(_mm_mul_pd(b, mask), a);
-#endif
-}
-
-// Alternatively add and subtract packed single-precision (32-bit)
-// floating-point elements in a to/from packed elements in b, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
-FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
-{
-    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
-#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
-    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
-                                            vreinterpretq_f32_m128(mask),
-                                            vreinterpretq_f32_m128(b)));
-#else
-    return _mm_add_ps(_mm_mul_ps(b, mask), a);
-#endif
-}
-
-// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
-// elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
-FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[] = {da[0] + da[1], db[0] + db[1]};
-    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
-#endif
-}
-
-// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
-// elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
-FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of double-precision (64-bit)
-// floating-point elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
-FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
-{
-#if defined(__aarch64__)
-    float64x2_t a = vreinterpretq_f64_m128d(_a);
-    float64x2_t b = vreinterpretq_f64_m128d(_b);
-    return vreinterpretq_m128d_f64(
-        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
-#else
-    double *da = (double *) &_a;
-    double *db = (double *) &_b;
-    double c[] = {da[0] - da[1], db[0] - db[1]};
-    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of single-precision (32-bit)
-// floating-point elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
-FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
-{
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
-#else
-    float32x4x2_t c = vuzpq_f32(a, b);
-    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
-#endif
-}
-
-// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
-// may perform better than _mm_loadu_si128 when the data crosses a cache line
-// boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
-#define _mm_lddqu_si128 _mm_loadu_si128
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
-#define _mm_loaddup_pd _mm_load1_pd
-
-// Duplicate the low double-precision (64-bit) floating-point element from a,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
-FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
-#else
-    return vreinterpretq_m128d_u64(
-        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
-#endif
-}
-
-// Duplicate odd-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
-FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
-#elif defined(_sse2neon_shuffle)
-    return vreinterpretq_m128_f32(vshuffleq_s32(
-        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
-#else
-    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
-    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
-    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-#endif
-}
-
-// Duplicate even-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
-FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
-#elif defined(_sse2neon_shuffle)
-    return vreinterpretq_m128_f32(vshuffleq_s32(
-        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
-#else
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
-    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-#endif
-}
-
-/* SSSE3 */
-
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
-FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
-{
-    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
-FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
-{
-    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
-FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
-{
-    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
-FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
-{
-    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
-}
-
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
-FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
-{
-    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
-}
-
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
-FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
-{
-    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
-}
-
-// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
-// the result right by imm8 bytes, and store the low 16 bytes in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
-#define _mm_alignr_epi8(a, b, imm)                                            \
-    __extension__({                                                           \
-        uint8x16_t _a = vreinterpretq_u8_m128i(a);                            \
-        uint8x16_t _b = vreinterpretq_u8_m128i(b);                            \
-        __m128i ret;                                                          \
-        if (_sse2neon_unlikely((imm) & ~31))                                  \
-            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));                      \
-        else if (imm >= 16)                                                   \
-            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0);                \
-        else                                                                  \
-            ret =                                                             \
-                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
-        ret;                                                                  \
-    })
-
-// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
-// the result right by imm8 bytes, and store the low 8 bytes in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
-#define _mm_alignr_pi8(a, b, imm)                                           \
-    __extension__({                                                         \
-        __m64 ret;                                                          \
-        if (_sse2neon_unlikely((imm) >= 16)) {                              \
-            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
-        } else {                                                            \
-            uint8x8_t tmp_low, tmp_high;                                    \
-            if ((imm) >= 8) {                                               \
-                const int idx = (imm) -8;                                   \
-                tmp_low = vreinterpret_u8_m64(a);                           \
-                tmp_high = vdup_n_u8(0);                                    \
-                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
-            } else {                                                        \
-                const int idx = (imm);                                      \
-                tmp_low = vreinterpret_u8_m64(b);                           \
-                tmp_high = vreinterpret_u8_m64(a);                          \
-                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
-            }                                                               \
-        }                                                                   \
-        ret;                                                                \
-    })
-
-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
-FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
-#else
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
-                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
-#endif
-}
-
-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
-FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
-{
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
-#else
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
-                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
-#endif
-}
-
-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
-FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s16(
-        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
-FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s32(
-        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
-}
-
-// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
-// saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
-FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-    return vreinterpretq_s64_s16(
-        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-    // Interleave using vshrn/vmovn
-    // [a0|a2|a4|a6|b0|b2|b4|b6]
-    // [a1|a3|a5|a7|b1|b3|b5|b7]
-    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
-    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
-    // Saturated add
-    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
-#endif
-}
-
-// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
-// saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
-FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
-#else
-    int16x4x2_t res = vuzp_s16(a, b);
-    return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
-// the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
-FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int16x8x2_t c = vuzpq_s16(a, b);
-    return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
-// the signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
-FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
-{
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
-#else
-    int32x4x2_t c = vuzpq_s32(a, b);
-    return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
-// the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
-FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
-#else
-    int16x4x2_t c = vuzp_s16(a, b);
-    return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
-// the signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
-FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
-{
-    int32x2_t a = vreinterpret_s32_m64(_a);
-    int32x2_t b = vreinterpret_s32_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
-#else
-    int32x2x2_t c = vuzp_s32(a, b);
-    return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
-// using saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
-FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int16x8x2_t c = vuzpq_s16(a, b);
-    return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
-// using saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
-FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
-#else
-    int16x4x2_t c = vuzp_s16(a, b);
-    return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Vertically multiply each unsigned 8-bit integer from a with the corresponding
-// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
-// and pack the saturated results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
-FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
-    uint8x16_t a = vreinterpretq_u8_m128i(_a);
-    int8x16_t b = vreinterpretq_s8_m128i(_b);
-    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
-                             vmovl_s8(vget_low_s8(b)));
-    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
-                             vmovl_s8(vget_high_s8(b)));
-    return vreinterpretq_m128i_s16(
-        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
-#else
-    // This would be much simpler if x86 would choose to zero extend OR sign
-    // extend, not both. This could probably be optimized better.
-    uint16x8_t a = vreinterpretq_u16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-
-    // Zero extend a
-    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
-    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
-
-    // Sign extend by shifting left then shifting right.
-    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
-    int16x8_t b_odd = vshrq_n_s16(b, 8);
-
-    // multiply
-    int16x8_t prod1 = vmulq_s16(a_even, b_even);
-    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
-
-    // saturated add
-    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
-#endif
-}
-
-// Vertically multiply each unsigned 8-bit integer from a with the corresponding
-// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
-// pack the saturated results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
-FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
-{
-    uint16x4_t a = vreinterpret_u16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-
-    // Zero extend a
-    int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
-    int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
-
-    // Sign extend by shifting left then shifting right.
-    int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
-    int16x4_t b_odd = vshr_n_s16(b, 8);
-
-    // multiply
-    int16x4_t prod1 = vmul_s16(a_even, b_even);
-    int16x4_t prod2 = vmul_s16(a_odd, b_odd);
-
-    // saturated add
-    return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
-// the packed 16-bit integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
-FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
-{
-    // Has issues due to saturation
-    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
-
-    // Multiply
-    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
-    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
-                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
-
-    // Rounding narrowing shift right
-    // narrow = (int16_t)((mul + 16384) >> 15);
-    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
-    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
-
-    // Join together
-    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Truncate each intermediate integer to the 18 most
-// significant bits, round by adding 1, and store bits [16:1] to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
-FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
-{
-    int32x4_t mul_extend =
-        vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
-
-    // Rounding narrowing shift right
-    return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
-}
-
-// Shuffle packed 8-bit integers in a according to shuffle control mask in the
-// corresponding 8-bit element of b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
-FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
-{
-    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
-    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
-    uint8x16_t idx_masked =
-        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
-#elif defined(__GNUC__)
-    int8x16_t ret;
-    // %e and %f represent the even and odd D registers
-    // respectively.
-    __asm__ __volatile__(
-        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
-        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
-        : [ret] "=&w"(ret)
-        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
-    return vreinterpretq_m128i_s8(ret);
-#else
-    // use this line if testing on aarch64
-    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
-    return vreinterpretq_m128i_s8(
-        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
-                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
-#endif
-}
-
-// Shuffle packed 8-bit integers in a according to shuffle control mask in the
-// corresponding 8-bit element of b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
-FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
-{
-    const int8x8_t controlMask =
-        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
-    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
-    return vreinterpret_m64_s8(res);
-}
-
-// Negate packed 16-bit integers in a when the corresponding signed
-// 16-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
-FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFF : 0
-    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
-    // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__)
-    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
-#else
-    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
-    // 'a') based on ltMask
-    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
-    // res = masked & (~zeroMask)
-    int16x8_t res = vbicq_s16(masked, zeroMask);
-    return vreinterpretq_m128i_s16(res);
-}
-
-// Negate packed 32-bit integers in a when the corresponding signed
-// 32-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
-FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
-{
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFFFFFF : 0
-    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
-
-    // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__)
-    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
-#else
-    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
-    // 'a') based on ltMask
-    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
-    // res = masked & (~zeroMask)
-    int32x4_t res = vbicq_s32(masked, zeroMask);
-    return vreinterpretq_m128i_s32(res);
-}
-
-// Negate packed 8-bit integers in a when the corresponding signed
-// 8-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
-FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
-{
-    int8x16_t a = vreinterpretq_s8_m128i(_a);
-    int8x16_t b = vreinterpretq_s8_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFF : 0
-    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
-
-    // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__)
-    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
-#else
-    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
-    // based on ltMask
-    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
-    // res = masked & (~zeroMask)
-    int8x16_t res = vbicq_s8(masked, zeroMask);
-
-    return vreinterpretq_m128i_s8(res);
-}
-
-// Negate packed 16-bit integers in a when the corresponding signed 16-bit
-// integer in b is negative, and store the results in dst. Element in dst are
-// zeroed out when the corresponding element in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
-FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFF : 0
-    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
-
-    // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__)
-    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
-#else
-    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
-    // based on ltMask
-    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
-    // res = masked & (~zeroMask)
-    int16x4_t res = vbic_s16(masked, zeroMask);
-
-    return vreinterpret_m64_s16(res);
-}
-
-// Negate packed 32-bit integers in a when the corresponding signed 32-bit
-// integer in b is negative, and store the results in dst. Element in dst are
-// zeroed out when the corresponding element in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
-FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
-{
-    int32x2_t a = vreinterpret_s32_m64(_a);
-    int32x2_t b = vreinterpret_s32_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFFFFFF : 0
-    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
-
-    // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__)
-    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
-#else
-    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
-    // based on ltMask
-    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
-    // res = masked & (~zeroMask)
-    int32x2_t res = vbic_s32(masked, zeroMask);
-
-    return vreinterpret_m64_s32(res);
-}
-
-// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
-// in b is negative, and store the results in dst. Element in dst are zeroed out
-// when the corresponding element in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
-FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
-{
-    int8x8_t a = vreinterpret_s8_m64(_a);
-    int8x8_t b = vreinterpret_s8_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFF : 0
-    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
-
-    // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__)
-    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
-#else
-    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
-    // based on ltMask
-    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
-    // res = masked & (~zeroMask)
-    int8x8_t res = vbic_s8(masked, zeroMask);
-
-    return vreinterpret_m64_s8(res);
-}
-
-/* SSE4.1 */
-
-// Blend packed 16-bit integers from a and b using control mask imm8, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
-// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
-//                                      __constrange(0,255) int imm)
-#define _mm_blend_epi16(a, b, imm)                                            \
-    __extension__({                                                           \
-        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
-        uint16x8_t _mask_vec = vld1q_u16(_mask);                              \
-        uint16x8_t _a = vreinterpretq_u16_m128i(a);                           \
-        uint16x8_t _b = vreinterpretq_u16_m128i(b);                           \
-        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));                \
-    })
-
-// Blend packed double-precision (64-bit) floating-point elements from a and b
-// using control mask imm8, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
-#define _mm_blend_pd(a, b, imm)                                \
-    __extension__({                                            \
-        const uint64_t _mask[2] = {                            \
-            ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),   \
-            ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)};  \
-        uint64x2_t _mask_vec = vld1q_u64(_mask);               \
-        uint64x2_t _a = vreinterpretq_u64_m128d(a);            \
-        uint64x2_t _b = vreinterpretq_u64_m128d(b);            \
-        vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
-    })
-
-// Blend packed single-precision (32-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
-FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
-{
-    const uint32_t ALIGN_STRUCT(16)
-        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
-    uint32x4_t mask = vld1q_u32(data);
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
-}
-
-// Blend packed 8-bit integers from a and b using mask, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
-FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
-{
-    // Use a signed shift right to create a mask with the sign bit
-    uint8x16_t mask =
-        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
-    uint8x16_t a = vreinterpretq_u8_m128i(_a);
-    uint8x16_t b = vreinterpretq_u8_m128i(_b);
-    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
-}
-
-// Blend packed double-precision (64-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
-FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
-{
-    uint64x2_t mask =
-        vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
-#if defined(__aarch64__)
-    float64x2_t a = vreinterpretq_f64_m128d(_a);
-    float64x2_t b = vreinterpretq_f64_m128d(_b);
-    return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
-#else
-    uint64x2_t a = vreinterpretq_u64_m128d(_a);
-    uint64x2_t b = vreinterpretq_u64_m128d(_b);
-    return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
-#endif
-}
-
-// Blend packed single-precision (32-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
-FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
-{
-    // Use a signed shift right to create a mask with the sign bit
-    uint32x4_t mask =
-        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
-}
-
-// Round the packed double-precision (64-bit) floating-point elements in a up
-// to an integer value, and store the results as packed double-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
-FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
-#else
-    double *f = (double *) &a;
-    return _mm_set_pd(ceil(f[1]), ceil(f[0]));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a up to
-// an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
-FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
-#else
-    float *f = (float *) &a;
-    return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
-#endif
-}
-
-// Round the lower double-precision (64-bit) floating-point element in b up to
-// an integer value, store the result as a double-precision floating-point
-// element in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
-FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_ceil_pd(b));
-}
-
-// Round the lower single-precision (32-bit) floating-point element in b up to
-// an integer value, store the result as a single-precision floating-point
-// element in the lower element of dst, and copy the upper 3 packed elements
-// from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
-FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_ceil_ps(b));
-}
-
-// Compare packed 64-bit integers in a and b for equality, and store the results
-// in dst
-FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_u64(
-        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
-#else
-    // ARMv7 lacks vceqq_u64
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
-#endif
-}
-
-// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
-FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
-{
-    return vreinterpretq_m128i_s32(
-        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
-}
-
-// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
-FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
-{
-    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
-    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
-FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
-{
-    return vreinterpretq_m128i_s64(
-        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
-}
-
-// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
-FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
-{
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
-    return vreinterpretq_m128i_s16(s16x8);
-}
-
-// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
-FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
-{
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
-    return vreinterpretq_m128i_s32(s32x4);
-}
-
-// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
-// integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
-FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
-{
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
-    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
-FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
-{
-    return vreinterpretq_m128i_u32(
-        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
-}
-
-// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
-FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
-{
-    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
-    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
-FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
-{
-    return vreinterpretq_m128i_u64(
-        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
-}
-
-// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
-FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
-{
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
-    return vreinterpretq_m128i_u16(u16x8);
-}
-
-// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
-FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
-{
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
-    return vreinterpretq_m128i_u32(u32x4);
-}
-
-// Zero extend packed unsigned 8-bit integers in the low 8 byte sof a to packed
-// 64-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
-FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
-{
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
-    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Conditionally multiply the packed double-precision (64-bit) floating-point
-// elements in a and b using the high 4 bits in imm8, sum the four products, and
-// conditionally store the sum in dst using the low 4 bits of imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
-FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
-{
-    // Generate mask value from constant immediate bit value
-    const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
-    const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
-#if !SSE2NEON_PRECISE_DP
-    const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
-    const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
-#endif
-    // Conditional multiplication
-#if !SSE2NEON_PRECISE_DP
-    __m128d mul = _mm_mul_pd(a, b);
-    const __m128d mulMask =
-        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
-    __m128d tmp = _mm_and_pd(mul, mulMask);
-#else
-#if defined(__aarch64__)
-    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
-                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
-                             : 0;
-    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
-                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
-                             : 0;
-#else
-    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
-    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
-#endif
-    __m128d tmp = _mm_set_pd(d1, d0);
-#endif
-    // Sum the products
-#if defined(__aarch64__)
-    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
-#else
-    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
-#endif
-    // Conditionally store the sum
-    const __m128d sumMask =
-        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
-    __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
-    return res;
-}
-
-// Conditionally multiply the packed single-precision (32-bit) floating-point
-// elements in a and b using the high 4 bits in imm8, sum the four products,
-// and conditionally store the sum in dst using the low 4 bits of imm.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
-FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
-{
-#if defined(__aarch64__)
-    /* shortcuts */
-    if (imm == 0xFF) {
-        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
-    }
-    if (imm == 0x7F) {
-        float32x4_t m = _mm_mul_ps(a, b);
-        m[3] = 0;
-        return _mm_set1_ps(vaddvq_f32(m));
-    }
-#endif
-
-    float s = 0, c = 0;
-    float32x4_t f32a = vreinterpretq_f32_m128(a);
-    float32x4_t f32b = vreinterpretq_f32_m128(b);
-
-    /* To improve the accuracy of floating-point summation, Kahan algorithm
-     * is used for each operation.
-     */
-    if (imm & (1 << 4))
-        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
-    if (imm & (1 << 5))
-        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
-    if (imm & (1 << 6))
-        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
-    if (imm & (1 << 7))
-        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
-    s += c;
-
-    float32x4_t res = {
-        (imm & 0x1) ? s : 0,
-        (imm & 0x2) ? s : 0,
-        (imm & 0x4) ? s : 0,
-        (imm & 0x8) ? s : 0,
-    };
-    return vreinterpretq_m128_f32(res);
-}
-
-// Extract a 32-bit integer from a, selected with imm8, and store the result in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
-// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
-#define _mm_extract_epi32(a, imm) \
-    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
-
-// Extract a 64-bit integer from a, selected with imm8, and store the result in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
-// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
-#define _mm_extract_epi64(a, imm) \
-    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
-
-// Extract an 8-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
-// __constrange(0,16) int imm)
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
-#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
-
-// Extracts the selected single-precision (32-bit) floating-point from a.
-// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
-#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
-
-// Round the packed double-precision (64-bit) floating-point elements in a down
-// to an integer value, and store the results as packed double-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
-FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
-#else
-    double *f = (double *) &a;
-    return _mm_set_pd(floor(f[1]), floor(f[0]));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a down
-// to an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
-FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
-#else
-    float *f = (float *) &a;
-    return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
-#endif
-}
-
-// Round the lower double-precision (64-bit) floating-point element in b down to
-// an integer value, store the result as a double-precision floating-point
-// element in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
-FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_floor_pd(b));
-}
-
-// Round the lower single-precision (32-bit) floating-point element in b down to
-// an integer value, store the result as a single-precision floating-point
-// element in the lower element of dst, and copy the upper 3 packed elements
-// from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
-FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_floor_ps(b));
-}
-
-// Copy a to dst, and insert the 32-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
-// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
-//                                       __constrange(0,4) int imm)
-#define _mm_insert_epi32(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s32(                                     \
-            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
-    })
-
-// Copy a to dst, and insert the 64-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
-// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
-//                                       __constrange(0,2) int imm)
-#define _mm_insert_epi64(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s64(                                     \
-            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
-    })
-
-// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
-// location specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
-// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
-//                                      __constrange(0,16) int imm)
-#define _mm_insert_epi8(a, b, imm)                                 \
-    __extension__({                                                \
-        vreinterpretq_m128i_s8(                                    \
-            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
-    })
-
-// Copy a to tmp, then insert a single-precision (32-bit) floating-point
-// element from b into tmp using the control in imm8. Store tmp to dst using
-// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
-#define _mm_insert_ps(a, b, imm8)                                              \
-    __extension__({                                                            \
-        float32x4_t tmp1 =                                                     \
-            vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3),               \
-                           vreinterpretq_f32_m128(a), 0);                      \
-        float32x4_t tmp2 =                                                     \
-            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
-                           ((imm8 >> 4) & 0x3));                               \
-        const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,        \
-                                  ((imm8) & (1 << 1)) ? UINT32_MAX : 0,        \
-                                  ((imm8) & (1 << 2)) ? UINT32_MAX : 0,        \
-                                  ((imm8) & (1 << 3)) ? UINT32_MAX : 0};       \
-        uint32x4_t mask = vld1q_u32(data);                                     \
-        float32x4_t all_zeros = vdupq_n_f32(0);                                \
-                                                                               \
-        vreinterpretq_m128_f32(                                                \
-            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2)));         \
-    })
-
-// Compare packed signed 32-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
-FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
-FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
-FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
-FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
-}
-
-// Compare packed signed 32-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
-FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
-FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
-FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
-FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
-}
-
-// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
-// in a, store the minimum and index in dst, and zero the remaining bits in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
-FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
-{
-    __m128i dst;
-    uint16_t min, idx = 0;
-#if defined(__aarch64__)
-    // Find the minimum value
-    min = vminvq_u16(vreinterpretq_u16_m128i(a));
-
-    // Get the index of the minimum value
-    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
-    uint16x8_t minv = vdupq_n_u16(min);
-    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
-    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
-#else
-    // Find the minimum value
-    __m64 tmp;
-    tmp = vreinterpret_m64_u16(
-        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
-                 vget_high_u16(vreinterpretq_u16_m128i(a))));
-    tmp = vreinterpret_m64_u16(
-        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
-    tmp = vreinterpret_m64_u16(
-        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
-    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
-    // Get the index of the minimum value
-    int i;
-    for (i = 0; i < 8; i++) {
-        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
-            idx = (uint16_t) i;
-            break;
-        }
-        a = _mm_srli_si128(a, 2);
-    }
-#endif
-    // Generate result
-    dst = _mm_setzero_si128();
-    dst = vreinterpretq_m128i_u16(
-        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
-    dst = vreinterpretq_m128i_u16(
-        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
-    return dst;
-}
-
-// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
-// 8-bit integers in a compared to those in b, and store the 16-bit results in
-// dst. Eight SADs are performed using one quadruplet from b and eight
-// quadruplets from a. One quadruplet is selected from b starting at on the
-// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
-// integers selected from a starting at the offset specified in imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
-FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
-{
-    uint8x16_t _a, _b;
-
-    switch (imm & 0x4) {
-    case 0:
-        // do nothing
-        _a = vreinterpretq_u8_m128i(a);
-        break;
-    case 4:
-        _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
-                                            vreinterpretq_u32_m128i(a), 1));
-        break;
-    default:
-#if defined(__GNUC__) || defined(__clang__)
-        __builtin_unreachable();
-#endif
-        break;
-    }
-
-    switch (imm & 0x3) {
-    case 0:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
-        break;
-    case 1:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
-        break;
-    case 2:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
-        break;
-    case 3:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
-        break;
-    default:
-#if defined(__GNUC__) || defined(__clang__)
-        __builtin_unreachable();
-#endif
-        break;
-    }
-
-    int16x8_t c04, c15, c26, c37;
-    uint8x8_t low_b = vget_low_u8(_b);
-    c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
-    uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
-    c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
-    uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
-    c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
-    uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
-    c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
-#if defined(__aarch64__)
-    // |0|4|2|6|
-    c04 = vpaddq_s16(c04, c26);
-    // |1|5|3|7|
-    c15 = vpaddq_s16(c15, c37);
-
-    int32x4_t trn1_c =
-        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
-    int32x4_t trn2_c =
-        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
-    return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
-                                              vreinterpretq_s16_s32(trn2_c)));
-#else
-    int16x4_t c01, c23, c45, c67;
-    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
-    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
-    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
-    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
-
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
-#endif
-}
-
-// Multiply the low signed 32-bit integers from each packed 64-bit element in
-// a and b, and store the signed 64-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
-FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
-{
-    // vmull_s32 upcasts instead of masking, so we downcast.
-    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
-    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
-}
-
-// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
-// integers, and store the low 32 bits of the intermediate integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
-FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-// using unsigned saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
-FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
-                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
-}
-
-// Round the packed double-precision (64-bit) floating-point elements in a using
-// the rounding parameter, and store the results as packed double-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
-FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
-{
-#if defined(__aarch64__)
-    switch (rounding) {
-    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
-    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
-        return _mm_floor_pd(a);
-    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
-        return _mm_ceil_pd(a);
-    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
-    default:  //_MM_FROUND_CUR_DIRECTION
-        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
-    }
-#else
-    double *v_double = (double *) &a;
-
-    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
-        (rounding == _MM_FROUND_CUR_DIRECTION &&
-         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
-        double res[2], tmp;
-        for (int i = 0; i < 2; i++) {
-            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
-            double roundDown = floor(tmp);  // Round down value
-            double roundUp = ceil(tmp);     // Round up value
-            double diffDown = tmp - roundDown;
-            double diffUp = roundUp - tmp;
-            if (diffDown < diffUp) {
-                /* If it's closer to the round down value, then use it */
-                res[i] = roundDown;
-            } else if (diffDown > diffUp) {
-                /* If it's closer to the round up value, then use it */
-                res[i] = roundUp;
-            } else {
-                /* If it's equidistant between round up and round down value,
-                 * pick the one which is an even number */
-                double half = roundDown / 2;
-                if (half != floor(half)) {
-                    /* If the round down value is odd, return the round up value
-                     */
-                    res[i] = roundUp;
-                } else {
-                    /* If the round up value is odd, return the round down value
-                     */
-                    res[i] = roundDown;
-                }
-            }
-            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
-        }
-        return _mm_set_pd(res[1], res[0]);
-    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
-        return _mm_floor_pd(a);
-    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
-        return _mm_ceil_pd(a);
-    }
-    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
-                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a using
-// the rounding parameter, and store the results as packed single-precision
-// floating-point elements in dst.
-// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
-FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    switch (rounding) {
-    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
-    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
-        return _mm_floor_ps(a);
-    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
-        return _mm_ceil_ps(a);
-    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
-    default:  //_MM_FROUND_CUR_DIRECTION
-        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
-    }
-#else
-    float *v_float = (float *) &a;
-
-    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
-        (rounding == _MM_FROUND_CUR_DIRECTION &&
-         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
-        uint32x4_t signmask = vdupq_n_u32(0x80000000);
-        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
-                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
-        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
-            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
-        int32x4_t r_trunc = vcvtq_s32_f32(
-            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
-        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
-            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
-        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
-                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
-        float32x4_t delta = vsubq_f32(
-            vreinterpretq_f32_m128(a),
-            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
-        uint32x4_t is_delta_half =
-            vceqq_f32(delta, half); /* delta == +/- 0.5 */
-        return vreinterpretq_m128_f32(
-            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
-    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
-        return _mm_floor_ps(a);
-    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
-        return _mm_ceil_ps(a);
-    }
-    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
-                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
-                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
-                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
-#endif
-}
-
-// Round the lower double-precision (64-bit) floating-point element in b using
-// the rounding parameter, store the result as a double-precision floating-point
-// element in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
-FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
-{
-    return _mm_move_sd(a, _mm_round_pd(b, rounding));
-}
-
-// Round the lower single-precision (32-bit) floating-point element in b using
-// the rounding parameter, store the result as a single-precision floating-point
-// element in the lower element of dst, and copy the upper 3 packed elements
-// from a to the upper elements of dst. Rounding is done according to the
-// rounding[3:0] parameter, which can be one of:
-//     (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
-//     suppress exceptions
-//     (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and
-//     suppress exceptions
-//     (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress
-//     exceptions
-//     (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress
-//     exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
-//     _MM_SET_ROUNDING_MODE
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
-FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
-{
-    return _mm_move_ss(a, _mm_round_ps(b, rounding));
-}
-
-// Load 128-bits of integer data from memory into dst using a non-temporal
-// memory hint. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
-FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    return __builtin_nontemporal_load(p);
-#else
-    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
-#endif
-}
-
-// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
-// all 1's, and return 1 if the result is zero, otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
-FORCE_INLINE int _mm_test_all_ones(__m128i a)
-{
-    return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
-           ~(uint64_t) 0;
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and
-// mask, and return 1 if the result is zero, otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
-FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
-{
-    int64x2_t a_and_mask =
-        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
-    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and
-// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
-// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
-// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
-// otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
-FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
-{
-    uint64x2_t zf =
-        vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
-    uint64x2_t cf =
-        vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
-    uint64x2_t result = vandq_u64(zf, cf);
-    return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return the CF value.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
-FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
-{
-    int64x2_t s64 =
-        vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
-// otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
-#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return the ZF value.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
-FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
-{
-    int64x2_t s64 =
-        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
-}
-
-/* SSE4.2 */
-
-const static uint16_t _sse2neon_cmpestr_mask16b[8] ALIGN_STRUCT(16) = {
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-};
-const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = {
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-};
-
-/* specify the source data format */
-#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
-#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
-#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
-#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
-
-/* specify the comparison operation */
-#define _SIDD_CMP_EQUAL_ANY 0x00     /* compare equal any: strchr */
-#define _SIDD_CMP_RANGES 0x04        /* compare ranges */
-#define _SIDD_CMP_EQUAL_EACH 0x08    /* compare equal each: strcmp */
-#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
-
-/* specify the polarity */
-#define _SIDD_POSITIVE_POLARITY 0x00
-#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
-#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
-#define _SIDD_MASKED_NEGATIVE_POLARITY \
-    0x30 /* negate results only before end of string */
-
-/* specify the output selection in _mm_cmpXstri */
-#define _SIDD_LEAST_SIGNIFICANT 0x00
-#define _SIDD_MOST_SIGNIFICANT 0x40
-
-/* specify the output selection in _mm_cmpXstrm */
-#define _SIDD_BIT_MASK 0x00
-#define _SIDD_UNIT_MASK 0x40
-
-/* Pattern Matching for C macros.
- * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
- */
-
-/* catenate */
-#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
-#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
-
-#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
-/* run the 2nd parameter */
-#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
-/* run the 1st parameter */
-#define SSE2NEON_IIF_1(t, ...) t
-
-#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
-#define SSE2NEON_COMPL_0 1
-#define SSE2NEON_COMPL_1 0
-
-#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
-#define SSE2NEON_DEC_1 0
-#define SSE2NEON_DEC_2 1
-#define SSE2NEON_DEC_3 2
-#define SSE2NEON_DEC_4 3
-#define SSE2NEON_DEC_5 4
-#define SSE2NEON_DEC_6 5
-#define SSE2NEON_DEC_7 6
-#define SSE2NEON_DEC_8 7
-#define SSE2NEON_DEC_9 8
-#define SSE2NEON_DEC_10 9
-#define SSE2NEON_DEC_11 10
-#define SSE2NEON_DEC_12 11
-#define SSE2NEON_DEC_13 12
-#define SSE2NEON_DEC_14 13
-#define SSE2NEON_DEC_15 14
-#define SSE2NEON_DEC_16 15
-
-/* detection */
-#define SSE2NEON_CHECK_N(x, n, ...) n
-#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
-#define SSE2NEON_PROBE(x) x, 1,
-
-#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
-#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
-
-#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
-#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
-
-#define SSE2NEON_EAT(...)
-#define SSE2NEON_EXPAND(...) __VA_ARGS__
-#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
-
-/* recursion */
-/* deferred expression */
-#define SSE2NEON_EMPTY()
-#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
-#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
-#define SSE2NEON_EXPAND(...) __VA_ARGS__
-
-#define SSE2NEON_EVAL(...) \
-    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
-#define SSE2NEON_EVAL1(...) \
-    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
-#define SSE2NEON_EVAL2(...) \
-    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
-#define SSE2NEON_EVAL3(...) __VA_ARGS__
-
-#define SSE2NEON_REPEAT(count, macro, ...)                         \
-    SSE2NEON_WHEN(count)                                           \
-    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()(                \
-        SSE2NEON_DEC(count), macro,                                \
-        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
-                                              __VA_ARGS__))
-#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
-
-#define SSE2NEON_SIZE_OF_byte 8
-#define SSE2NEON_NUMBER_OF_LANES_byte 16
-#define SSE2NEON_SIZE_OF_word 16
-#define SSE2NEON_NUMBER_OF_LANES_word 8
-
-#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type)                         \
-    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type(                          \
-        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
-        vreinterpretq_##type##_m128i(a)));
-
-#define SSE2NEON_FILL_LANE(i, type) \
-    vec_b[i] =                      \
-        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
-
-#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size,        \
-                       number_of_lanes, byte_or_word)                         \
-    do {                                                                      \
-        SSE2NEON_CAT(                                                         \
-            data_type_prefix,                                                 \
-            SSE2NEON_CAT(size,                                                \
-                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
-        vec_b[number_of_lanes];                                               \
-        __m128i mask = SSE2NEON_IIF(byte_or_word)(                            \
-            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)),                       \
-            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff)));                    \
-        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE,    \
-                                      SSE2NEON_CAT(type_prefix, size)))       \
-        for (int i = 0; i < number_of_lanes; i++) {                           \
-            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u,                      \
-                                  size)(SSE2NEON_CAT(vbslq_u, size)(          \
-                SSE2NEON_CAT(vreinterpretq_u,                                 \
-                             SSE2NEON_CAT(size, _m128i))(mask),               \
-                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))(        \
-                    vec_b[i],                                                 \
-                    SSE2NEON_CAT(                                             \
-                        vreinterpretq_,                                       \
-                        SSE2NEON_CAT(type_prefix,                             \
-                                     SSE2NEON_CAT(size, _m128i(a))))),        \
-                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))(        \
-                    vec_b[i],                                                 \
-                    SSE2NEON_CAT(                                             \
-                        vreinterpretq_,                                       \
-                        SSE2NEON_CAT(type_prefix,                             \
-                                     SSE2NEON_CAT(size, _m128i(a)))))));      \
-        }                                                                     \
-    } while (0)
-
-#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes)                         \
-    do {                                                                     \
-        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes,                       \
-                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
-                                      SSE2NEON_CAT(u, size)))                \
-    } while (0)
-
-#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
-    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
-                                                int lb)                       \
-    {                                                                         \
-        __m128i mtx[16];                                                      \
-        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
-                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
-        return SSE2NEON_CAT(                                                  \
-            _sse2neon_aggregate_equal_any_,                                   \
-            SSE2NEON_CAT(                                                     \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
-                                             type))))(la, lb, mtx);           \
-    }
-
-#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
-    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
-                                                 int lb)                       \
-    {                                                                          \
-        __m128i mtx[16];                                                       \
-        PCMPSTR_RANGES(                                                        \
-            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
-            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
-        return SSE2NEON_CAT(                                                   \
-            _sse2neon_aggregate_ranges_,                                       \
-            SSE2NEON_CAT(                                                      \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
-                                             type))))(la, lb, mtx);            \
-    }
-
-#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                  \
-    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,         \
-                                                    __m128i b, int lb)         \
-    {                                                                          \
-        __m128i mtx[16];                                                       \
-        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),           \
-                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));             \
-        return SSE2NEON_CAT(                                                   \
-            _sse2neon_aggregate_equal_ordered_,                                \
-            SSE2NEON_CAT(                                                      \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
-                SSE2NEON_CAT(x,                                                \
-                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
-            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);       \
-    }
-
-static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
-    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u8(
-            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u8(
-            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
-        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
-        res |= (tmp << j);
-    }
-    return res;
-}
-
-static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint16x8_t vec =
-        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u16(
-            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u16(
-            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
-        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
-        res |= (tmp << j);
-    }
-    return res;
-}
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
-    prefix##IMPL(byte) \
-    prefix##IMPL(word)
-/* clang-format on */
-
-SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
-
-static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint16x8_t vec =
-        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u16(
-            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u16(
-            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
-        __m128i tmp = vreinterpretq_m128i_u32(
-            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
-        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
-                                       vreinterpretq_u32_m128i(tmp));
-#if defined(__aarch64__)
-        int t = vaddvq_u32(vec_res) ? 1 : 0;
-#else
-        uint64x2_t sumh = vpaddlq_u32(vec_res);
-        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
-#endif
-        res |= (t << j);
-    }
-    return res;
-}
-
-static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
-    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u8(
-            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u8(
-            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
-        __m128i tmp = vreinterpretq_m128i_u16(
-            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
-        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
-                                       vreinterpretq_u16_m128i(tmp));
-        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
-        res |= (t << j);
-    }
-    return res;
-}
-
-#define SSE2NEON_CMP_RANGES_IS_BYTE 1
-#define SSE2NEON_CMP_RANGES_IS_WORD 0
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_CMP_RANGES(prefix)             \
-    prefix##IMPL(byte, uint, u, prefix##IS_BYTE)         \
-    prefix##IMPL(byte, int, s, prefix##IS_BYTE)          \
-    prefix##IMPL(word, uint, u, prefix##IS_WORD)         \
-    prefix##IMPL(word, int, s, prefix##IS_WORD)
-/* clang-format on */
-
-SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
-
-#undef SSE2NEON_CMP_RANGES_IS_BYTE
-#undef SSE2NEON_CMP_RANGES_IS_WORD
-
-static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
-{
-    uint8x16_t mtx =
-        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x10000 - (1 << la);
-    int tb = 0x10000 - (1 << lb);
-    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
-    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
-    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
-    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
-    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
-    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
-    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
-    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
-
-    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
-    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
-    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
-    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
-    res_lo = vand_u8(res_lo, vec_mask);
-    res_hi = vand_u8(res_hi, vec_mask);
-
-    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
-    return res;
-}
-
-static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
-{
-    uint16x8_t mtx =
-        vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x100 - (1 << la);
-    int tb = 0x100 - (1 << lb);
-    uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
-    uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
-    uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
-    uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
-    mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
-    mtx = vbslq_u16(vec1, tmp, mtx);
-    mtx = vandq_u16(mtx, vec_mask);
-    return _sse2neon_vaddvq_u16(mtx);
-}
-
-#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
-#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
-
-#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
-    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
-        int bound, int la, int lb, __m128i mtx[16])                            \
-    {                                                                          \
-        int res = 0;                                                           \
-        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
-        uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
-            vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
-            vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
-        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
-            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
-                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
-            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
-        uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
-        uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
-        for (int j = 0; j < lb; j++) {                                         \
-            mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size(                \
-                vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j])));   \
-        }                                                                      \
-        for (int j = lb; j < bound; j++) {                                     \
-            mtx[j] = vreinterpretq_m128i_u##size(                              \
-                vbslq_u##size(vec1, vec_minusone, vec_zero));                  \
-        }                                                                      \
-        unsigned SSE2NEON_IIF(data_type)(char, short) *ptr =                   \
-            (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx;             \
-        for (int i = 0; i < bound; i++) {                                      \
-            int val = 1;                                                       \
-            for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
-                val &= ptr[k * bound + j];                                     \
-            res += val << i;                                                   \
-        }                                                                      \
-        return res;                                                            \
-    }
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
-    prefix##IMPL(8, 16, prefix##IS_UBYTE)               \
-    prefix##IMPL(16, 8, prefix##IS_UWORD)
-/* clang-format on */
-
-SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
-
-#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
-#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
-    prefix##IMPL(byte)                              \
-    prefix##IMPL(word)
-/* clang-format on */
-
-SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
-
-#define SSE2NEON_CMPESTR_LIST                          \
-    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
-    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any)         \
-    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
-    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any)         \
-    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges)              \
-    _(CMP_UWORD_RANGES, cmp_uword_ranges)              \
-    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges)              \
-    _(CMP_SWORD_RANGES, cmp_sword_ranges)              \
-    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
-    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each)       \
-    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
-    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each)       \
-    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
-    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
-    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
-    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
-
-enum {
-#define _(name, func_suffix) name,
-    SSE2NEON_CMPESTR_LIST
-#undef _
-};
-typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
-static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
-#define _(name, func_suffix) _sse2neon_##func_suffix,
-    SSE2NEON_CMPESTR_LIST
-#undef _
-};
-
-FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
-{
-    switch (imm8 & 0x30) {
-    case _SIDD_NEGATIVE_POLARITY:
-        res ^= 0xffffffff;
-        break;
-    case _SIDD_MASKED_NEGATIVE_POLARITY:
-        res ^= (1 << lb) - 1;
-        break;
-    default:
-        break;
-    }
-
-    return res & ((bound == 8) ? 0xFF : 0xFFFF);
-}
-
-FORCE_INLINE int _sse2neon_clz(unsigned int x)
-{
-#if _MSC_VER
-    DWORD cnt = 0;
-    if (_BitScanForward(&cnt, x))
-        return cnt;
-    return 32;
-#else
-    return x != 0 ? __builtin_clz(x) : 32;
-#endif
-}
-
-FORCE_INLINE int _sse2neon_ctz(unsigned int x)
-{
-#if _MSC_VER
-    DWORD cnt = 0;
-    if (_BitScanReverse(&cnt, x))
-        return 31 - cnt;
-    return 32;
-#else
-    return x != 0 ? __builtin_ctz(x) : 32;
-#endif
-}
-
-FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
-{
-#if _MSC_VER
-    unsigned long cnt;
-#ifdef defined(SSE2NEON_HAS_BITSCAN64)
-    (defined(_M_AMD64) || defined(__x86_64__))
-        if((_BitScanForward64(&cnt, x))
-            return (int)(cnt);
-#else
-    if (_BitScanForward(&cnt, (unsigned long) (x)))
-        return (int) cnt;
-    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
-        return (int) (cnt + 32);
-#endif
-    return 64;
-#else
-    return x != 0 ? __builtin_ctzll(x) : 64;
-#endif
-}
-
-#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
-
-#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
-    const int var = (imm & 0x01) ? 8 : 16
-
-#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
-    int tmp1 = la ^ (la >> 31);                  \
-    la = tmp1 - (la >> 31);                      \
-    int tmp2 = lb ^ (lb >> 31);                  \
-    lb = tmp2 - (lb >> 31);                      \
-    la = SSE2NEON_MIN(la, bound);                \
-    lb = SSE2NEON_MIN(lb, bound)
-
-// Compare all pairs of character in string a and b,
-// then aggregate the result.
-// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
-// length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
-// string a and b.
-#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
-    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
-    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
-    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
-
-#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
-    return (r2 == 0) ? bound                                     \
-                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
-                                      : _sse2neon_ctz(r2))
-
-#define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
-    __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
-    if (imm8 & 0x40) {                                                         \
-        if (bound == 8) {                                                      \
-            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
-                                       vld1q_u16(_sse2neon_cmpestr_mask16b));  \
-            dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
-                tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
-        } else {                                                               \
-            uint8x16_t vec_r2 =                                                \
-                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
-            uint8x16_t tmp =                                                   \
-                vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
-            dst = vreinterpretq_m128i_u8(                                      \
-                vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst)));   \
-        }                                                                      \
-    } else {                                                                   \
-        if (bound == 16) {                                                     \
-            dst = vreinterpretq_m128i_u16(                                     \
-                vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
-        } else {                                                               \
-            dst = vreinterpretq_m128i_u8(                                      \
-                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
-        }                                                                      \
-    }                                                                          \
-    return dst
-
-// Compare packed strings in a and b with lengths la and lb using the control
-// in imm8, and returns 1 if b did not contain a null character and the
-// resulting mask was zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
-FORCE_INLINE int _mm_cmpestra(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    int lb_cpy = lb;
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    return !r2 & (lb_cpy > bound);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
-FORCE_INLINE int _mm_cmpestrc(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    return r2 != 0;
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control
-// in imm8, and store the generated index in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
-FORCE_INLINE int _mm_cmpestri(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control
-// in imm8, and store the generated mask in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
-FORCE_INLINE __m128i
-_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns bit 0 of the resulting bit mask.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
-FORCE_INLINE int _mm_cmpestro(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    return r2 & 1;
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns 1 if any character in a was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
-FORCE_INLINE int _mm_cmpestrs(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    return la <= (bound - 1);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns 1 if any character in b was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
-FORCE_INLINE int _mm_cmpestrz(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    return lb <= (bound - 1);
-}
-
-#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
-    do {                                                                 \
-        if (imm8 & 0x01) {                                               \
-            uint16x8_t equal_mask_##str =                                \
-                vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
-            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
-            uint64_t matches_##str =                                     \
-                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
-            len = _sse2neon_ctzll(matches_##str) >> 3;                   \
-        } else {                                                         \
-            uint16x8_t equal_mask_##str = vreinterpretq_u16_u8(          \
-                vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0)));   \
-            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
-            uint64_t matches_##str =                                     \
-                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
-            len = _sse2neon_ctzll(matches_##str) >> 2;                   \
-        }                                                                \
-    } while (0)
-
-#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
-    int la, lb;                                  \
-    do {                                         \
-        SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);   \
-        SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);   \
-    } while (0)
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if b did not contain a null character and the resulting
-// mask was zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
-FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    return !r2 & (lb >= bound);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
-FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    return r2 != 0;
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and store the generated index in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
-FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and store the generated mask in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
-FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns bit 0 of the resulting bit mask.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
-FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    return r2 & 1;
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if any character in a was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
-FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    int la;
-    SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
-    return la <= (bound - 1);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if any character in b was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
-FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    int lb;
-    SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
-    return lb <= (bound - 1);
-}
-
-// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
-// in b for greater than.
-FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_u64(
-        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
-    return vreinterpretq_m128i_s64(vshrq_n_s64(
-        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
-        63));
-#endif
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 16-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
-FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
-    crc = __crc32ch(crc, v);
-#else
-    crc = _mm_crc32_u8(crc, v & 0xff);
-    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
-#endif
-    return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 32-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
-FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
-    crc = __crc32cw(crc, v);
-#else
-    crc = _mm_crc32_u16(crc, v & 0xffff);
-    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
-#endif
-    return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 64-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
-FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#else
-    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
-    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
-#endif
-    return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 8-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
-FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
-    crc = __crc32cb(crc, v);
-#else
-    crc ^= v;
-    for (int bit = 0; bit < 8; bit++) {
-        if (crc & 1)
-            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
-        else
-            crc = (crc >> 1);
-    }
-#endif
-    return crc;
-}
-
-/* AES */
-
-#if !defined(__ARM_FEATURE_CRYPTO)
-/* clang-format off */
-#define SSE2NEON_AES_SBOX(w)                                           \
-    {                                                                  \
-        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
-        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
-        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
-        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
-        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
-        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
-        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
-        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
-        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
-        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
-        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
-        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
-        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
-        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
-        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
-        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
-        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
-        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
-        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
-        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
-        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
-        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
-        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
-        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
-        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
-        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
-        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
-        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
-        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
-        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
-        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
-        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
-        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
-        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
-        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
-        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
-        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
-    }
-#define SSE2NEON_AES_RSBOX(w)                                          \
-    {                                                                  \
-        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
-        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
-        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
-        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
-        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
-        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
-        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
-        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
-        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
-        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
-        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
-        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
-        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
-        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
-        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
-        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
-        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
-        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
-        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
-        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
-        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
-        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
-        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
-        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
-        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
-        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
-        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
-        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
-        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
-        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
-        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
-        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
-        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
-        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
-        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
-        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
-        w(0x55), w(0x21), w(0x0c), w(0x7d)                             \
-    }
-/* clang-format on */
-
-/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
-#define SSE2NEON_AES_H0(x) (x)
-static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
-static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
-#undef SSE2NEON_AES_H0
-
-/* x_time function and matrix multiply function */
-#if !defined(__aarch64__)
-#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
-#define SSE2NEON_MULTIPLY(x, y)                                  \
-    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
-     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
-     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
-     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
-#endif
-
-// In the absence of crypto extensions, implement aesenc using regular NEON
-// intrinsics instead. See:
-// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
-// for more information.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t shift_rows[] = {
-        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
-        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
-    };
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    /* shift rows */
-    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
-    /* sub bytes */
-    // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
-    // look up each of the table. After each lookup, we load the next table
-    // which locates at the next 64-bytes. In the meantime, the index in the
-    // table would be smaller than it was, so the index parameters of
-    // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-    // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
-
-    /* mix columns */
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-
-    /* add round key */
-    return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
-#else /* ARMv7-A implementation for a table-based AES */
-#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
-    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
-     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
-// muliplying 'x' by 2 in GF(2^8)
-#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
-// muliplying 'x' by 3 in GF(2^8)
-#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
-#define SSE2NEON_AES_U0(p) \
-    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
-#define SSE2NEON_AES_U1(p) \
-    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
-#define SSE2NEON_AES_U2(p) \
-    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
-#define SSE2NEON_AES_U3(p) \
-    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
-
-    // this generates a table containing every possible permutation of
-    // shift_rows() and sub_bytes() with mix_columns().
-    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
-    };
-#undef SSE2NEON_AES_B2W
-#undef SSE2NEON_AES_F2
-#undef SSE2NEON_AES_F3
-#undef SSE2NEON_AES_U0
-#undef SSE2NEON_AES_U1
-#undef SSE2NEON_AES_U2
-#undef SSE2NEON_AES_U3
-
-    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
-    uint32_t x1 =
-        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
-    uint32_t x2 =
-        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
-    uint32_t x3 =
-        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]
-
-    // finish the modulo addition step in mix_columns()
-    __m128i out = _mm_set_epi32(
-        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
-         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
-        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
-         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
-        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
-         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
-        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
-         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
-
-    return _mm_xor_si128(out, RoundKey);
-#endif
-}
-
-// Perform one round of an AES decryption flow on data (state) in a using the
-// round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
-FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t inv_shift_rows[] = {
-        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-    };
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    // inverse shift rows
-    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
-
-    // inverse sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-
-    // inverse mix columns
-    // muliplying 'v' by 4 in GF(2^8)
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
-    v ^= w;
-    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
-                                 0x1b);  // muliplying 'v' by 2 in GF(2^8)
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-
-    // add round key
-    return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
-#else /* ARMv7-A NEON implementation */
-    /* FIXME: optimized for NEON */
-    uint8_t i, e, f, g, h, v[4][4];
-    uint8_t *_a = (uint8_t *) &a;
-    for (i = 0; i < 16; ++i) {
-        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
-    }
-
-    // inverse mix columns
-    for (i = 0; i < 4; ++i) {
-        e = v[i][0];
-        f = v[i][1];
-        g = v[i][2];
-        h = v[i][3];
-
-        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
-    }
-
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
-#endif
-}
-
-// Perform the last round of an AES encryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
-FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t shift_rows[] = {
-        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
-        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    // shift rows
-    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
-    // sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
-
-    // add round key
-    return vreinterpretq_m128i_u8(v) ^ RoundKey;
-
-#else /* ARMv7-A implementation */
-    uint8_t v[16] = {
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
-    };
-
-    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
-#endif
-}
-
-// Perform the last round of an AES decryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t inv_shift_rows[] = {
-        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    // inverse shift rows
-    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
-
-    // inverse sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-
-    // add round key
-    return vreinterpretq_m128i_u8(v) ^ RoundKey;
-
-#else /* ARMv7-A NEON implementation */
-    /* FIXME: optimized for NEON */
-    uint8_t v[4][4];
-    uint8_t *_a = (uint8_t *) &a;
-    for (int i = 0; i < 16; ++i) {
-        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
-    }
-
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
-#endif
-}
-
-// Perform the InvMixColumns transformation on a and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-{
-#if defined(__aarch64__)
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-    uint8x16_t v = vreinterpretq_u8_m128i(a);
-    uint8x16_t w;
-
-    // multiplying 'v' by 4 in GF(2^8)
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
-    v ^= w;
-    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-
-    // multiplying 'v' by 2 in GF(2^8)
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-    return vreinterpretq_m128i_u8(w);
-
-#else /* ARMv7-A NEON implementation */
-    uint8_t i, e, f, g, h, v[4][4];
-    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
-    for (i = 0; i < 4; ++i) {
-        e = v[i][0];
-        f = v[i][1];
-        g = v[i][2];
-        h = v[i][3];
-
-        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
-    }
-
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
-#endif
-}
-
-// Assist in expanding the AES cipher key by computing steps towards generating
-// a round key for encryption cipher using data from a and an 8-bit round
-// constant specified in imm8, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
-//
-// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
-// This instruction generates a round key for AES encryption. See
-// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
-// for details.
-FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
-{
-#if defined(__aarch64__)
-    uint8x16_t _a = vreinterpretq_u8_m128i(a);
-    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
-
-    uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
-    uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
-    uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
-
-    return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
-
-#else /* ARMv7-A NEON implementation */
-    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
-    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
-    for (int i = 0; i < 4; ++i) {
-        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
-        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
-    }
-    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
-                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
-#endif
-}
-#undef SSE2NEON_AES_SBOX
-#undef SSE2NEON_AES_RSBOX
-
-#if defined(__aarch64__)
-#undef SSE2NEON_XT
-#undef SSE2NEON_MULTIPLY
-#endif
-
-#else /* __ARM_FEATURE_CRYPTO */
-// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
-// AESMC and then manually applying the real key as an xor operation. This
-// unfortunately means an additional xor op; the compiler should be able to
-// optimize this away for repeated calls however. See
-// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
-// for more details.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
-        vreinterpretq_u8_m128i(b));
-}
-
-// Perform one round of an AES decryption flow on data (state) in a using the
-// round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
-FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
-{
-    return vreinterpretq_m128i_u8(veorq_u8(
-        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
-        vreinterpretq_u8_m128i(RoundKey)));
-}
-
-// Perform the last round of an AES encryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
-FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-{
-    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
-                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
-                         RoundKey);
-}
-
-// Perform the last round of an AES decryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-{
-    return vreinterpretq_m128i_u8(
-        vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^
-        vreinterpretq_u8_m128i(RoundKey));
-}
-
-// Perform the InvMixColumns transformation on a and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-{
-    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
-}
-
-// Assist in expanding the AES cipher key by computing steps towards generating
-// a round key for encryption cipher using data from a and an 8-bit round
-// constant specified in imm8, and store the result in dst."
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
-FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
-{
-    // AESE does ShiftRows and SubBytes on A
-    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
-
-    uint8x16_t dest = {
-        // Undo ShiftRows step from AESE and extract X1 and X3
-        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
-        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
-        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
-        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
-    };
-    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
-    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
-}
-#endif
-
-/* Others */
-
-// Perform a carry-less multiplication of two 64-bit integers, selected from a
-// and b according to imm8, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
-FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
-{
-    uint64x2_t a = vreinterpretq_u64_m128i(_a);
-    uint64x2_t b = vreinterpretq_u64_m128i(_b);
-    switch (imm & 0x11) {
-    case 0x00:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
-    case 0x01:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
-    case 0x10:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
-    case 0x11:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
-    default:
-        abort();
-    }
-}
-
-FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
-}
-
-// Count the number of bits set to 1 in unsigned 32-bit integer a, and
-// return that count in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
-FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcount)
-    return __builtin_popcount(a);
-#else
-    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
-#endif
-#else
-    uint32_t count = 0;
-    uint8x8_t input_val, count8x8_val;
-    uint16x4_t count16x4_val;
-    uint32x2_t count32x2_val;
-
-    input_val = vld1_u8((uint8_t *) &a);
-    count8x8_val = vcnt_u8(input_val);
-    count16x4_val = vpaddl_u8(count8x8_val);
-    count32x2_val = vpaddl_u16(count16x4_val);
-
-    vst1_u32(&count, count32x2_val);
-    return count;
-#endif
-}
-
-// Count the number of bits set to 1 in unsigned 64-bit integer a, and
-// return that count in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
-FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcountll)
-    return __builtin_popcountll(a);
-#else
-    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
-#endif
-#else
-    uint64_t count = 0;
-    uint8x8_t input_val, count8x8_val;
-    uint16x4_t count16x4_val;
-    uint32x2_t count32x2_val;
-    uint64x1_t count64x1_val;
-
-    input_val = vld1_u8((uint8_t *) &a);
-    count8x8_val = vcnt_u8(input_val);
-    count16x4_val = vpaddl_u8(count8x8_val);
-    count32x2_val = vpaddl_u16(count16x4_val);
-    count64x1_val = vpaddl_u32(count32x2_val);
-    vst1_u64(&count, count64x1_val);
-    return count;
-#endif
-}
-
-FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
-{
-    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
-    // regardless of the value of the FZ bit.
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
-}
-
-// Return the current 64-bit value of the processor's time-stamp counter.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
-FORCE_INLINE uint64_t _rdtsc(void)
-{
-#if defined(__aarch64__)
-    uint64_t val;
-
-    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
-     * system counter is at least 56 bits wide; from Armv8.6, the counter
-     * must be 64 bits wide.  So the system counter could be less than 64
-     * bits wide and it is attributed with the flag 'cap_user_time_short'
-     * is true.
-     */
-    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
-
-    return val;
-#else
-    uint32_t pmccntr, pmuseren, pmcntenset;
-    // Read the user mode Performance Monitoring Unit (PMU)
-    // User Enable Register (PMUSERENR) access permissions.
-    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
-    if (pmuseren & 1) {  // Allows reading PMUSERENR for user mode code.
-        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
-        if (pmcntenset & 0x80000000UL) {  // Is it counting?
-            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
-            // The counter is set up to count every 64th cycle
-            return (uint64_t) (pmccntr) << 6;
-        }
-    }
-
-    // Fallback to syscall as we can't enable PMUSERENR in user mode.
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
-    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
-#endif
-}
-
-#if defined(__GNUC__) || defined(__clang__)
-#pragma pop_macro("ALIGN_STRUCT")
-#pragma pop_macro("FORCE_INLINE")
-#endif
-
-#if defined(__GNUC__) && !defined(__clang__)
-#pragma GCC pop_options
-#endif
-
-#endif
\ No newline at end of file
diff --git a/share/cmake/modules/Findsse2neon.cmake b/share/cmake/modules/Findsse2neon.cmake
new file mode 100644
index 0000000000..7dea6740cb
--- /dev/null
+++ b/share/cmake/modules/Findsse2neon.cmake
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Locate sse2neon (header-only version)
+#
+# Variables defined by this module:
+#   sse2neon_FOUND          - Indicate whether the library was found or not
+#   sse2neon_INCLUDE_DIR    - Location of the header files
+#
+# Global targets defined by this module:
+#   sse2neon
+###############################################################################
+### Try to find package ###
+
+if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
+    # Find include directory
+    find_path(sse2neon_INCLUDE_DIR
+        NAMES
+            sse2neon.h
+        HINTS
+            ${sse2neon_ROOT}
+        PATH_SUFFIXES
+            include
+            sse2neon/include
+    )
+
+    # Override REQUIRED if package can be installed
+    if(OCIO_INSTALL_EXT_PACKAGES STREQUAL MISSING)
+        set(sse2neon_FIND_REQUIRED FALSE)
+    endif()
+
+    include(FindPackageHandleStandardArgs)
+    find_package_handle_standard_args(sse2neon
+        REQUIRED_VARS 
+            sse2neon_INCLUDE_DIR 
+    )
+    set(sse2neon_FOUND ${sse2neon_FOUND})
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(sse2neon_FOUND AND NOT TARGET sse2neon)
+    # INTERFACE type since we know that this is a header-only library.
+    add_library(sse2neon INTERFACE IMPORTED GLOBAL)
+    set(_sse2neon_TARGET_CREATE TRUE)
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(_sse2neon_TARGET_CREATE)
+    set_target_properties(sse2neon PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES ${sse2neon_INCLUDE_DIR}
+    )
+
+    mark_as_advanced(sse2neon_INCLUDE_DIR)
+endif()
\ No newline at end of file
diff --git a/share/cmake/modules/install/Installsse2neon.cmake b/share/cmake/modules/install/Installsse2neon.cmake
new file mode 100644
index 0000000000..16b47798cd
--- /dev/null
+++ b/share/cmake/modules/install/Installsse2neon.cmake
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Install sse2neon (header-only version)
+# https://github.com/DLTcollab/sse2neon
+#
+#
+# Global targets defined by this module:
+#   sse2neon
+###############################################################################
+
+# Download sse2neon using FetchContent and make it available at configure time.
+
+include(FetchContent)
+
+set(FETCHCONTENT_BASE_DIR "${CMAKE_BINARY_DIR}/ext/build/sse2neon")
+FetchContent_Declare(sse2neon
+  GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git
+  GIT_TAG        v1.6.0
+)
+
+# FetchContent_MakeAvailable is not available until CMake 3.14+.
+# Using FetchContent_GetProperties and FetchContent_Populate instead.
+FetchContent_GetProperties(sse2neon)
+
+if(NOT sse2neon_POPULATED)
+  FetchContent_Populate(sse2neon)
+
+  set(_EXT_DIST_INCLUDE "${CMAKE_BINARY_DIR}/ext/dist/${CMAKE_INSTALL_INCLUDEDIR}")
+  file(COPY "${sse2neon_SOURCE_DIR}/sse2neon.h" DESTINATION "${_EXT_DIST_INCLUDE}/sse2neon")
+
+  add_library(sse2neon INTERFACE IMPORTED GLOBAL)
+  set_target_properties(sse2neon PROPERTIES
+      INTERFACE_INCLUDE_DIRECTORIES "${_EXT_DIST_INCLUDE}/sse2neon"
+  )
+endif()
\ No newline at end of file
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index bcfd356d53..a68d5c3676 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -295,7 +295,7 @@ target_link_libraries(OpenColorIO
 if(OCIO_USE_SSE AND HAVE_NEON)
     target_link_libraries(OpenColorIO
         PRIVATE
-            "$<BUILD_INTERFACE:sse2neon>"
+            sse2neon
     )
 endif()
 

From 1adf419f68fb7b4512f3537ffe1a762c4fcfae20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 10 Feb 2023 14:45:24 -0500
Subject: [PATCH 57/81] Fix comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index abe881c627..3951a3f450 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -192,7 +192,7 @@ include(CheckSupportARMNeon)
 
 
 ###############################################################################
-# Add sse2neon to the build since CompilerFlags needs to know if SSE2 is supported.
+# Add sse2neon to the build if ARM NEON intrinsics are supported.
 
 if(HAVE_NEON)
     # Install sse2nenon. Please note that sse2neon is downloaded during the configure step as it is

From b5f760e4a7a498c6d2b306e6fc7af9fcfeaf41d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Tue, 14 Feb 2023 10:47:42 -0500
Subject: [PATCH 58/81] Re-wording comments Now check for OCIO_USE_SSE and
 HAVE_NEON before integrating sse2neon to the build.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt        |  2 +-
 src/OpenColorIO/SSE.h | 25 +++++++++++--------------
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3951a3f450..81ad855fc9 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -194,7 +194,7 @@ include(CheckSupportARMNeon)
 ###############################################################################
 # Add sse2neon to the build if ARM NEON intrinsics are supported.
 
-if(HAVE_NEON)
+if(HAVE_NEON AND OCIO_USE_SSE)
     # Install sse2nenon. Please note that sse2neon is downloaded during the configure step as it is
     # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
 
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 567c5a1c64..f63e08cdef 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -24,26 +24,23 @@ namespace OCIO_NAMESPACE
 
 // Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since
 // it is redefining two of the functions from sse2neon.
+
 #ifdef USE_SSE2NEON
-    // Overwrite the translation of _mm_max_ps and _mm_min_ps.
-    // Using vmaxnmq_f32 and vminnmq_f32 instead.
-
-    // Compare packed single-precision (32-bit) floating-point elements in a and b,
-    // and store packed maximum values in dst. dst does not follow the IEEE Standard
-    // for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
-    // signed-zero values.
-    // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
+    // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to 
+    // NaN handling.
+
+    // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were 
+    // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the 
+    // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the 
+    // filtering behavior. The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as 
+    // the Intel _mm_max_ps / _mm_min_ps since they always returns the non-NaN argument 
+    // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in 
+    // the first argument continues to be filtered out.
     static inline __m128 _mm_max_ps(__m128 a, __m128 b)
     {
         return vreinterpretq_m128_f32(
             vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
     }
-
-    // Compare packed single-precision (32-bit) floating-point elements in a and b,
-    // and store packed minimum values in dst. dst does not follow the IEEE Standard
-    // for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
-    // signed-zero values.
-    // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
     static inline __m128 _mm_min_ps(__m128 a, __m128 b)
     {
         return vreinterpretq_m128_f32(

From cc24983f8b553d563f47ddddd26ccbb1ba6d9fc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Wed, 15 Feb 2023 13:44:25 -0500
Subject: [PATCH 59/81] Removing specifics compiler options for NEON since they
 are not needed on Apple and might interfere with optimization if not careful.
 Updated CheckSupportSSE2.cmake and adding some checks in SSE.h to support
 universal build. Added comments and new define (USING_INTEL_SSE,
 USING_ARM_NEON and USING_CPP) in LogOpCPU_tests which should help with
 understand what is going on. Checking for SSE2 and ARM Neon only when
 OCIO_USE_SSE is ON.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                              |   6 +-
 share/cmake/utils/CheckSupportARMNeon.cmake |  30 ++--
 share/cmake/utils/CheckSupportSSE2.cmake    |  73 +++++++---
 share/cmake/utils/CompilerFlags.cmake       |  10 +-
 src/OpenColorIO/CMakeLists.txt              |  16 ++-
 src/OpenColorIO/SSE.h                       |  61 ++++----
 tests/cpu/CMakeLists.txt                    |  16 ++-
 tests/cpu/ops/log/LogOpCPU_tests.cpp        | 151 +++++++++++++-------
 8 files changed, 238 insertions(+), 125 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 81ad855fc9..82ff26bcd1 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -186,9 +186,11 @@ include(CheckSupportGL)
 
 
 ###############################################################################
-# Check for ARM neon intrinsics (armv8)
+# Check for ARM neon (if the user asked for it)
 
-include(CheckSupportARMNeon)
+if(OCIO_USE_SSE)
+    include(CheckSupportARMNeon)
+endif()
 
 
 ###############################################################################
diff --git a/share/cmake/utils/CheckSupportARMNeon.cmake b/share/cmake/utils/CheckSupportARMNeon.cmake
index 1123e75f52..acab5e5946 100644
--- a/share/cmake/utils/CheckSupportARMNeon.cmake
+++ b/share/cmake/utils/CheckSupportARMNeon.cmake
@@ -1,21 +1,27 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright Contributors to the OpenColorIO Project.
 
+# Checks for ARM NEON availability
+
 include(CheckCXXSourceCompiles)
 
-set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
-set(CMAKE_REQUIRED_FLAGS "-march=armv8-a+fp+simd+crypto+crc")
+set(_cmake_osx_architectures_old "${CMAKE_OSX_ARCHITECTURES}")
+
+if(APPLE)
+    set(CMAKE_OSX_ARCHITECTURES "arm64")
+endif()
+
+set(source_code "
+#include <arm_neon.h>
+int main()
+{
+    float32x4_t v = vdupq_n_f32(0);
+    return 0;
+}")
 
-check_cxx_source_compiles ("
-    #include <arm_neon.h>
-    int main()
-    {
-        float32x4_t v = vdupq_n_f32(0);
-        return 0;
-    }"
-    HAVE_NEON)
+check_cxx_source_compiles ("${source_code}" HAVE_NEON)
 
-set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
-unset(_cmake_required_flags_old)
+set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_old}")
 
+unset(_cmake_osx_architectures_old)
 mark_as_advanced(HAVE_NEON)
diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index 5abd707cdb..311dbdbc87 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -6,6 +6,8 @@ include(CheckCXXSourceCompiles)
 set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
 set(_cmake_required_libraries_old "${CMAKE_REQUIRED_LIBRARIES}")
 
+set(_cmake_osx_architectures_old "${CMAKE_OSX_ARCHITECTURES}")
+
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger
     # unrelated warnings causing a detection failure. So, the code disables all warnings to focus
@@ -17,30 +19,63 @@ if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     endif()
 endif()
 
-set(_SSE2_HEADER "#include <emmintrin.h>")
-if (HAVE_NEON)
-    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -march=armv8-a+fp+simd+crypto+crc")
-    set(CMAKE_REQUIRED_LIBRARIES sse2neon)
-    set(_SSE2_HEADER "#include <sse2neon.h>")
-endif()
 
-set(_SSE2_TEST_SOURCE_CODE "
-${_SSE2_HEADER}
-int main ()
-{
-    __m128d a, b;
-    double vals[2] = {0};
-    a = _mm_loadu_pd (vals);
-    b = _mm_add_pd (a,a);
-    _mm_storeu_pd (vals,b);
-    return (0);
-}")
+macro(check_arm_neon_availability _check_sse2_header_ _check_output_var_name_)
+    set(_SSE2_TEST_SOURCE_CODE "
+    ${_check_sse2_header_}
+    int main ()
+    {
+        __m128d a, b;
+        double vals[2] = {0};
+        a = _mm_loadu_pd (vals);
+        b = _mm_add_pd (a,a);
+        _mm_storeu_pd (vals,b);
+        return (0);
+    }")
+
+    check_cxx_source_compiles ("${_SSE2_TEST_SOURCE_CODE}" ${_check_output_var_name_})
+    mark_as_advanced(${_check_output_var_name_})
+endmacro()
+
+if(NOT HAVE_NEON)
+    check_arm_neon_availability("#include <emmintrin.h>" HAVE_SSE2)
+elseif(APPLE)
+    # Test for both supported architectures
+    # x86_64 and arm64
+    set(ARCHITECTURES_LIST "arm64;x86_64")
+    
+    message(STATUS "Checking SSE2 support using SSE2NEON library for arm64 and x86_64 architectures")
+    foreach (current_arch IN LISTS ARCHITECTURES_LIST)
+
+        set (CMAKE_OSX_ARCHITECTURES "${current_arch}")
+        
+        if(current_arch STREQUAL arm64)
+            if (HAVE_NEON)
+                set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+                set(_sse2_header_ "#include <sse2neon.h>")
 
-check_cxx_source_compiles ("${_SSE2_TEST_SOURCE_CODE}" HAVE_SSE2)
+                set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON")
+            endif()
+        elseif(current_arch STREQUAL x86_64)
+            set(_sse2_header_ "#include <emmintrin.h>")
+            set(_output_var_name_ "HAVE_SSE2")
+        endif()
+
+        check_arm_neon_availability("${_sse2_header_}" ${_output_var_name_})
+
+    endforeach()
+elseif(NOT APPLE)
+    # Other arm platform
+    set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+    check_arm_neon_availability("#include <sse2neon.h>" HAVE_SSE2)
+endif()
 
 set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
 set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_old}")
+set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_old}")
+
 unset(_cmake_required_flags_old)
 unset(_cmake_required_libraries_old)
+unset(_cmake_osx_architectures_old)
+
 
-mark_as_advanced(HAVE_SSE2)
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index dc84eddf70..5420b495d4 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -10,14 +10,16 @@ set(PLATFORM_LINK_OPTIONS "")
 
 ###############################################################################
 # Define if SSE2 can be used.
-# Check for SSE2 first since some compile flags need to be set on Apple ARM.
 
-include(CheckSupportSSE2)
+# Check for SSE2 only if the user asked for it.
+if(OCIO_USE_SSE)
+    include(CheckSupportSSE2)
+endif()
 
-if(NOT HAVE_SSE2)
+if(NOT HAVE_SSE2 AND NOT HAVE_SSE2_WITH_SSE2NEON)
     message(STATUS "Disabling SSE optimizations, as the target doesn't support them")
     set(OCIO_USE_SSE OFF)
-endif(NOT HAVE_SSE2)
+endif()
 
 ###############################################################################
 # Compile flags
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index a68d5c3676..5fe522b0d5 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -292,7 +292,7 @@ target_link_libraries(OpenColorIO
         MINIZIP::minizip-ng
 )
 
-if(OCIO_USE_SSE AND HAVE_NEON)
+if(OCIO_USE_SSE AND HAVE_SSE2_WITH_SSE2NEON)
     target_link_libraries(OpenColorIO
         PRIVATE
             sse2neon
@@ -330,15 +330,17 @@ if(BUILD_SHARED_LIBS OR (OCIO_BUILD_PYTHON AND UNIX))
 endif()
 
 if(OCIO_USE_SSE)
-    target_compile_definitions(OpenColorIO
-        PRIVATE
-            USE_SSE
-    )
+    if(HAVE_SSE2)
+        target_compile_definitions(OpenColorIO
+            PRIVATE
+                USE_SSE
+        )
+    endif()
 
-    if(HAVE_NEON)
+    if(HAVE_SSE2_WITH_SSE2NEON)
         target_compile_definitions(OpenColorIO
             PRIVATE
-                USE_SSE2NEON
+                USE_SSE2_WITH_SSE2NEON
         )
     endif()
 endif()
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index f63e08cdef..3c40a858e1 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -6,14 +6,23 @@
 #define INCLUDED_OCIO_SSE_H
 
 
+#if !defined(USE_SSE)
+    #define USING_CPP 1
+#endif
+
 #ifdef USE_SSE
 
-#ifndef USE_SSE2NEON
+// If it is not arm64, same behavior as before.
+#if !defined(__aarch64__)
     #include <emmintrin.h>
-#else
-    #include <sse2neon.h>
+    #define USING_INTEL_SSE2 1
+#elif defined(__aarch64__)
+    // ARM architecture A64 (ARM64)
+    #if defined(USE_SSE2_WITH_SSE2NEON)
+        #include <sse2neon.h>
+        #define USING_INTEL_SSE2_WITH_SSE2NEON 1
+    #endif
 #endif
-#include <stdio.h>
 
 #include <OpenColorIO/OpenColorIO.h>
 
@@ -25,27 +34,29 @@ namespace OCIO_NAMESPACE
 // Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since
 // it is redefining two of the functions from sse2neon.
 
-#ifdef USE_SSE2NEON
-    // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to 
-    // NaN handling.
-
-    // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were 
-    // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the 
-    // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the 
-    // filtering behavior. The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as 
-    // the Intel _mm_max_ps / _mm_min_ps since they always returns the non-NaN argument 
-    // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in 
-    // the first argument continues to be filtered out.
-    static inline __m128 _mm_max_ps(__m128 a, __m128 b)
-    {
-        return vreinterpretq_m128_f32(
-            vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-    }
-    static inline __m128 _mm_min_ps(__m128 a, __m128 b)
-    {
-        return vreinterpretq_m128_f32(
-            vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-    }
+#if defined(__aarch64__)
+    #if defined(USE_SSE2_WITH_SSE2NEON)
+        // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to 
+        // NaN handling.
+
+        // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were 
+        // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the 
+        // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the 
+        // filtering behavior. The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as 
+        // the Intel _mm_max_ps / _mm_min_ps since they always returns the non-NaN argument 
+        // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in 
+        // the first argument continues to be filtered out.
+        static inline __m128 _mm_max_ps(__m128 a, __m128 b)
+        {
+            return vreinterpretq_m128_f32(
+                vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+        }
+        static inline __m128 _mm_min_ps(__m128 a, __m128 b)
+        {
+            return vreinterpretq_m128_f32(
+                vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+        }
+    #endif
 #endif
 
 // Macros for alignment declarations
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index 3d6cbc7506..53f50d1031 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -26,7 +26,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
             xxHash
     )
 
-    if(OCIO_USE_SSE AND HAVE_NEON)
+    if(OCIO_USE_SSE AND HAVE_SSE2_WITH_SSE2NEON)
         target_link_libraries(${TEST_BINARY}
             PRIVATE
                 sse2neon
@@ -52,15 +52,17 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
     endif(PRIVATE_INCLUDES)
 
     if(OCIO_USE_SSE)
-        target_compile_definitions(${TEST_BINARY}
-            PRIVATE
-                USE_SSE
-        )
+        if (HAVE_SSE2)
+            target_compile_definitions(${TEST_BINARY}
+                PRIVATE
+                    USE_SSE
+            )
+        endif()
 
-        if(HAVE_NEON)
+        if(HAVE_SSE2_WITH_SSE2NEON)
             target_compile_definitions(${TEST_BINARY}
                 PRIVATE
-                    USE_SSE2NEON
+                    USE_SSE2_WITH_SSE2NEON
             )
         endif()
     endif(OCIO_USE_SSE)
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 7ac48ed1f6..c73541afb6 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -9,7 +9,6 @@
 
 namespace OCIO = OCIO_NAMESPACE;
 
-
 constexpr float qnan = std::numeric_limits<float>::quiet_NaN();
 constexpr float inf = std::numeric_limits<float>::infinity();
 
@@ -23,6 +22,7 @@ void TestLog(float logBase)
                                   0.f,        0.f,  0.f,     inf,
                                  -inf,       -inf, -inf,     0.f,
                                   0.f,        0.f,  0.f,    -inf };
+                                  
     float rgba[32] = {};
 
     OCIO::ConstLogOpDataRcPtr logOp = std::make_shared<OCIO::LogOpData>(
@@ -35,7 +35,7 @@ void TestLog(float logBase)
 
     // LogOpCPU implementation uses optimized logarithm approximation
     // cannot use strict comparison.
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error = 5e-5f;
 #else
     const float error = 1e-5f;
@@ -52,17 +52,26 @@ void TestLog(float logBase)
             expected = logf(std::max(minValue, (float)expected)) / logf(logBase);
         }
 
+        // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f,     0.f,
+        //                                                0.2f,       0.f,   .99f, 128.f,
+        //                                                ... }
         OCIO_CHECK_CLOSE(result, expected, error);
     }
 
     const float resMin = logf(minValue) / logf(logBase);
+
+    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
     OCIO_CHECK_CLOSE(rgba[8], resMin, error);
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
+
+    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
     OCIO_CHECK_CLOSE(rgba[12], resMin, error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
+
     // SSE implementation of sseLog2 & sseExp2 do not behave like CPU.
     // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below.
-#ifdef USE_SSE
+    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     if (logBase == 10.0f)
     {
         OCIO_CHECK_CLOSE(rgba[16], 38.53184509f, error);
@@ -75,10 +84,16 @@ void TestLog(float logBase)
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
+
+    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
     OCIO_CHECK_CLOSE(rgba[20], resMin, error);
     OCIO_CHECK_EQUAL(rgba[23], inf);
+
+    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
     OCIO_CHECK_CLOSE(rgba[24], resMin, error);
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
+
+    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
     OCIO_CHECK_CLOSE(rgba[28], resMin, error);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -127,34 +142,45 @@ void TestAntiLog(float logBase)
 
         // LogOpCPU implementation uses optimized logarithm approximation
         // cannot use strict comparison.
+        // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f,     0.f,
+        //                                                0.2f,       0.f,   .99f, 128.f,
+        //                                                ... }
         OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f));
     }
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_EQUAL(rgba[8], inf);
-    #endif
-#else
+
+    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_EQUAL(rgba[8], inf);
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
 #endif
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
+
+    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
     OCIO_CHECK_CLOSE(rgba[12], 1.0f, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
-    #endif
-#else
+
+    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
+
+    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
     OCIO_CHECK_CLOSE(rgba[20], 1.0f, rtol);
     OCIO_CHECK_EQUAL(rgba[23], inf);
-#ifdef USE_SSE
+
+    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[24], inf);
 #else
     OCIO_CHECK_EQUAL(rgba[24], 0.0f);
 #endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
+
+    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
     OCIO_CHECK_CLOSE(rgba[28], 1.0f, rtol);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -267,43 +293,48 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 
         // LogOpCPU implementation uses optimized logarithm approximation
         // cannot use strict comparison.
+        // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f,     0.f,
+        //                                                0.2f,       0.f,   .99f, 128.f,
+        //                                                ... }
         OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f));
     }
 
     const float res0 = ComputeLog2LinEval(0.0f, redP);
 
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_EQUAL(rgba[8], inf);
-    #endif
-#else
+    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_EQUAL(rgba[8], inf);
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
 #endif
 
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
+    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
     OCIO_CHECK_CLOSE(rgba[12], res0, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
-    #endif
-#else
+    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
+    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
     OCIO_CHECK_CLOSE(rgba[20], res0, rtol);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-#ifdef USE_SSE
+    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[24], inf);
 #else
     OCIO_CHECK_CLOSE(rgba[24], ComputeLog2LinEval(-inf, redP), rtol);
 #endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
+    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
     OCIO_CHECK_CLOSE(rgba[28], res0, rtol);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -407,31 +438,40 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
 
         // LogOpCPU implementation uses optimized logarithm approximation
         // cannot use strict comparison
+        // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f,     0.f,
+        //                                                0.2f,       0.f,   .99f, 128.f,
+        //                                                ... }
         OCIO_CHECK_CLOSE(result, expected, error);
     }
 
     const float res0 = ComputeLin2LogEval(0.0f, redP);
     const float resMin = ComputeLin2LogEval(-100.0f, redP);
 
+    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
     OCIO_CHECK_CLOSE(rgba[8], resMin, error);
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
+    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
     OCIO_CHECK_CLOSE(rgba[12], res0, error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-#ifdef USE_SSE
+    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error);
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
+    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
     OCIO_CHECK_CLOSE(rgba[20], res0, error);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
+    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
     OCIO_CHECK_CLOSE(rgba[24], resMin, error);
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
+    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
     OCIO_CHECK_CLOSE(rgba[28], res0, error);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -462,25 +502,28 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, numPixels);
 
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
 #endif // USE_SSE
 
+    // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }.
     OCIO_CHECK_CLOSE(rgba[0], -0.168771237955f, error);
     OCIO_CHECK_CLOSE(rgba[1], -0.048771237955f, error);
     OCIO_CHECK_CLOSE(rgba[2], -0.036771237955f, error);
+
+    // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error);
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error);
 #endif // USE_SSE
-
     OCIO_CHECK_CLOSE(rgba[6], 0.68141615509f, error);
 
-#ifdef USE_SSE
+   // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[8], -inf);
     OCIO_CHECK_CLOSE(rgba[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));
@@ -500,19 +543,23 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRendererNoLS = OCIO::GetLogRenderer(lognols, true);
     pRendererNoLS->apply(rgbaImage, rgba_nols, numPixels);
 
+    // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[0], -0.325512374199f, error);
     OCIO_CHECK_CLOSE(rgba_nols[1], -0.127141806077f, error);
     OCIO_CHECK_CLOSE(rgba_nols[2], -0.107304749265f, error);
+    
+    // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error);
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error);
 #endif // USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[6], 0.68141615509f, error);
-    OCIO_CHECK_EQUAL(rgba_nols[8], -inf);
 
-#ifdef USE_SSE
+    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
+    OCIO_CHECK_EQUAL(rgba_nols[8], -inf);
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba_nols[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba_nols[10]));
 #else
@@ -530,19 +577,25 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRendererNoBreak = OCIO::GetLogRenderer(lognobreak, true);
     pRendererNoBreak->apply(rgbaImage, rgba_nobreak, numPixels);
 
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error2 = 1e-5f;
 #else
     const float error2 = 1e-7f;
 #endif // USE_SSE
+
+    // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[0], -24.6f, error2);
     OCIO_CHECK_CLOSE(rgba_nobreak[1], -0.264385618977f, error2);
     OCIO_CHECK_CLOSE(rgba_nobreak[2], -0.20700938942f, error2);
+
+    // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[4], 0.028548034423f, error2);
     OCIO_CHECK_CLOSE(rgba_nobreak[5], 0.170878935551f, error2);
     OCIO_CHECK_CLOSE(rgba_nobreak[6], 0.68141615509, error2);
+
+    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2);
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2);
 #else
     OCIO_CHECK_EQUAL(rgba_nobreak[9], inf);
@@ -553,14 +606,8 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
 OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 {
     // Inverse of previous test.
-    const float rgbaImage[12] = { -0.168771237955f,
-                                  -0.048771237955f,
-                                  -0.036771237955f,
-                                   0.f,
-                                   0.047228762045f,
-                                   0.170878935551f,
-                                   0.68141615509f,
-                                   0.f,
+    const float rgbaImage[12] = { -0.168771237955f, -0.048771237955f, -0.036771237955f, 0.f,
+                                   0.047228762045f, 0.170878935551f, 0.68141615509f, 0.f,
                                   -inf,   inf,   qnan,  0.0f };
 
     float rgba[12] = {};
@@ -574,23 +621,29 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, 3);
 
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
 #endif // USE_SSE
+
+    // Evaluating output for input rgbaImage[0-2] = 
+    // { -0.168771237955f, -0.048771237955f, -0.036771237955f, ... }.
     OCIO_CHECK_CLOSE(rgba[0], -0.1f, error);
     OCIO_CHECK_CLOSE(rgba[1],  0.0f, error);
     OCIO_CHECK_CLOSE(rgba[2],  0.01f, error);
+
+    // Evaluating output for input rgbaImage[4-6] = 
+    // { 0.047228762045f, 0.170878935551f, 0.68141615509f, ... }.
     OCIO_CHECK_CLOSE(rgba[4],  0.08f, error);
     OCIO_CHECK_CLOSE(rgba[5],  0.16f, error);
     OCIO_CHECK_CLOSE(rgba[6],  1.16f, 10.0f * error);
+
+    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_EQUAL(rgba[8], -inf);
-#ifdef USE_SSE
-    #ifndef USE_SSE2NEON
-        OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
-    #endif
-#else
+#ifdef USING_INTEL_SSE2
+    OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
+#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[9], inf);
 #endif
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));

From 6bac9f7a74384e5fce19e7f05ccc78ddc4c16c78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 16 Feb 2023 09:18:49 -0500
Subject: [PATCH 60/81] Tweaking comments, some cmake variables naming and
 other minors changes.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                              |  4 +-
 share/cmake/utils/CheckSupportARMNeon.cmake |  6 +--
 share/cmake/utils/CheckSupportSSE2.cmake    | 40 +++++++---------
 share/cmake/utils/CompilerFlags.cmake       |  1 -
 src/OpenColorIO/SSE.h                       |  6 +--
 tests/cpu/ops/log/LogOpCPU_tests.cpp        | 52 ++++++++++-----------
 6 files changed, 49 insertions(+), 60 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 82ff26bcd1..2bf2da40d1 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -186,7 +186,7 @@ include(CheckSupportGL)
 
 
 ###############################################################################
-# Check for ARM neon (if the user asked for it)
+# Check for ARM neon
 
 if(OCIO_USE_SSE)
     include(CheckSupportARMNeon)
@@ -197,7 +197,7 @@ endif()
 # Add sse2neon to the build if ARM NEON intrinsics are supported.
 
 if(HAVE_NEON AND OCIO_USE_SSE)
-    # Install sse2nenon. Please note that sse2neon is downloaded during the configure step as it is
+    # Install sse2neon. Please note that sse2neon is downloaded during the configure step as it is
     # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
 
     # Could use ocio_handle_dependency once this is rebased from main (once the CMake min and 
diff --git a/share/cmake/utils/CheckSupportARMNeon.cmake b/share/cmake/utils/CheckSupportARMNeon.cmake
index acab5e5946..5d17854757 100644
--- a/share/cmake/utils/CheckSupportARMNeon.cmake
+++ b/share/cmake/utils/CheckSupportARMNeon.cmake
@@ -5,7 +5,7 @@
 
 include(CheckCXXSourceCompiles)
 
-set(_cmake_osx_architectures_old "${CMAKE_OSX_ARCHITECTURES}")
+set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
 
 if(APPLE)
     set(CMAKE_OSX_ARCHITECTURES "arm64")
@@ -21,7 +21,7 @@ int main()
 
 check_cxx_source_compiles ("${source_code}" HAVE_NEON)
 
-set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_old}")
+set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}")
 
-unset(_cmake_osx_architectures_old)
+unset(_cmake_osx_architectures_orig)
 mark_as_advanced(HAVE_NEON)
diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index 311dbdbc87..26bc0b396d 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -3,10 +3,9 @@
 
 include(CheckCXXSourceCompiles)
 
-set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}")
-set(_cmake_required_libraries_old "${CMAKE_REQUIRED_LIBRARIES}")
-
-set(_cmake_osx_architectures_old "${CMAKE_OSX_ARCHITECTURES}")
+set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}")
+set(_cmake_required_libraries_orig "${CMAKE_REQUIRED_LIBRARIES}")
+set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
 
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger
@@ -20,7 +19,7 @@ if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
 endif()
 
 
-macro(check_arm_neon_availability _check_sse2_header_ _check_output_var_name_)
+macro(check_sse2_availability _check_sse2_header_ _check_output_var_name_)
     set(_SSE2_TEST_SOURCE_CODE "
     ${_check_sse2_header_}
     int main ()
@@ -38,8 +37,8 @@ macro(check_arm_neon_availability _check_sse2_header_ _check_output_var_name_)
 endmacro()
 
 if(NOT HAVE_NEON)
-    check_arm_neon_availability("#include <emmintrin.h>" HAVE_SSE2)
-elseif(APPLE)
+    check_sse2_availability("#include <emmintrin.h>" HAVE_SSE2)
+elseif(APPLE AND HAVE_NEON)
     # Test for both supported architectures
     # x86_64 and arm64
     set(ARCHITECTURES_LIST "arm64;x86_64")
@@ -50,32 +49,25 @@ elseif(APPLE)
         set (CMAKE_OSX_ARCHITECTURES "${current_arch}")
         
         if(current_arch STREQUAL arm64)
-            if (HAVE_NEON)
-                set(CMAKE_REQUIRED_LIBRARIES sse2neon)
-                set(_sse2_header_ "#include <sse2neon.h>")
-
-                set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON")
-            endif()
+            set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+            set(_sse2_header_ "#include <sse2neon.h>")
+            set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON")
         elseif(current_arch STREQUAL x86_64)
             set(_sse2_header_ "#include <emmintrin.h>")
             set(_output_var_name_ "HAVE_SSE2")
         endif()
 
-        check_arm_neon_availability("${_sse2_header_}" ${_output_var_name_})
+        check_sse2_availability("${_sse2_header_}" ${_output_var_name_})
 
     endforeach()
-elseif(NOT APPLE)
-    # Other arm platform
-    set(CMAKE_REQUIRED_LIBRARIES sse2neon)
-    check_arm_neon_availability("#include <sse2neon.h>" HAVE_SSE2)
 endif()
 
-set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}")
-set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_old}")
-set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_old}")
+set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}")
+set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_orig}")
+set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}")
 
-unset(_cmake_required_flags_old)
-unset(_cmake_required_libraries_old)
-unset(_cmake_osx_architectures_old)
+unset(_cmake_required_flags_orig)
+unset(_cmake_required_libraries_orig)
+unset(_cmake_osx_architectures_orig)
 
 
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index 5420b495d4..a9a95d3753 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -11,7 +11,6 @@ set(PLATFORM_LINK_OPTIONS "")
 ###############################################################################
 # Define if SSE2 can be used.
 
-# Check for SSE2 only if the user asked for it.
 if(OCIO_USE_SSE)
     include(CheckSupportSSE2)
 endif()
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 3c40a858e1..12b43e8972 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -6,11 +6,9 @@
 #define INCLUDED_OCIO_SSE_H
 
 
-#if !defined(USE_SSE)
+#ifndef USE_SSE
     #define USING_CPP 1
-#endif
-
-#ifdef USE_SSE
+#else
 
 // If it is not arm64, same behavior as before.
 #if !defined(__aarch64__)
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index c73541afb6..083cd6a2b9 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -60,17 +60,17 @@ void TestLog(float logBase)
 
     const float resMin = logf(minValue) / logf(logBase);
 
-    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+    // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
     OCIO_CHECK_CLOSE(rgba[8], resMin, error);
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
-    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
+    // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
     OCIO_CHECK_CLOSE(rgba[12], resMin, error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // SSE implementation of sseLog2 & sseExp2 do not behave like CPU.
     // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below.
-    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+    // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
 #if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     if (logBase == 10.0f)
     {
@@ -85,15 +85,15 @@ void TestLog(float logBase)
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
-    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
+    // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
     OCIO_CHECK_CLOSE(rgba[20], resMin, error);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+    // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
     OCIO_CHECK_CLOSE(rgba[24], resMin, error);
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
-    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
+    // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
     OCIO_CHECK_CLOSE(rgba[28], resMin, error);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -148,7 +148,7 @@ void TestAntiLog(float logBase)
         OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f));
     }
 
-    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+    // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
 #ifdef USING_INTEL_SSE2
     OCIO_CHECK_EQUAL(rgba[8], inf);
 #elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
@@ -156,11 +156,11 @@ void TestAntiLog(float logBase)
 #endif
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
-    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
+    // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
     OCIO_CHECK_CLOSE(rgba[12], 1.0f, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+    // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
 #ifdef USING_INTEL_SSE2
     OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
 #elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
@@ -168,11 +168,11 @@ void TestAntiLog(float logBase)
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
-    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
+    // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
     OCIO_CHECK_CLOSE(rgba[20], 1.0f, rtol);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+    // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
 #if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[24], inf);
 #else
@@ -180,7 +180,7 @@ void TestAntiLog(float logBase)
 #endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
-    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
+    // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
     OCIO_CHECK_CLOSE(rgba[28], 1.0f, rtol);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -301,7 +301,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 
     const float res0 = ComputeLog2LinEval(0.0f, redP);
 
-    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+    // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
 #ifdef USING_INTEL_SSE2
     OCIO_CHECK_EQUAL(rgba[8], inf);
 #elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
@@ -310,11 +310,11 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
-    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
+    // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
     OCIO_CHECK_CLOSE(rgba[12], res0, rtol);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+    // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
 #ifdef USING_INTEL_SSE2
     OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
 #elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
@@ -322,11 +322,11 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
-    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
+    // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
     OCIO_CHECK_CLOSE(rgba[20], res0, rtol);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+    // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
 #if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[24], inf);
 #else
@@ -334,7 +334,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
 #endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
-    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
+    // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
     OCIO_CHECK_CLOSE(rgba[28], res0, rtol);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
@@ -447,15 +447,15 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
     const float res0 = ComputeLin2LogEval(0.0f, redP);
     const float resMin = ComputeLin2LogEval(-100.0f, redP);
 
-    // Evaluating output for input rgbaImage[8] = qnan and rgbaImage[11] = 0.f.
+    // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
     OCIO_CHECK_CLOSE(rgba[8], resMin, error);
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
-    // Evaluating output for input rgbaImage[12] = 0.f and rgbaImage[15] = qnan.
+    // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
     OCIO_CHECK_CLOSE(rgba[12], res0, error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
-    // Evaluating output for input rgbaImage[16] = inf and rgbaImage[19] = 0.f.
+    // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
 #if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error);
 #else
@@ -463,20 +463,20 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
 #endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
-    // Evaluating output for input rgbaImage[20] = 0.f  and rgbaImage[23] = inf.
+    // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
     OCIO_CHECK_CLOSE(rgba[20], res0, error);
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
-    // Evaluating output for input rgbaImage[24] = -inf and rgbaImage[27] = 0.f.
+    // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
     OCIO_CHECK_CLOSE(rgba[24], resMin, error);
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
-    // Evaluating output for input rgbaImage[28] = 0.f and rgbaImage[31] = -inf.
+    // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
     OCIO_CHECK_CLOSE(rgba[28], res0, error);
     OCIO_CHECK_EQUAL(rgba[31], -inf);
 }
 
-OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
+OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 {
     constexpr int numPixels = 3;
     constexpr int numValues = 4 * numPixels;
@@ -603,7 +603,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO_CHECK_CLOSE(rgba_nobreak[10], -24.6f, error2);
 }
 
-OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
+OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
 {
     // Inverse of previous test.
     const float rgbaImage[12] = { -0.168771237955f, -0.048771237955f, -0.036771237955f, 0.f,

From 2422927846bfe6e1b4a21c03314b49dc7d86ded7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 16 Feb 2023 11:34:52 -0500
Subject: [PATCH 61/81] Fixing issue with anti-log (SSE implementation)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 src/OpenColorIO/SSE.h                | 80 ++++++++++++++++++----------
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 29 ----------
 2 files changed, 52 insertions(+), 57 deletions(-)

diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 12b43e8972..57ea87a08f 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -109,10 +109,10 @@ inline __m128 isNegativeSpecial(const __m128 x)
   return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(x), SIGN_SHIFT));
 }
 
-// Select function in SSE version 2
+// Bit-wise select function in SSE version 2
 //
-// Return the parameter arg_false when the parameter mask is 0x0,
-// or the parameter arg_true when the mask is 0xffffffff.
+// Return the parameter arg_false bit where the parameter mask is 0x0,
+// return the parameter arg_true bit where the mask is 0x1.
 //
 // Algorithm Explanation:
 //
@@ -142,7 +142,11 @@ inline __m128 isNegativeSpecial(const __m128 x)
 //
 inline __m128 sseSelect(const __m128& mask, const __m128& arg_true, const __m128& arg_false)
 {
-    return _mm_xor_ps( arg_false, _mm_and_ps( mask, _mm_xor_ps( arg_true, arg_false ) ) );
+    return _mm_xor_ps(                                  // bit-wise XOR of arg_false, (...)
+        arg_false, 
+        _mm_and_ps(                                     // bit-wise AND of mask, (...)
+            mask, 
+            _mm_xor_ps( arg_true, arg_false ) ) );      // bit-wise XOR of arg_true, arg_false
 }
 
 // Coefficients of Chebyshev (minimax) degree 5 polynomial
@@ -162,6 +166,10 @@ static const __m128 PNEXP2 = _mm_set1_ps((float)2.414427569091865207710e-1);
 static const __m128 PNEXP1 = _mm_set1_ps((float)6.930038344665415134202e-1);
 static const __m128 PNEXP0 = _mm_set1_ps((float)1.000002593370603213644);
 
+// Note: The above polynomials have been chosen to acheive a precision of
+// approximately 15 bits of mantissa.
+
+
 // log2 function in SSE version 2
 //
 // The function log2() is evaluated by performing argument
@@ -169,12 +177,14 @@ static const __m128 PNEXP0 = _mm_set1_ps((float)1.000002593370603213644);
 // over a restricted range.
 inline __m128 sseLog2(__m128 x)
 {
-    // y = log2( x ) = log2( 2^exposant * mantissa ) 
-    //               = exposant + log2( mantissa )
+    // y = log2( x ) = log2( 2^exponant * mantissa ) 
+    //               = exponant + log2( mantissa )
 
     __m128 mantissa
-        = _mm_or_ps(
-            _mm_andnot_ps(_mm_castsi128_ps(EMASK), x), EONE);
+        = _mm_or_ps(                                    // OR with EONE
+            _mm_andnot_ps(                              // NOT(EMASK) AND x
+                _mm_castsi128_ps(EMASK), x),            // reinterpret cast int to float
+            EONE);
 
     __m128 log2
         = _mm_add_ps(
@@ -198,14 +208,15 @@ inline __m128 sseLog2(__m128 x)
             PNLOG0);
 
     __m128i exponent
-        = _mm_sub_epi32(
-            _mm_srli_epi32(
-                _mm_and_si128(_mm_castps_si128(x),
+        = _mm_sub_epi32(                                // subtract EBIAS
+            _mm_srli_epi32(                             // right-shift by EXP_SHIFT
+                _mm_and_si128(_mm_castps_si128(x),      // bit-wise AND with EMASK
                     EMASK),
                 EXP_SHIFT),
             EBIAS);
 
-    log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(exponent));
+    log2 = _mm_add_ps(log2, 
+                      _mm_cvtepi32_ps(exponent));       // convert exponent to float
 
     return log2;
 }
@@ -224,24 +235,30 @@ inline __m128 sseExp2(__m128 x)
     // Compute the largest integer not greater than x, i.e., floor(x)
     // Note: cvttps_epi32 simply cast the float value to int. That means cvttps_epi32(-2.7) = -2
     // rather than -3, hence for negative numbers we need to add -1. This ensures that "fraction"
-    // is always in the range [0, 1).
+    // is always in the range [0, 1).  Note that _mm_castps_si128(0xFFFFFFFF) is -1.
+    // If x is outside the INT_MIN to INT_MAX range, _mm_cvttps_epi32 will return 0x80000000
+    // (i.e. INT_MIN, just the sign bit set), which Intel calls the "integer indefinite" value. 
+    // When 1 is subtracted from INT_MIN, it gives INT_MAX.  So floor_x is wrong for values
+    // outside [INT_MIN, INT_MAX] but it's ignored thanks to the checks at the bottom.
+    // It's also wrong for x=NaN, but again it's ok since the polynomial returns NaN and
+    // hence the output is NaN, regardless of floor_x.
     __m128i floor_x
-        = _mm_add_epi32(
-            _mm_cvttps_epi32(x),
-            _mm_castps_si128(
-                _mm_cmpnle_ps(EZERO, x)));
+        = _mm_add_epi32(                                // add a pair of integer arguments
+            _mm_cvttps_epi32(x),                        // convert float to int via truncation
+            _mm_castps_si128(                           // reinterpret cast float to int
+                _mm_cmpnle_ps(EZERO, x)));              // NOT( EZERO <= x ) ? 0xFFFFFFFF : 0
 
     // Compute exp2(floor_x) by moving floor_x to the exponent bits of the floating-point number.
     __m128 zf
-        = _mm_castsi128_ps(
-            _mm_slli_epi32(
-                _mm_add_epi32(floor_x, EBIAS),
+        = _mm_castsi128_ps(                             // reinterpret cast int to float
+            _mm_slli_epi32(                             // left shift by EXP_SHIFT
+                _mm_add_epi32(floor_x, EBIAS),          // add a pair of integer arguments
                 EXP_SHIFT));
 
-    __m128 iexp = _mm_cvtepi32_ps(floor_x);
-    __m128 fraction = _mm_sub_ps(x, iexp);
+    __m128 iexp = _mm_cvtepi32_ps(floor_x);             // convert floor_x to float
+    __m128 fraction = _mm_sub_ps(x, iexp);              // x - iexp
 
-    // Compute exp2(fraction) using a polynomial approximation
+    // Compute exp2(fraction) using a polynomial approximation.
     __m128 mexp
         = _mm_add_ps(
             _mm_mul_ps(
@@ -259,19 +276,26 @@ inline __m128 sseExp2(__m128 x)
                 fraction),
             PNEXP0);
 
-    __m128 exp2 = _mm_mul_ps(zf, mexp);
+    __m128 exp2 = _mm_mul_ps(zf, mexp);                 // zf * mexp
 
     // Handle underflow:
     // If the (unbiased) exponent of zf is less than -126, the result is smaller than
     // the smallest representable floating-point number and an underflow computation is
     // potentially happening. When this happens, force the result to zero.
-    exp2 = _mm_andnot_ps(_mm_cmplt_ps(iexp, ENEG126), exp2);
+    // Note that as described above, floor_x is inaccurate, so the test here uses x.
+    exp2 = _mm_andnot_ps(                               // NOT(...) AND exp2
+        _mm_cmplt_ps(x, ENEG126),                       // iexp < ENEG126 ? 0xFFFFFFFF : 0
+        exp2);
 
     // Handle overflow:
     // If the (unbiased) exponent of zf is greater than 127, the result is larger than
     // the largest representable floating-point number and an overflow computation is
     // potentially happening. When this happens, force the result to positive infinity.
-    exp2 = sseSelect(_mm_cmpgt_ps(iexp, EPOS127), EPOSINF, exp2);
+    // Note that as described above, floor_x is inaccurate, so the test here uses x.
+    exp2 = sseSelect(                                   // (...) is a mask to select EPOSINF, exp2
+        _mm_cmpgt_ps(x, EPOS127),                       // iexp > EPOS127 ? 0xFFFFFFFF : 0
+        EPOSINF, 
+        exp2);
 
     return exp2;
 }
@@ -630,7 +654,7 @@ inline void sseSinCos(const float x, float& sin_x, float& cos_x)
 } // namespace OCIO_NAMESPACE
 
 
-#endif
+#endif  // USE_SSE
 
 
-#endif
+#endif  // INCLUDED_OCIO_SSE_H
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 083cd6a2b9..35ec055856 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -149,11 +149,7 @@ void TestAntiLog(float logBase)
     }
 
     // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_EQUAL(rgba[8], inf);
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
-#endif
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
     // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
@@ -161,11 +157,7 @@ void TestAntiLog(float logBase)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[16], inf);
-#endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
     // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
@@ -173,11 +165,7 @@ void TestAntiLog(float logBase)
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
     // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
-    OCIO_CHECK_EQUAL(rgba[24], inf);
-#else
     OCIO_CHECK_EQUAL(rgba[24], 0.0f);
-#endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
     // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
@@ -302,12 +290,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     const float res0 = ComputeLog2LinEval(0.0f, redP);
 
     // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}.
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_EQUAL(rgba[8], inf);
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8]));
-#endif
-
     OCIO_CHECK_EQUAL(rgba[11], 0.0f);
 
     // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}.
@@ -315,11 +298,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol);
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[16], inf);
-#endif
     OCIO_CHECK_EQUAL(rgba[19], 0.0f);
 
     // Evaluating output for input rgbaImage[20-23] =  {0., 0., 0., inf}.
@@ -327,11 +306,7 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test)
     OCIO_CHECK_EQUAL(rgba[23], inf);
 
     // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
-    OCIO_CHECK_EQUAL(rgba[24], inf);
-#else
     OCIO_CHECK_CLOSE(rgba[24], ComputeLog2LinEval(-inf, redP), rtol);
-#endif
     OCIO_CHECK_EQUAL(rgba[27], 0.0f);
 
     // Evaluating output for input rgbaImage[28-31] = {0., 0.,  0., -inf}.
@@ -641,11 +616,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_EQUAL(rgba[8], -inf);
-#ifdef USING_INTEL_SSE2
-    OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0
-#elif USING_CPP || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_EQUAL(rgba[9], inf);
-#endif
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));
 }
 

From 769d26e6fa669940004a96c6e6539491460ea056 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 16 Feb 2023 11:57:32 -0500
Subject: [PATCH 62/81] Adding a new option OCIO_USE_SIMD which does the same
 thing as OCIO_USE_SSE (they are sync up). Will replace OCIO_USE_SSE
 eventually.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 .github/workflows/ci_workflow.yml                |  6 +++---
 CMakeLists.txt                                   | 10 ++++++----
 docs/quick_start/installation.rst                |  2 ++
 share/cmake/modules/install/InstallOpenEXR.cmake |  2 ++
 share/cmake/utils/CompilerFlags.cmake            |  4 ++--
 src/OpenColorIO/CMakeLists.txt                   |  4 ++--
 tests/cpu/CMakeLists.txt                         |  6 +++---
 tests/gpu/CMakeLists.txt                         |  4 ++--
 tests/osl/CMakeLists.txt                         |  4 ++--
 9 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml
index bf98506e1a..0fcf7e2c37 100644
--- a/.github/workflows/ci_workflow.yml
+++ b/.github/workflows/ci_workflow.yml
@@ -204,7 +204,7 @@ jobs:
                 -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \
                 -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
                 -DOCIO_BUILD_GPU_TESTS=OFF \
-                -DOCIO_USE_SSE=${{ matrix.use-sse }} \
+                -DOCIO_USE_SIMD=${{ matrix.use-sse }} \
                 -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \
                 -DOCIO_INSTALL_EXT_PACKAGES=ALL \
                 -DOCIO_WARNING_AS_ERROR=ON \
@@ -345,7 +345,7 @@ jobs:
                 -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \
                 -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
                 -DOCIO_BUILD_GPU_TESTS=OFF \
-                -DOCIO_USE_SSE=${{ matrix.use-sse }} \
+                -DOCIO_USE_SIMD=${{ matrix.use-sse }} \
                 -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \
                 -DOCIO_INSTALL_EXT_PACKAGES=ALL \
                 -DOCIO_WARNING_AS_ERROR=ON \
@@ -493,7 +493,7 @@ jobs:
                 -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \
                 -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \
                 -DOCIO_BUILD_GPU_TESTS=OFF \
-                -DOCIO_USE_SSE=${{ matrix.use-sse }} \
+                -DOCIO_USE_SIMD=${{ matrix.use-sse }} \
                 -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \
                 -DOCIO_INSTALL_EXT_PACKAGES=ALL \
                 -DOCIO_WARNING_AS_ERROR=ON \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2bf2da40d1..7ea95a2463 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -173,11 +173,13 @@ endif()
 
 ###############################################################################
 # Optimization / internal linking preferences
-
+# SIMD refers to either SIMD or Advanced SIMD terminology.
 option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON)
+mark_as_advanced(OCIO_USE_SSE)
+option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations (Will replace OCIO_USE_SSE)" ${OCIO_USE_SSE})
+option(OCIO_USE_OIIO_CMAKE_CONFIG "Specify whether to look for OIIO using the generated CMake Config script instead of the custom FindOpenImageIO.cmake script" OFF)
 option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." OFF)
 
-
 ###############################################################################
 # GPU configuration
 message(STATUS "")
@@ -188,7 +190,7 @@ include(CheckSupportGL)
 ###############################################################################
 # Check for ARM neon
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
     include(CheckSupportARMNeon)
 endif()
 
@@ -196,7 +198,7 @@ endif()
 ###############################################################################
 # Add sse2neon to the build if ARM NEON intrinsics are supported.
 
-if(HAVE_NEON AND OCIO_USE_SSE)
+if(HAVE_NEON AND OCIO_USE_SIMD)
     # Install sse2neon. Please note that sse2neon is downloaded during the configure step as it is
     # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
 
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index 788ae7e16b..37dbe4c5df 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -278,6 +278,8 @@ Here are the most common OCIO-specific CMake options (the default values are sho
 - ``-DOCIO_BUILD_PYTHON=ON`` (Set to OFF to not build the Python binding)
 - ``-DOCIO_BUILD_OPENFX=OFF`` (Set to ON to build the OpenFX plug-ins)
 - ``-DOCIO_USE_SSE=ON`` (Set to OFF to turn off SSE CPU performance optimizations)
+  - ``OCIO_USE_SSE`` will be deprecated in favor of ``OCIO_USE_SIMD`` at some point in the future.
+- ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations)
 - ``-DOCIO_BUILD_TESTS=ON`` (Set to OFF to not build the unit tests)
 - ``-DOCIO_BUILD_GPU_TESTS=ON`` (Set to OFF to not build the GPU unit tests)
 - ``-DOCIO_USE_HEADLESS=OFF`` (Set to ON to do headless GPU reendering)
diff --git a/share/cmake/modules/install/InstallOpenEXR.cmake b/share/cmake/modules/install/InstallOpenEXR.cmake
index 419fb4aa59..ffbd3a1f55 100644
--- a/share/cmake/modules/install/InstallOpenEXR.cmake
+++ b/share/cmake/modules/install/InstallOpenEXR.cmake
@@ -201,6 +201,7 @@ if(_OpenEXR_TARGET_CREATE)
         IMPORTED_LOCATION ${IlmThread_LIBRARY}
         INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR}"
         INTERFACE_LINK_LIBRARIES "OpenEXR::IlmThreadConfig;OpenEXR::IlmThreadConfig;OpenEXR::Iex;Threads::Threads"
+        STATIC_LIBRARY_OPTIONS "-no_warning_for_no_symbols"
     )
     set_target_properties(OpenEXR::IlmThreadConfig PROPERTIES
         INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR};${OpenEXR_INCLUDE_DIR}/OpenEXR"
@@ -217,6 +218,7 @@ if(_OpenEXR_TARGET_CREATE)
         IMPORTED_LOCATION ${OpenEXRCore_LIBRARY}
         INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR}"
         INTERFACE_LINK_LIBRARIES "OpenEXR::IlmThreadConfig;ZLIB::ZLIB;\$<LINK_ONLY:Imath::ImathConfig>"
+        STATIC_LIBRARY_OPTIONS "-no_warning_for_no_symbols"
     )
     set_target_properties(OpenEXR::OpenEXRUtil PROPERTIES
         IMPORTED_LOCATION ${OpenEXRUtil_LIBRARY}
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index a9a95d3753..d9aa05d43f 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -11,13 +11,13 @@ set(PLATFORM_LINK_OPTIONS "")
 ###############################################################################
 # Define if SSE2 can be used.
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
     include(CheckSupportSSE2)
 endif()
 
 if(NOT HAVE_SSE2 AND NOT HAVE_SSE2_WITH_SSE2NEON)
     message(STATUS "Disabling SSE optimizations, as the target doesn't support them")
-    set(OCIO_USE_SSE OFF)
+    set(OCIO_USE_SIMD OFF)
 endif()
 
 ###############################################################################
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 5fe522b0d5..791ade3f91 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -292,7 +292,7 @@ target_link_libraries(OpenColorIO
         MINIZIP::minizip-ng
 )
 
-if(OCIO_USE_SSE AND HAVE_SSE2_WITH_SSE2NEON)
+if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON)
     target_link_libraries(OpenColorIO
         PRIVATE
             sse2neon
@@ -329,7 +329,7 @@ if(BUILD_SHARED_LIBS OR (OCIO_BUILD_PYTHON AND UNIX))
     set_property(TARGET OpenColorIO PROPERTY POSITION_INDEPENDENT_CODE ON)
 endif()
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
     if(HAVE_SSE2)
         target_compile_definitions(OpenColorIO
             PRIVATE
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index 53f50d1031..245caeb9cb 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -26,7 +26,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
             xxHash
     )
 
-    if(OCIO_USE_SSE AND HAVE_SSE2_WITH_SSE2NEON)
+    if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON)
         target_link_libraries(${TEST_BINARY}
             PRIVATE
                 sse2neon
@@ -51,7 +51,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
         )
     endif(PRIVATE_INCLUDES)
 
-    if(OCIO_USE_SSE)
+    if(OCIO_USE_SIMD)
         if (HAVE_SSE2)
             target_compile_definitions(${TEST_BINARY}
                 PRIVATE
@@ -65,7 +65,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
                     USE_SSE2_WITH_SSE2NEON
             )
         endif()
-    endif(OCIO_USE_SSE)
+    endif(OCIO_USE_SIMD)
 
     if(WIN32)
         # A windows application linking to eXpat static libraries must
diff --git a/tests/gpu/CMakeLists.txt b/tests/gpu/CMakeLists.txt
index 2245fbf1d5..4a19e6b5d6 100644
--- a/tests/gpu/CMakeLists.txt
+++ b/tests/gpu/CMakeLists.txt
@@ -26,12 +26,12 @@ set(SOURCES
 
 add_executable(test_gpu_exec ${SOURCES})
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
 	target_compile_definitions(test_gpu_exec
 		PRIVATE
 			USE_SSE
 	)
-endif(OCIO_USE_SSE)
+endif(OCIO_USE_SIMD)
 
 set_target_properties(test_gpu_exec PROPERTIES 
 	COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"
diff --git a/tests/osl/CMakeLists.txt b/tests/osl/CMakeLists.txt
index 3316720d47..3f327ff295 100644
--- a/tests/osl/CMakeLists.txt
+++ b/tests/osl/CMakeLists.txt
@@ -18,12 +18,12 @@ set(SOURCES
 
 add_executable(test_osl_exec ${SOURCES})
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
     target_compile_definitions(test_osl_exec
         PRIVATE
             USE_SSE
     )
-endif(OCIO_USE_SSE)
+endif(OCIO_USE_SIMD)
 
 set_target_properties(test_osl_exec PROPERTIES 
     COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"

From 6dddbe519a39f94190a7d7ec59a8139f0b68f11a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 17 Feb 2023 08:57:13 -0500
Subject: [PATCH 63/81] Fix last issue discovered by unit test in sseExp2
 Reverted defined changes in LogOpCPU_tests Updated ocio.bat with
 OCIO_USE_SIMD Minors comments changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                       |  1 -
 docs/quick_start/installation.rst    |  5 ++---
 share/dev/windows/ocio.bat           |  2 +-
 src/OpenColorIO/SSE.h                | 14 ++++++--------
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 22 +++++++++++-----------
 5 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ea95a2463..df071b999d 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -173,7 +173,6 @@ endif()
 
 ###############################################################################
 # Optimization / internal linking preferences
-# SIMD refers to either SIMD or Advanced SIMD terminology.
 option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON)
 mark_as_advanced(OCIO_USE_SSE)
 option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations (Will replace OCIO_USE_SSE)" ${OCIO_USE_SSE})
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index 37dbe4c5df..64b973bc0a 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -277,9 +277,8 @@ Here are the most common OCIO-specific CMake options (the default values are sho
 - ``-DOCIO_USE_OIIO_FOR_APPS=OFF`` (Set ON to build tools with OpenImageIO rather than OpenEXR)
 - ``-DOCIO_BUILD_PYTHON=ON`` (Set to OFF to not build the Python binding)
 - ``-DOCIO_BUILD_OPENFX=OFF`` (Set to ON to build the OpenFX plug-ins)
-- ``-DOCIO_USE_SSE=ON`` (Set to OFF to turn off SSE CPU performance optimizations)
-  - ``OCIO_USE_SSE`` will be deprecated in favor of ``OCIO_USE_SIMD`` at some point in the future.
-- ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations)
+- ``-DOCIO_USE_SSE=ON`` (Deprecated -- please use OCIO_USE_SIMD)
+- ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations, such as SSE and NEON)
 - ``-DOCIO_BUILD_TESTS=ON`` (Set to OFF to not build the unit tests)
 - ``-DOCIO_BUILD_GPU_TESTS=ON`` (Set to OFF to not build the GPU unit tests)
 - ``-DOCIO_USE_HEADLESS=OFF`` (Set to ON to do headless GPU reendering)
diff --git a/share/dev/windows/ocio.bat b/share/dev/windows/ocio.bat
index 7f24bc279b..a4762a97d9 100644
--- a/share/dev/windows/ocio.bat
+++ b/share/dev/windows/ocio.bat
@@ -206,7 +206,7 @@ if !DO_CONFIGURE!==1 (
         -DOCIO_BUILD_TESTS=ON^
         -DOCIO_BUILD_GPU_TESTS=ON^
         -DOCIO_BUILD_DOCS=OFF^
-        -DOCIO_USE_SSE=ON^
+        -DOCIO_USE_SIMD=ON^
         -DOCIO_WARNING_AS_ERROR=ON^
         -DOCIO_BUILD_JAVA=OFF^
         "!OCIO_PATH!"
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 57ea87a08f..95d7b88d49 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -6,19 +6,17 @@
 #define INCLUDED_OCIO_SSE_H
 
 
-#ifndef USE_SSE
-    #define USING_CPP 1
-#else
+#if defined(USE_SSE) || defined(USE_SSE2_WITH_SSE2NEON)
 
 // If it is not arm64, same behavior as before.
 #if !defined(__aarch64__)
-    #include <emmintrin.h>
-    #define USING_INTEL_SSE2 1
+    #if defined(USE_SSE)
+        #include <emmintrin.h>
+    #endif
 #elif defined(__aarch64__)
     // ARM architecture A64 (ARM64)
     #if defined(USE_SSE2_WITH_SSE2NEON)
         #include <sse2neon.h>
-        #define USING_INTEL_SSE2_WITH_SSE2NEON 1
     #endif
 #endif
 
@@ -82,7 +80,7 @@ static const __m128i EBIAS = _mm_set1_epi32(EXP_BIAS);
 static const __m128 EONE    = _mm_set1_ps(1.0f);
 static const __m128 EZERO   = _mm_set1_ps(0.0f);
 static const __m128 ENEG126 = _mm_set1_ps(-126.0f);
-static const __m128 EPOS127 = _mm_set1_ps(127.0f);
+static const __m128 EPOS128 = _mm_set1_ps(128.0f);
 
 static const __m128 EPOSINF = _mm_set1_ps(std::numeric_limits<float>::infinity());
 
@@ -293,7 +291,7 @@ inline __m128 sseExp2(__m128 x)
     // potentially happening. When this happens, force the result to positive infinity.
     // Note that as described above, floor_x is inaccurate, so the test here uses x.
     exp2 = sseSelect(                                   // (...) is a mask to select EPOSINF, exp2
-        _mm_cmpgt_ps(x, EPOS127),                       // iexp > EPOS127 ? 0xFFFFFFFF : 0
+        _mm_cmpge_ps(x, EPOS128),                       // iexp > EPOS128 ? 0xFFFFFFFF : 0
         EPOSINF, 
         exp2);
 
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 35ec055856..04430b75f2 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -35,7 +35,7 @@ void TestLog(float logBase)
 
     // LogOpCPU implementation uses optimized logarithm approximation
     // cannot use strict comparison.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error = 5e-5f;
 #else
     const float error = 1e-5f;
@@ -71,7 +71,7 @@ void TestLog(float logBase)
     // SSE implementation of sseLog2 & sseExp2 do not behave like CPU.
     // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below.
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     if (logBase == 10.0f)
     {
         OCIO_CHECK_CLOSE(rgba[16], 38.53184509f, error);
@@ -431,7 +431,7 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error);
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
@@ -477,7 +477,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, numPixels);
 
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
@@ -490,7 +490,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error);
@@ -498,7 +498,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO_CHECK_CLOSE(rgba[6], 0.68141615509f, error);
 
    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_EQUAL(rgba[8], -inf);
     OCIO_CHECK_CLOSE(rgba[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));
@@ -525,7 +525,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error);
@@ -534,7 +534,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_EQUAL(rgba_nols[8], -inf);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba_nols[10]));
 #else
@@ -552,7 +552,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRendererNoBreak = OCIO::GetLogRenderer(lognobreak, true);
     pRendererNoBreak->apply(rgbaImage, rgba_nobreak, numPixels);
 
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error2 = 1e-5f;
 #else
     const float error2 = 1e-7f;
@@ -570,7 +570,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2);
 #else
     OCIO_CHECK_EQUAL(rgba_nobreak[9], inf);
@@ -596,7 +596,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, 3);
 
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;

From 85ff4a00f5294443845a77e211b6e59fa410a68b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 17 Feb 2023 09:24:39 -0500
Subject: [PATCH 64/81] Fixing typo, using #ifdef since this is what we were
 using before.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 04430b75f2..06e98fe0da 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -35,7 +35,7 @@ void TestLog(float logBase)
 
     // LogOpCPU implementation uses optimized logarithm approximation
     // cannot use strict comparison.
-#if USE_SSE
+#ifdef USE_SSE
     const float error = 5e-5f;
 #else
     const float error = 1e-5f;
@@ -71,7 +71,7 @@ void TestLog(float logBase)
     // SSE implementation of sseLog2 & sseExp2 do not behave like CPU.
     // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below.
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#if USE_SSE
+#ifdef USE_SSE
     if (logBase == 10.0f)
     {
         OCIO_CHECK_CLOSE(rgba[16], 38.53184509f, error);
@@ -431,7 +431,7 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test)
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15]));
 
     // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}.
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error);
 #else
     OCIO_CHECK_EQUAL(rgba[16], inf);
@@ -477,7 +477,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, numPixels);
 
-#if USE_SSE
+#ifdef USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
@@ -490,7 +490,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error);
@@ -498,7 +498,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO_CHECK_CLOSE(rgba[6], 0.68141615509f, error);
 
    // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_EQUAL(rgba[8], -inf);
     OCIO_CHECK_CLOSE(rgba[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10]));
@@ -525,7 +525,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error);
@@ -534,7 +534,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_EQUAL(rgba_nols[8], -inf);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[9], 26.2f, 10.0f * error);
     OCIO_CHECK_ASSERT(OCIO::IsNan(rgba_nols[10]));
 #else
@@ -552,7 +552,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRendererNoBreak = OCIO::GetLogRenderer(lognobreak, true);
     pRendererNoBreak->apply(rgbaImage, rgba_nobreak, numPixels);
 
-#if USE_SSE
+#ifdef USE_SSE
     const float error2 = 1e-5f;
 #else
     const float error2 = 1e-7f;
@@ -570,7 +570,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2);
 #else
     OCIO_CHECK_EQUAL(rgba_nobreak[9], inf);
@@ -596,7 +596,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, 3);
 
-#if USE_SSE
+#ifdef USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;

From 266f9407885a2bf118421ff337cce8f497ff8346 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 23 Feb 2023 10:40:07 -0500
Subject: [PATCH 65/81] Adding a line for universal build option for Cmake
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 docs/quick_start/installation.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index 64b973bc0a..ab5dc74c0e 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -286,6 +286,10 @@ Here are the most common OCIO-specific CMake options (the default values are sho
 - ``-DOCIO_BUILD_DOCS=OFF`` (Set to ON to build the documentation)
 - ``-DOCIO_BUILD_FROZEN_DOCS=OFF`` (Set to ON to update the Python documentation)
 
+On the newest Apple chipset (M1+), a universal build can be done with the following option:
+
+- ``-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"``
+
 Several command-line tools (such as ``ocioconvert``) require reading or writing image files.
 If ``OCIO_USE_OIIO_FOR_APPS=OFF``, these will be built using OpenEXR rather than OpenImageIO
 and therefore you will be limited to using OpenEXR files with these tools rather than the

From 89776745d33c4abe57be7f41008b5ea40f4f8504 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 3 Mar 2023 11:41:37 -0500
Subject: [PATCH 66/81] Documentations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                    | 4 +++-
 docs/quick_start/installation.rst | 3 +++
 src/OpenColorIO/SSE.h             | 4 ++--
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index df071b999d..aaae15a0e6 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -173,9 +173,11 @@ endif()
 
 ###############################################################################
 # Optimization / internal linking preferences
+# TODO Remove OCIO_USE_SSE once it is fully deprecated.
 option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON)
+# TODO Remove mark_as_advanced once OCIO_USE_SSE is fully deprecated.
 mark_as_advanced(OCIO_USE_SSE)
-option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations (Will replace OCIO_USE_SSE)" ${OCIO_USE_SSE})
+option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ${OCIO_USE_SSE})
 option(OCIO_USE_OIIO_CMAKE_CONFIG "Specify whether to look for OIIO using the generated CMake Config script instead of the custom FindOpenImageIO.cmake script" OFF)
 option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." OFF)
 
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index ab5dc74c0e..dfe55cdd8b 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -290,6 +290,9 @@ On the newest Apple chipset (M1+), a universal build can be done with the follow
 
 - ``-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"``
 
+Please note that OCIO dependencies must be built as universal libraries as well. If you can't build all the 
+dependencies as universal libraries, you can set ``OCIO_INSTAL_EXT_PACKAGES=ALL`` and OCIO will handle it.
+
 Several command-line tools (such as ``ocioconvert``) require reading or writing image files.
 If ``OCIO_USE_OIIO_FOR_APPS=OFF``, these will be built using OpenEXR rather than OpenImageIO
 and therefore you will be limited to using OpenEXR files with these tools rather than the
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 95d7b88d49..81bd15c0dc 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -8,7 +8,7 @@
 
 #if defined(USE_SSE) || defined(USE_SSE2_WITH_SSE2NEON)
 
-// If it is not arm64, same behavior as before.
+// Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM).
 #if !defined(__aarch64__)
     #if defined(USE_SSE)
         #include <emmintrin.h>
@@ -110,7 +110,7 @@ inline __m128 isNegativeSpecial(const __m128 x)
 // Bit-wise select function in SSE version 2
 //
 // Return the parameter arg_false bit where the parameter mask is 0x0,
-// return the parameter arg_true bit where the mask is 0x1.
+// return the parameter arg_true bit where the mask is 1.
 //
 // Algorithm Explanation:
 //

From 7bde97e608ea744ec1c10f3cba853157a59e5c44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Mon, 6 Mar 2023 15:16:29 -0500
Subject: [PATCH 67/81] Universal build is the default for APPLE documentation
 tweak
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                    | 10 +++++++++-
 docs/quick_start/installation.rst | 15 +++++++++------
 src/OpenColorIO/SSE.h             |  2 +-
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index aaae15a0e6..f3b7d7d31a 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,6 +25,14 @@ if(APPLE AND NOT DEFINED CMAKE_OSX_DEPLOYMENT_TARGET)
 endif()
 
 
+###############################################################################
+# Universal library is the default for ARM architecture under MacOS.
+
+if(APPLE AND (NOT DEFINED CMAKE_OSX_ARCHITECTURES OR CMAKE_OSX_ARCHITECTURES STREQUAL ""))
+    set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "Default OS X architectures" FORCE)
+endif()
+
+
 ###############################################################################
 # Project definition.
 
@@ -174,7 +182,7 @@ endif()
 ###############################################################################
 # Optimization / internal linking preferences
 # TODO Remove OCIO_USE_SSE once it is fully deprecated.
-option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON)
+option(OCIO_USE_SSE "Specify whether to enable SSE (supplanted by OCIO_USE_SIMD)" ON)
 # TODO Remove mark_as_advanced once OCIO_USE_SSE is fully deprecated.
 mark_as_advanced(OCIO_USE_SSE)
 option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ${OCIO_USE_SSE})
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index dfe55cdd8b..fc58c5ec6f 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -286,12 +286,15 @@ Here are the most common OCIO-specific CMake options (the default values are sho
 - ``-DOCIO_BUILD_DOCS=OFF`` (Set to ON to build the documentation)
 - ``-DOCIO_BUILD_FROZEN_DOCS=OFF`` (Set to ON to update the Python documentation)
 
-On the newest Apple chipset (M1+), a universal build can be done with the following option:
-
-- ``-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"``
-
-Please note that OCIO dependencies must be built as universal libraries as well. If you can't build all the 
-dependencies as universal libraries, you can set ``OCIO_INSTAL_EXT_PACKAGES=ALL`` and OCIO will handle it.
+On the MacOS under the ARM architecture, the default is to make a universal build 
+(natively supporting both the Intel and ARM processors). The ``-DCMAKE_OSX_ARCHITECTURES`` option 
+may be set to just arm64 or x86_64 to override the default value, which is ``arm64;x86_64``.
+
+When doing a universal build, note that the OCIO dependencies must be built as universal libraries 
+too. If you are running in OCIO_INSTALL_EXT_PACKAGES=MISSING or NONE mode, your build will fail if 
+any of your installed libraries are not universal. The easiest way to address this is to set 
+OCIO_INSTALL_EXT_PACKAGES=ALL in order to let OCIO build everything. Alternatively, you may set 
+CMAKE_OSX_ARCHITECTURES to just the platform you are targeting.
 
 Several command-line tools (such as ``ocioconvert``) require reading or writing image files.
 If ``OCIO_USE_OIIO_FOR_APPS=OFF``, these will be built using OpenEXR rather than OpenImageIO
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 81bd15c0dc..1aa853997e 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -33,7 +33,7 @@ namespace OCIO_NAMESPACE
 #if defined(__aarch64__)
     #if defined(USE_SSE2_WITH_SSE2NEON)
         // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to 
-        // NaN handling.
+        // NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior.
 
         // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were 
         // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the 

From 418bd9076ffa539d6e7b2a57fd8f5d8e549d9490 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Tue, 7 Mar 2023 09:59:05 -0500
Subject: [PATCH 68/81] documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f3b7d7d31a..687c0e4883 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,7 +26,7 @@ endif()
 
 
 ###############################################################################
-# Universal library is the default for ARM architecture under MacOS.
+# By default, build the library, tests, tools, and Python binding as universal binaries for macOS.
 
 if(APPLE AND (NOT DEFINED CMAKE_OSX_ARCHITECTURES OR CMAKE_OSX_ARCHITECTURES STREQUAL ""))
     set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "Default OS X architectures" FORCE)

From dd15706cfea2d3087f64fff752daefc4beacd82b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Mon, 13 Mar 2023 10:49:01 -0400
Subject: [PATCH 69/81] Fixing issue for the static build test in CI workflow
 Removing Findsse2neon as it is not needed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                                | 26 +++++++--
 share/cmake/modules/Findsse2neon.cmake        | 58 -------------------
 .../modules/install/Installsse2neon.cmake     | 15 +++--
 share/cmake/utils/CheckSupportSSE2.cmake      |  8 +--
 src/OpenColorIO/CMakeLists.txt                |  5 +-
 tests/cpu/CMakeLists.txt                      |  5 +-
 6 files changed, 39 insertions(+), 78 deletions(-)
 delete mode 100644 share/cmake/modules/Findsse2neon.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 687c0e4883..e972eb8023 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,11 +211,29 @@ if(HAVE_NEON AND OCIO_USE_SIMD)
     # Install sse2neon. Please note that sse2neon is downloaded during the configure step as it is
     # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake.
 
-    # Could use ocio_handle_dependency once this is rebased from main (once the CMake min and 
-    # recommended version branch is merged).
-    find_package(sse2neon QUIET)
-    if (NOT sse2neon_FOUND)
+    # Sse2neon is not treated like an imported target. The logic to find sse2neon is here because 
+    # a find module is not suitable for sse2neon's use case.
+    find_path(sse2neon_INCLUDE_DIR
+        NAMES
+            sse2neon.h
+        HINTS
+            ${sse2neon_ROOT}
+        PATH_SUFFIXES
+            sse2neon
+            include
+            sse2neon/include
+    )
+
+    if (NOT sse2neon_INCLUDE_DIR)
         include(Installsse2neon)
+    else()
+        # Any changes to the following lines must be replicated in Installsse2neon.cmake as well.
+        # Create a target for sse2neon (non-imported)
+        add_library(sse2neon INTERFACE)
+        # Add the include directories to the target.
+        target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}")
+        # Ignore the warnings coming from sse2neon.h as they are false positives.
+        target_compile_options(sse2neon INTERFACE -Wno-unused-parameter)
     endif()
 endif()
 
diff --git a/share/cmake/modules/Findsse2neon.cmake b/share/cmake/modules/Findsse2neon.cmake
deleted file mode 100644
index 7dea6740cb..0000000000
--- a/share/cmake/modules/Findsse2neon.cmake
+++ /dev/null
@@ -1,58 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright Contributors to the OpenColorIO Project.
-#
-# Locate sse2neon (header-only version)
-#
-# Variables defined by this module:
-#   sse2neon_FOUND          - Indicate whether the library was found or not
-#   sse2neon_INCLUDE_DIR    - Location of the header files
-#
-# Global targets defined by this module:
-#   sse2neon
-###############################################################################
-### Try to find package ###
-
-if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
-    # Find include directory
-    find_path(sse2neon_INCLUDE_DIR
-        NAMES
-            sse2neon.h
-        HINTS
-            ${sse2neon_ROOT}
-        PATH_SUFFIXES
-            include
-            sse2neon/include
-    )
-
-    # Override REQUIRED if package can be installed
-    if(OCIO_INSTALL_EXT_PACKAGES STREQUAL MISSING)
-        set(sse2neon_FIND_REQUIRED FALSE)
-    endif()
-
-    include(FindPackageHandleStandardArgs)
-    find_package_handle_standard_args(sse2neon
-        REQUIRED_VARS 
-            sse2neon_INCLUDE_DIR 
-    )
-    set(sse2neon_FOUND ${sse2neon_FOUND})
-endif()
-
-###############################################################################
-### Configure target ###
-
-if(sse2neon_FOUND AND NOT TARGET sse2neon)
-    # INTERFACE type since we know that this is a header-only library.
-    add_library(sse2neon INTERFACE IMPORTED GLOBAL)
-    set(_sse2neon_TARGET_CREATE TRUE)
-endif()
-
-###############################################################################
-### Configure target ###
-
-if(_sse2neon_TARGET_CREATE)
-    set_target_properties(sse2neon PROPERTIES
-        INTERFACE_INCLUDE_DIRECTORIES ${sse2neon_INCLUDE_DIR}
-    )
-
-    mark_as_advanced(sse2neon_INCLUDE_DIR)
-endif()
\ No newline at end of file
diff --git a/share/cmake/modules/install/Installsse2neon.cmake b/share/cmake/modules/install/Installsse2neon.cmake
index 16b47798cd..5f0f810ca1 100644
--- a/share/cmake/modules/install/Installsse2neon.cmake
+++ b/share/cmake/modules/install/Installsse2neon.cmake
@@ -29,8 +29,15 @@ if(NOT sse2neon_POPULATED)
   set(_EXT_DIST_INCLUDE "${CMAKE_BINARY_DIR}/ext/dist/${CMAKE_INSTALL_INCLUDEDIR}")
   file(COPY "${sse2neon_SOURCE_DIR}/sse2neon.h" DESTINATION "${_EXT_DIST_INCLUDE}/sse2neon")
 
-  add_library(sse2neon INTERFACE IMPORTED GLOBAL)
-  set_target_properties(sse2neon PROPERTIES
-      INTERFACE_INCLUDE_DIRECTORIES "${_EXT_DIST_INCLUDE}/sse2neon"
-  )
+  # sse2neon_INCLUDE_DIR is used internally for CheckSupportSSE2.cmake and to create sse2neon
+  # target for OCIO.
+  set(sse2neon_INCLUDE_DIR "${sse2neon_SOURCE_DIR}")
+
+  # Any changes to the following lines must be replicated in ./CMakeLists.txt as well.
+  # Create a target for sse2neon (non-imported)
+  add_library(sse2neon INTERFACE)
+  # Add the include directories to the target.
+  target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}")
+  # Ignore the warnings coming from sse2neon.h as they are false positives.
+  target_compile_options(sse2neon INTERFACE -Wno-unused-parameter)
 endif()
\ No newline at end of file
diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index 26bc0b396d..07fecbd7a5 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -4,7 +4,7 @@
 include(CheckCXXSourceCompiles)
 
 set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}")
-set(_cmake_required_libraries_orig "${CMAKE_REQUIRED_LIBRARIES}")
+set(_cmake_required_includes_orig "${CMAKE_REQUIRED_INCLUDES}")
 set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
 
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
@@ -49,7 +49,7 @@ elseif(APPLE AND HAVE_NEON)
         set (CMAKE_OSX_ARCHITECTURES "${current_arch}")
         
         if(current_arch STREQUAL arm64)
-            set(CMAKE_REQUIRED_LIBRARIES sse2neon)
+            set(CMAKE_REQUIRED_INCLUDES ${sse2neon_INCLUDE_DIR})
             set(_sse2_header_ "#include <sse2neon.h>")
             set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON")
         elseif(current_arch STREQUAL x86_64)
@@ -63,11 +63,11 @@ elseif(APPLE AND HAVE_NEON)
 endif()
 
 set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}")
-set(CMAKE_REQUIRED_LIBRARIES "${_cmake_required_libraries_orig}")
+set(CMAKE_REQUIRED_INCLUDES "${_cmake_required_includes_orig}")
 set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}")
 
 unset(_cmake_required_flags_orig)
-unset(_cmake_required_libraries_orig)
+unset(_cmake_required_includes_orig)
 unset(_cmake_osx_architectures_orig)
 
 
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 791ade3f91..608cbac63a 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -293,10 +293,7 @@ target_link_libraries(OpenColorIO
 )
 
 if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON)
-    target_link_libraries(OpenColorIO
-        PRIVATE
-            sse2neon
-    )
+    target_link_libraries(OpenColorIO PRIVATE $<BUILD_INTERFACE:sse2neon>)
 endif()
 
 if(APPLE)
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index 245caeb9cb..edbb21ad2c 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -27,10 +27,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES)
     )
 
     if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON)
-        target_link_libraries(${TEST_BINARY}
-            PRIVATE
-                sse2neon
-        )
+        target_link_libraries(${TEST_BINARY} PRIVATE sse2neon)
     endif()
 
     if(APPLE)

From 3d8e35018f8c6417167d20d41b147aa170a92e75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Wed, 1 Feb 2023 09:34:47 -0500
Subject: [PATCH 70/81] Adding support for sse2neon
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 ext/sse2neon/CMakeLists.txt           |    9 +
 ext/sse2neon/src/include/sse2neon.h   | 9079 +++++++++++++++++++++++++
 share/cmake/utils/CompilerFlags.cmake |   14 +
 3 files changed, 9102 insertions(+)
 create mode 100644 ext/sse2neon/CMakeLists.txt
 create mode 100644 ext/sse2neon/src/include/sse2neon.h

diff --git a/ext/sse2neon/CMakeLists.txt b/ext/sse2neon/CMakeLists.txt
new file mode 100644
index 0000000000..7fd5fcb302
--- /dev/null
+++ b/ext/sse2neon/CMakeLists.txt
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+# sse2neon (modified)
+# https://github.com/DLTcollab/sse2neon
+add_library(sse2neon INTERFACE IMPORTED GLOBAL)
+set_target_properties(sse2neon PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/src/include"
+)
diff --git a/ext/sse2neon/src/include/sse2neon.h b/ext/sse2neon/src/include/sse2neon.h
new file mode 100644
index 0000000000..164c6c3387
--- /dev/null
+++ b/ext/sse2neon/src/include/sse2neon.h
@@ -0,0 +1,9079 @@
+#ifndef SSE2NEON_H
+#define SSE2NEON_H
+
+// This header file provides a simple API translation layer
+// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
+//
+// Contributors to this work are:
+//   John W. Ratcliff <jratcliffscarab@gmail.com>
+//   Brandon Rowlett <browlett@nvidia.com>
+//   Ken Fast <kfast@gdeb.com>
+//   Eric van Beurden <evanbeurden@nvidia.com>
+//   Alexander Potylitsin <apotylitsin@nvidia.com>
+//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
+//   Jim Huang <jserv@ccns.ncku.edu.tw>
+//   Mark Cheng <marktwtn@gmail.com>
+//   Malcolm James MacLeod <malcolm@gulden.com>
+//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
+//   Sebastian Pop <spop@amazon.com>
+//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
+//   Danila Kutenin <danilak@google.com>
+//   François Turban (JishinMaster) <francois.turban@gmail.com>
+//   Pei-Hsuan Hung <afcidk@gmail.com>
+//   Yang-Hao Yuan <yuanyanghau@gmail.com>
+//   Syoyo Fujita <syoyo@lighttransport.com>
+//   Brecht Van Lommel <brecht@blender.org>
+//   Jonathan Hue <jhue@adobe.com>
+//   Cuda Chen <clh960524@gmail.com>
+//   Aymen Qader <aymen.qader@arm.com>
+
+/*
+ * sse2neon is freely redistributable under the MIT License.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Tunable configurations */
+
+/* Enable precise implementation of math operations
+ * This would slow down the computation a bit, but gives consistent result with
+ * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
+ */
+/* _mm_min|max_ps|ss|pd|sd */
+#ifndef SSE2NEON_PRECISE_MINMAX
+#define SSE2NEON_PRECISE_MINMAX (0)
+#endif
+/* _mm_rcp_ps and _mm_div_ps */
+#ifndef SSE2NEON_PRECISE_DIV
+#define SSE2NEON_PRECISE_DIV (0)
+#endif
+/* _mm_sqrt_ps and _mm_rsqrt_ps */
+#ifndef SSE2NEON_PRECISE_SQRT
+#define SSE2NEON_PRECISE_SQRT (0)
+#endif
+/* _mm_dp_pd */
+#ifndef SSE2NEON_PRECISE_DP
+#define SSE2NEON_PRECISE_DP (0)
+#endif
+
+/* compiler specific definitions */
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
+#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
+#else /* non-GNU / non-clang compilers */
+#warning "Macro name collisions may happen with unsupported compiler."
+#ifndef FORCE_INLINE
+#define FORCE_INLINE static inline
+#endif
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#define _sse2neon_likely(x) (x)
+#define _sse2neon_unlikely(x) (x)
+#endif
+
+/* C language does not allow initializing a variable with a function call. */
+#ifdef __cplusplus
+#define _sse2neon_const static const
+#else
+#define _sse2neon_const const
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#if defined(_WIN32)
+/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
+ * from both MinGW-w64 and MSVC.
+ */
+#define SSE2NEON_ALLOC_DEFINED
+#endif
+
+/* If using MSVC */
+#ifdef _MSC_VER
+#include <intrin.h>
+#if (defined(_M_AMD64) || defined(__x86_64__)) || \
+    (defined(_M_ARM) || defined(__arm__))
+#define SSE2NEON_HAS_BITSCAN64
+#endif
+#endif
+
+/* Compiler barrier */
+#define SSE2NEON_BARRIER()                     \
+    do {                                       \
+        __asm__ __volatile__("" ::: "memory"); \
+        (void) 0;                              \
+    } while (0)
+
+/* Memory barriers
+ * __atomic_thread_fence does not include a compiler barrier; instead,
+ * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
+ * semantics.
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+#include <stdatomic.h>
+#endif
+
+FORCE_INLINE void _sse2neon_smp_mb(void)
+{
+    SSE2NEON_BARRIER();
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
+    !defined(__STDC_NO_ATOMICS__)
+    atomic_thread_fence(memory_order_seq_cst);
+#elif defined(__GNUC__) || defined(__clang__)
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+#else
+    /* FIXME: MSVC support */
+#endif
+}
+
+/* Architecture-specific build options */
+/* FIXME: #pragma GCC push_options is only available on GCC */
+#if defined(__GNUC__)
+#if defined(__arm__) && __ARM_ARCH == 7
+/* According to ARM C Language Extensions Architecture specification,
+ * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
+ * architecture supported.
+ */
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("fpu=neon")
+#endif
+#elif defined(__aarch64__)
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("+simd")
+#endif
+#elif __ARM_ARCH == 8
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error \
+    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#endif
+#else
+#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#endif
+#endif
+
+#include <arm_neon.h>
+#if !defined(__aarch64__) && (__ARM_ARCH == 8)
+#if defined __has_include && __has_include(<arm_acle.h>)
+#include <arm_acle.h>
+#endif
+#endif
+
+/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD
+ * and other Arm microarchtectures use.
+ * From sysctl -a on Apple M1:
+ * hw.cachelinesize: 128
+ */
+#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
+#define SSE2NEON_CACHELINE_SIZE 128
+#else
+#define SSE2NEON_CACHELINE_SIZE 64
+#endif
+
+/* Rounding functions require either Aarch64 instructions or libm failback */
+#if !defined(__aarch64__)
+#include <math.h>
+#endif
+
+/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
+ * or even not accessible in user mode.
+ * To write or access to these registers in user mode,
+ * we have to perform syscall instead.
+ */
+#if !defined(__aarch64__)
+#include <sys/time.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if defined(__GNUC__) && (__GNUC__ <= 9)
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
+
+// __builtin_shuffle introduced in GCC 4.7.0
+#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
+#define HAS__builtin_shuffle 1
+#else
+#define HAS__builtin_shuffle 0
+#endif
+
+#define HAS__builtin_shufflevector 0
+#define HAS__builtin_nontemporal_store 0
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+#if __has_builtin(__builtin_shufflevector)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __builtin_shufflevector(a, b, __VA_ARGS__)
+#elif __has_builtin(__builtin_shuffle)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __extension__({                        \
+        type tmp = {__VA_ARGS__};          \
+        __builtin_shuffle(a, b, tmp);      \
+    })
+#endif
+
+#ifdef _sse2neon_shuffle
+#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
+#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
+#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
+#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
+#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
+#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
+#endif
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+#define _MM_FROUND_NO_EXC 0x08
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+#define _MM_ROUND_NEAREST 0x0000
+#define _MM_ROUND_DOWN 0x2000
+#define _MM_ROUND_UP 0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+/* Flush zero mode macros. */
+#define _MM_FLUSH_ZERO_MASK 0x8000
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_FLUSH_ZERO_OFF 0x0000
+/* Denormals are zeros mode macros. */
+#define _MM_DENORMALS_ZERO_MASK 0x0040
+#define _MM_DENORMALS_ZERO_ON 0x0040
+#define _MM_DENORMALS_ZERO_OFF 0x0000
+
+/* indicate immediate constant argument in a given range */
+#define __constrange(a, b) const
+
+/* A few intrinsics accept traditional data types like ints or floats, but
+ * most operate on data types that are specific to SSE.
+ * If a vector type ends in d, it contains doubles, and if it does not have
+ * a suffix, it contains floats. An integer vector type can contain any type
+ * of integer, from chars to shorts to unsigned long longs.
+ */
+typedef int64x1_t __m64;
+typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
+// On ARM 32-bit architecture, the float64x2_t is not supported.
+// The data type __m128d should be represented in a different way for related
+// intrinsic conversion.
+#if defined(__aarch64__)
+typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
+#else
+typedef float32x4_t __m128d;
+#endif
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
+
+// __int64 is defined in the Intrinsics Guide which maps to different datatype
+// in different data model
+#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
+#if (defined(__x86_64__) || defined(__i386__))
+#define __int64 long long
+#else
+#define __int64 int64_t
+#endif
+#endif
+
+/* type-safe casting between types */
+
+#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
+#define vreinterpretq_m128_f32(x) (x)
+#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
+
+#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
+#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
+#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
+#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
+#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
+#define vreinterpretq_f32_m128(x) (x)
+#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
+
+#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
+#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
+#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
+#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
+#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
+#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_m128i_s64(x) (x)
+
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
+
+#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
+#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
+#define vreinterpretq_s64_m128i(x) (x)
+
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
+
+#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
+#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
+#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
+#define vreinterpret_m64_s64(x) (x)
+
+#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
+#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
+#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
+#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
+
+#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
+#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
+#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
+
+#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
+#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
+#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
+#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
+
+#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
+#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
+#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
+#define vreinterpret_s64_m64(x) (x)
+
+#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
+
+#if defined(__aarch64__)
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
+
+#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
+#define vreinterpretq_m128d_f64(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
+
+#define vreinterpretq_f64_m128d(x) (x)
+#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
+#else
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128d_f32(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_f32_m128d(x) (x)
+#endif
+
+// A struct is defined in this header file called 'SIMDVec' which can be used
+// by applications which attempt to access the contents of an __m128 struct
+// directly.  It is important to note that accessing the __m128 struct directly
+// is bad coding practice by Microsoft: @see:
+// https://learn.microsoft.com/en-us/cpp/cpp/m128
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly so the developer can use the SIMDVec as an alias for it.  Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base NEON data type for intrinsic operations.
+//
+// union intended to allow direct access to an __m128 variable using the names
+// that the MSVC compiler provides.  This union should really only be used when
+// trying to access the members of the vector as integer values.  GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause
+// a performance hit.  If it really is needed however, the original __m128
+// variable can be aliased with a pointer to this union and used to access
+// individual components.  The use of this union should be hidden behind a macro
+// that is used throughout the codebase to access the members instead of always
+// declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
+    int8_t m128_i8[16];    // as signed 8-bit integers.
+    int16_t m128_i16[8];   // as signed 16-bit integers.
+    int32_t m128_i32[4];   // as signed 32-bit integers.
+    int64_t m128_i64[2];   // as signed 64-bit integers.
+    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
+    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
+    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
+    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
+
+/* SSE macros */
+#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
+#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
+#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
+#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
+
+// Function declaration
+// SSE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
+FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
+FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
+FORCE_INLINE __m128 _mm_set_ps1(float);
+FORCE_INLINE __m128 _mm_setzero_ps(void);
+// SSE2
+FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_castps_si128(__m128);
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
+FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
+FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
+FORCE_INLINE __m128d _mm_set_pd(double, double);
+FORCE_INLINE __m128i _mm_set1_epi32(int);
+FORCE_INLINE __m128i _mm_setzero_si128();
+// SSE4.1
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
+FORCE_INLINE __m128 _mm_ceil_ps(__m128);
+FORCE_INLINE __m128d _mm_floor_pd(__m128d);
+FORCE_INLINE __m128 _mm_floor_ps(__m128);
+FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
+FORCE_INLINE __m128 _mm_round_ps(__m128, int);
+// SSE4.2
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
+
+/* Backwards compatibility for compilers with lack of specific type support */
+
+// Older gcc does not define vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__) &&                        \
+    ((__GNUC__ <= 12 && defined(__arm__)) ||                           \
+     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
+     (__GNUC__ <= 9 && defined(__aarch64__)))
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    uint8x16x4_t ret;
+    ret.val[0] = vld1q_u8(p + 0);
+    ret.val[1] = vld1q_u8(p + 16);
+    ret.val[2] = vld1q_u8(p + 32);
+    ret.val[3] = vld1q_u8(p + 48);
+    return ret;
+}
+#else
+// Wraps vld1q_u8_x4
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+    return vld1q_u8_x4(p);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddv u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
+    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
+}
+#else
+// Wraps vaddv_u8
+FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
+{
+    return vaddv_u8(v8);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddvq u8 variant */
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+    uint8_t res = 0;
+    for (int i = 0; i < 8; ++i)
+        res += tmp[i];
+    return res;
+}
+#else
+// Wraps vaddvq_u8
+FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
+{
+    return vaddvq_u8(a);
+}
+#endif
+
+#if !defined(__aarch64__)
+/* emulate vaddvq u16 variant */
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    uint32x4_t m = vpaddlq_u16(a);
+    uint64x2_t n = vpaddlq_u32(m);
+    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
+
+    return vget_lane_u32((uint32x2_t) o, 0);
+}
+#else
+// Wraps vaddvq_u16
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    return vaddvq_u16(a);
+}
+#endif
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<data_type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <data_type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <data_type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors cantain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *                            signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *                            unsigned integers
+ * + si128 - unspecified 128-bit vector or 256-bit vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *                      than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ *   // Set packed 8-bit integers
+ *   // 128 bits, 16 chars, per 8 bits
+ *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
+ *   // Shuffle packed 8-bit integers
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ */
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint {
+    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
+    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
+    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
+    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
+};
+
+// The bit field mapping to the FPCR(floating-point control register)
+typedef struct {
+    uint16_t res0;
+    uint8_t res1 : 6;
+    uint8_t bit22 : 1;
+    uint8_t bit23 : 1;
+    uint8_t bit24 : 1;
+    uint8_t res2 : 7;
+#if defined(__aarch64__)
+    uint32_t res3;
+#endif
+} fpcr_bitfield;
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of b and places it into the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in high
+// end of result takes the higher two 32 bit values from b and swaps them and
+// places in low end of result.
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
+{
+    float32x2_t a21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+{
+    float32x2_t a03 = vget_low_f32(
+        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b21 = vget_high_f32(
+        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
+}
+
+// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
+// high
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
+{
+    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
+{
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
+{
+    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32x2_t a22 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
+{
+    float32x2_t a33 =
+        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
+{
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(b, 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
+}
+
+// Kahan summation for accurate summation of floating-point numbers.
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
+FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
+{
+    y -= *c;
+    float t = *sum + y;
+    *c = (t - *sum) - y;
+    *sum = t;
+}
+
+#if defined(__ARM_FEATURE_CRYPTO) && \
+    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
+// Wraps vmull_p64
+FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
+    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
+    return vreinterpretq_u64_p128(vmull_p64(a, b));
+}
+#else  // ARMv7 polyfill
+// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
+//
+// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
+// 64-bit->128-bit polynomial multiply.
+//
+// It needs some work and is somewhat slow, but it is still faster than all
+// known scalar methods.
+//
+// Algorithm adapted to C from
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
+// from "Fast Software Polynomial Multiplication on ARM Processors Using the
+// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+// (https://hal.inria.fr/hal-01506572)
+static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly8x8_t a = vreinterpret_p8_u64(_a);
+    poly8x8_t b = vreinterpret_p8_u64(_b);
+
+    // Masks
+    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
+                                    vcreate_u8(0x00000000ffffffff));
+    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
+                                    vcreate_u8(0x0000000000000000));
+
+    // Do the multiplies, rotating with vext to get all combinations
+    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
+    uint8x16_t e =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
+    uint8x16_t f =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
+    uint8x16_t g =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
+    uint8x16_t h =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
+    uint8x16_t i =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
+    uint8x16_t j =
+        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
+    uint8x16_t k =
+        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
+
+    // Add cross products
+    uint8x16_t l = veorq_u8(e, f);  // L = E + F
+    uint8x16_t m = veorq_u8(g, h);  // M = G + H
+    uint8x16_t n = veorq_u8(i, j);  // N = I + J
+
+    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
+    // instructions.
+#if defined(__aarch64__)
+    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
+        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
+        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+#else
+    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
+    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
+    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
+    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
+#endif
+    // t0 = (L) (P0 + P1) << 8
+    // t1 = (M) (P2 + P3) << 16
+    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
+    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
+    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
+
+    // t2 = (N) (P4 + P5) << 24
+    // t3 = (K) (P6 + P7) << 32
+    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
+    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
+    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
+
+    // De-interleave
+#if defined(__aarch64__)
+    uint8x16_t t0 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t1 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t2 = vreinterpretq_u8_u64(
+        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+    uint8x16_t t3 = vreinterpretq_u8_u64(
+        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+#else
+    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
+    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
+    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
+    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
+#endif
+    // Shift the cross products
+    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
+    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
+    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
+    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
+
+    // Accumulate the products
+    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
+    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
+    uint8x16_t mix = veorq_u8(d, cross1);
+    uint8x16_t r = veorq_u8(mix, cross2);
+    return vreinterpretq_u64_u8(r);
+}
+#endif  // ARMv7 polyfill
+
+// C equivalent:
+//   __m128i _mm_shuffle_epi32_default(__m128i a,
+//                                     __constrange(0, 255) int imm) {
+//       __m128i ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+#define _mm_shuffle_epi32_default(a, imm)                                   \
+    __extension__({                                                         \
+        int32x4_t ret;                                                      \
+        ret = vmovq_n_s32(                                                  \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                        \
+        vreinterpretq_m128i_s32(ret);                                       \
+    })
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of a and places it into the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+}
+
+// takes the lower two 32-bit values from a and swaps them and places in low end
+// of result takes the higher two 32 bit values from a and swaps them and places
+// in high end of result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+}
+
+// rotates the least significant 32 bits into the most significant 32 bits, and
+// shifts the rest down
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
+
+// rotates the most significant 32 bits into the least significant 32 bits, and
+// shifts the rest up
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
+
+// gets the lower 64 bits of a, and places it in the upper 64 bits
+// gets the lower 64 bits of a and places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+{
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
+// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+}
+
+// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
+// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
+// places it in the lower 64 bits
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+{
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
+{
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+}
+
+// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
+// int imm)
+#if defined(__aarch64__)
+#define _mm_shuffle_epi32_splat(a, imm)                          \
+    __extension__({                                              \
+        vreinterpretq_m128i_s32(                                 \
+            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
+    })
+#else
+#define _mm_shuffle_epi32_splat(a, imm)                                      \
+    __extension__({                                                          \
+        vreinterpretq_m128i_s32(                                             \
+            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
+    })
+#endif
+
+// NEON does not support a general purpose permute intrinsic.
+// Shuffle single-precision (32-bit) floating-point elements in a using the
+// control in imm8, and store the results in dst.
+//
+// C equivalent:
+//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+//                                 __constrange(0, 255) int imm) {
+//       __m128 ret;
+//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+//
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
+#define _mm_shuffle_ps_default(a, b, imm)                                  \
+    __extension__({                                                        \
+        float32x4_t ret;                                                   \
+        ret = vmovq_n_f32(                                                 \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                       \
+        vreinterpretq_m128_f32(ret);                                       \
+    })
+
+// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
+// Store the results in the low 64 bits of dst, with the high 64 bits being
+// copied from from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
+#define _mm_shufflelo_epi16_function(a, imm)                                  \
+    __extension__({                                                           \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
+        int16x4_t lowBits = vget_low_s16(ret);                                \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3);                                              \
+        vreinterpretq_m128i_s16(ret);                                         \
+    })
+
+// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
+// Store the results in the high 64 bits of dst, with the low 64 bits being
+// copied from from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
+#define _mm_shufflehi_epi16_function(a, imm)                                   \
+    __extension__({                                                            \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
+        int16x4_t highBits = vget_high_s16(ret);                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+                             5);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+                             6);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+                             7);                                               \
+        vreinterpretq_m128i_s16(ret);                                          \
+    })
+
+/* MMX */
+
+//_mm_empty is a no-op on arm
+FORCE_INLINE void _mm_empty(void) {}
+
+/* SSE */
+
+// Add packed single-precision (32-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Add the lower single-precision (32-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of <a>.
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
+}
+
+// Compute the bitwise AND of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vbicq_s32(vreinterpretq_s32_m128(b),
+                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(
+        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
+FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for greater-than-or-equal, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
+FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmple_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
+FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
+FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
+FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
+FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-greater-than, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
+FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
+FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-less-than-or-equal, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
+FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// for not-less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
+FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(vmvnq_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b for not-less-than, store the result in the lower element of dst, and copy
+// the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
+FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// to see if neither is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
+//
+// See also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
+{
+    // Note: NEON does not have ordered compare builtin
+    // Need to compare a eq a and b eq b to check for NaN
+    // Do AND of results to get final
+    uint32x4_t ceqaa =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t ceqbb =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b to see if neither is NaN, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
+FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b
+// to see if either is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
+FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
+{
+    uint32x4_t f32a =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t f32b =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b to see if either is NaN, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
+FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for equality, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
+FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_eq_b =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_eq_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for greater-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
+FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_ge_b =
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_ge_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for greater-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
+FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_gt_b =
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_gt_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for less-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
+FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_le_b =
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_le_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for less-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_lt_b =
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return vgetq_lane_u32(a_lt_b, 0) & 0x1;
+}
+
+// Compare the lower single-precision (32-bit) floating-point element in a and b
+// for not-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
+FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+{
+    return !_mm_comieq_ss(a, b);
+}
+
+// Convert packed signed 32-bit integers in b to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, and copy the upper 2 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
+FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
+FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
+#else
+    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
+#endif
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
+FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
+FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
+                          0);
+#else
+    float32_t data = vgetq_lane_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
+    return (int32_t) data;
+#endif
+}
+
+// Convert packed 16-bit integers in a to packed single-precision (32-bit)
+// floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
+FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
+}
+
+// Convert packed 32-bit integers in b to packed single-precision (32-bit)
+// floating-point elements, store the results in the lower 2 elements of dst,
+// and copy the upper 2 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
+FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, then convert the packed signed 32-bit integers in b to
+// single-precision (32-bit) floating-point element, and store the results in
+// the upper 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
+FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
+}
+
+// Convert the lower packed 8-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
+FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(
+        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 16-bit integers, and store the results in dst. Note: this intrinsic
+// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
+// 0x7FFFFFFF.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
+FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
+{
+    return vreinterpret_m64_s16(
+        vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
+#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 8-bit integers, and store the results in lower 4 elements of dst.
+// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
+// between 0x7F and 0x7FFFFFFF.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
+FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
+{
+    return vreinterpret_m64_s8(vqmovn_s16(
+        vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
+}
+
+// Convert packed unsigned 16-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
+FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
+}
+
+// Convert the lower packed unsigned 8-bit integers in a to packed
+// single-precision (32-bit) floating-point elements, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
+FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_u32(
+        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
+#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
+
+// Convert the signed 64-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
+FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Copy the lower single-precision (32-bit) floating-point element of a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
+FORCE_INLINE float _mm_cvtss_f32(__m128 a)
+{
+    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
+#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
+FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
+#else
+    float32_t data = vgetq_lane_f32(
+        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
+    return (int64_t) data;
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
+FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
+{
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
+FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
+{
+    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
+#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
+#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
+FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
+{
+    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Divide packed single-precision (32-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
+    return vreinterpretq_m128_f32(
+        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#endif
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+#endif
+}
+
+// Divide the lower single-precision (32-bit) floating-point element in a by the
+// lower single-precision (32-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper 3 packed elements from a to
+// the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
+#define _mm_extract_pi16(a, imm) \
+    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
+
+// Free aligned memory that was allocated with _mm_malloc.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
+#if !defined(SSE2NEON_ALLOC_DEFINED)
+FORCE_INLINE void _mm_free(void *addr)
+{
+    free(addr);
+}
+#endif
+
+// Macro: Get the flush zero bits from the MXCSR control and status register.
+// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
+// _MM_FLUSH_ZERO_OFF
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
+FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
+}
+
+// Macro: Get the rounding mode bits from the MXCSR control and status register.
+// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
+// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    if (r.field.bit22) {
+        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
+    } else {
+        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
+    }
+}
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
+#define _mm_insert_pi16(a, b, imm)                               \
+    __extension__({                                              \
+        vreinterpret_m64_s16(                                    \
+            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
+    })
+
+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
+FORCE_INLINE __m128 _mm_load_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+//
+//   dst[31:0] := MEM[mem_addr+31:mem_addr]
+//   dst[63:32] := MEM[mem_addr+31:mem_addr]
+//   dst[95:64] := MEM[mem_addr+31:mem_addr]
+//   dst[127:96] := MEM[mem_addr+31:mem_addr]
+//
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
+#define _mm_load_ps1 _mm_load1_ps
+
+// Load a single-precision (32-bit) floating-point element from memory into the
+// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
+// aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
+FORCE_INLINE __m128 _mm_load_ss(const float *p)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
+FORCE_INLINE __m128 _mm_load1_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
+}
+
+// Load 2 single-precision (32-bit) floating-point elements from memory into the
+// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
+// mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
+FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
+}
+
+// Load 2 single-precision (32-bit) floating-point elements from memory into the
+// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
+// mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
+FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
+}
+
+// Load 4 single-precision (32-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
+FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
+{
+    float32x4_t v = vrev64q_f32(vld1q_f32(p));
+    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
+}
+
+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from memory into dst. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
+FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
+{
+    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
+    // equivalent for neon
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load unaligned 16-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
+FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
+{
+    return vreinterpretq_m128i_s16(
+        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
+}
+
+// Load unaligned 64-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
+FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+}
+
+// Allocate size bytes of memory, aligned to the alignment specified in align,
+// and return a pointer to the allocated memory. _mm_free should be used to free
+// memory that is allocated with _mm_malloc.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
+#if !defined(SSE2NEON_ALLOC_DEFINED)
+FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
+{
+    void *ptr;
+    if (align == 1)
+        return malloc(size);
+    if (align == 2 || (sizeof(void *) == 8 && align == 4))
+        align = sizeof(void *);
+    if (!posix_memalign(&ptr, align, size))
+        return ptr;
+    return NULL;
+}
+#endif
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
+FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
+{
+    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
+    __m128 b = _mm_load_ps((const float *) mem_addr);
+    int8x8_t masked =
+        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
+                vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
+    vst1_s8((int8_t *) mem_addr, masked);
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
+#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
+FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b,
+// and store packed maximum values in dst. dst does not follow the IEEE Standard
+// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
+// signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128_f32(
+        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
+FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b, store the maximum value in the lower element of dst, and copy the upper 3
+// packed elements from a to the upper element of dst. dst does not follow the
+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
+// inputs are NaN or signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
+FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
+{
+    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
+FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed single-precision (32-bit) floating-point elements in a and b,
+// and store packed minimum values in dst. dst does not follow the IEEE Standard
+// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
+// signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
+FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128_f32(
+        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
+FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare the lower single-precision (32-bit) floating-point elements in a and
+// b, store the minimum value in the lower element of dst, and copy the upper 3
+// packed elements from a to the upper element of dst. dst does not follow the
+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
+// inputs are NaN or signed-zero values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
+FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+{
+    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Move the lower single-precision (32-bit) floating-point element from b to the
+// lower element of dst, and copy the upper 3 packed elements from a to the
+// upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
+FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
+                       vreinterpretq_f32_m128(a), 0));
+}
+
+// Move the upper 2 single-precision (32-bit) floating-point elements from b to
+// the lower 2 elements of dst, and copy the upper 2 elements from a to the
+// upper 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
+FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
+{
+#if defined(aarch64__)
+    return vreinterpretq_m128_u64(
+        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
+#else
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
+#endif
+}
+
+// Move the lower 2 single-precision (32-bit) floating-point elements from b to
+// the upper 2 elements of dst, and copy the lower 2 elements from a to the
+// lower 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
+FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
+FORCE_INLINE int _mm_movemask_pi8(__m64 a)
+{
+    uint8x8_t input = vreinterpret_u8_m64(a);
+#if defined(__aarch64__)
+    static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint8x8_t tmp = vshr_n_u8(input, 7);
+    return vaddv_u8(vshl_u8(tmp, shift));
+#else
+    // Refer the implementation of `_mm_movemask_epi8`
+    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
+    uint32x2_t paired16 =
+        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
+    uint8x8_t paired32 =
+        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
+    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
+#endif
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed single-precision (32-bit) floating-point element in a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__)
+    static const int32x4_t shift = {0, 1, 2, 3};
+    uint32x4_t tmp = vshrq_n_u32(input, 31);
+    return vaddvq_u32(vshlq_u32(tmp, shift));
+#else
+    // Uses the exact same method as _mm_movemask_epi8, see that for details.
+    // Shift out everything but the sign bits with a 32-bit unsigned shift
+    // right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired =
+        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
+}
+
+// Multiply packed single-precision (32-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
+FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Multiply the lower single-precision (32-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_mul_ps(a, b));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(vshrn_n_u32(
+        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
+}
+
+// Compute the bitwise OR of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
+FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
+#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
+#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
+#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
+#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
+#define _m_pminsw(a, b) _mm_min_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
+#define _m_pminub(a, b) _mm_min_pu8(a, b)
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
+#define _m_pmovmskb(a) _mm_movemask_pi8(a)
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
+#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
+
+// Fetch the line of data from memory that contains address p to a location in
+// the cache heirarchy specified by the locality hint i.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
+FORCE_INLINE void _mm_prefetch(char const *p, int i)
+{
+    switch (i) {
+    case _MM_HINT_NTA:
+        __builtin_prefetch(p, 0, 0);
+        break;
+    case _MM_HINT_T0:
+        __builtin_prefetch(p, 0, 3);
+        break;
+    case _MM_HINT_T1:
+        __builtin_prefetch(p, 0, 2);
+        break;
+    case _MM_HINT_T2:
+        __builtin_prefetch(p, 0, 1);
+        break;
+    }
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
+#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
+#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
+
+// Compute the approximate reciprocal of packed single-precision (32-bit)
+// floating-point elements in a, and store the results in dst. The maximum
+// relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#endif
+    return vreinterpretq_m128_f32(recip);
+}
+
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
+{
+    return _mm_move_ss(a, _mm_rcp_ps(a));
+}
+
+// Compute the approximate reciprocal square root of packed single-precision
+// (32-bit) floating-point elements in a, and store the results in dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+#if SSE2NEON_PRECISE_SQRT
+    // Additional Netwon-Raphson iteration for accuracy
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#endif
+    return vreinterpretq_m128_f32(out);
+}
+
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
+FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+{
+    uint64x1_t t = vpaddl_u32(vpaddl_u16(
+        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
+    return vreinterpret_m64_u16(
+        vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
+}
+
+// Macro: Set the flush zero bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The flush zero may contain any of the
+// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
+FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
+{
+    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+    // regardless of the value of the FZ bit.
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+// Set packed single-precision (32-bit) floating-point elements in dst with the
+// supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
+FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Broadcast single-precision (32-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
+FORCE_INLINE __m128 _mm_set_ps1(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Macro: Set the rounding mode bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The rounding mode may contain any of
+// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
+// _MM_ROUND_TOWARD_ZERO
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
+FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    switch (rounding) {
+    case _MM_ROUND_TOWARD_ZERO:
+        r.field.bit22 = 1;
+        r.field.bit23 = 1;
+        break;
+    case _MM_ROUND_DOWN:
+        r.field.bit22 = 0;
+        r.field.bit23 = 1;
+        break;
+    case _MM_ROUND_UP:
+        r.field.bit22 = 1;
+        r.field.bit23 = 0;
+        break;
+    default:  //_MM_ROUND_NEAREST
+        r.field.bit22 = 0;
+        r.field.bit23 = 0;
+    }
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+// Copy single-precision (32-bit) floating-point element a to the lower element
+// of dst, and zero the upper 3 elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
+FORCE_INLINE __m128 _mm_set_ss(float a)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
+}
+
+// Broadcast single-precision (32-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
+FORCE_INLINE __m128 _mm_set1_ps(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Set the MXCSR control and status register with the value in unsigned 32-bit
+// integer a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
+// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
+FORCE_INLINE void _mm_setcsr(unsigned int a)
+{
+    _MM_SET_ROUNDING_MODE(a);
+}
+
+// Get the unsigned 32-bit value of the MXCSR control and status register.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
+// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
+FORCE_INLINE unsigned int _mm_getcsr()
+{
+    return _MM_GET_ROUNDING_MODE();
+}
+
+// Set packed single-precision (32-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Return vector of type __m128 with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
+FORCE_INLINE __m128 _mm_setzero_ps(void)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(0));
+}
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_pi16(a, imm)                                           \
+    __extension__({                                                        \
+        vreinterpret_m64_s16(vshuffle_s16(                                 \
+            vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
+            ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)));  \
+    })
+#else
+#define _mm_shuffle_pi16(a, imm)                                               \
+    __extension__({                                                            \
+        int16x4_t ret;                                                         \
+        ret =                                                                  \
+            vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret,   \
+            1);                                                                \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret,   \
+            2);                                                                \
+        ret = vset_lane_s16(                                                   \
+            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret,   \
+            3);                                                                \
+        vreinterpret_m64_s16(ret);                                             \
+    })
+#endif
+
+// Perform a serializing operation on all store-to-memory instructions that were
+// issued prior to this instruction. Guarantees that every store instruction
+// that precedes, in program order, is globally visible before any store
+// instruction which follows the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
+FORCE_INLINE void _mm_sfence(void)
+{
+    _sse2neon_smp_mb();
+}
+
+// Perform a serializing operation on all load-from-memory and store-to-memory
+// instructions that were issued prior to this instruction. Guarantees that
+// every memory access that precedes, in program order, the memory fence
+// instruction is globally visible before any memory instruction which follows
+// the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
+FORCE_INLINE void _mm_mfence(void)
+{
+    _sse2neon_smp_mb();
+}
+
+// Perform a serializing operation on all load-from-memory instructions that
+// were issued prior to this instruction. Guarantees that every load instruction
+// that precedes, in program order, is globally visible before any load
+// instruction which follows the fence in program order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
+FORCE_INLINE void _mm_lfence(void)
+{
+    _sse2neon_smp_mb();
+}
+
+// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
+// int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_ps(a, b, imm)                                              \
+    __extension__({                                                            \
+        float32x4_t _input1 = vreinterpretq_f32_m128(a);                       \
+        float32x4_t _input2 = vreinterpretq_f32_m128(b);                       \
+        float32x4_t _shuf =                                                    \
+            vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+                          (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
+        vreinterpretq_m128_f32(_shuf);                                         \
+    })
+#else  // generic
+#define _mm_shuffle_ps(a, b, imm)                          \
+    __extension__({                                        \
+        __m128 ret;                                        \
+        switch (imm) {                                     \
+        case _MM_SHUFFLE(1, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_1032((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 3, 0, 1):                      \
+            ret = _mm_shuffle_ps_2301((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 3, 2, 1):                      \
+            ret = _mm_shuffle_ps_0321((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 1, 0, 3):                      \
+            ret = _mm_shuffle_ps_2103((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 1, 0):                      \
+            ret = _mm_movelh_ps((a), (b));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_1001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 1, 0, 1):                      \
+            ret = _mm_shuffle_ps_0101((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 1, 0):                      \
+            ret = _mm_shuffle_ps_3210((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 1, 1):                      \
+            ret = _mm_shuffle_ps_0011((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 2, 2):                      \
+            ret = _mm_shuffle_ps_0022((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 2, 0, 0):                      \
+            ret = _mm_shuffle_ps_2200((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 0, 2):                      \
+            ret = _mm_shuffle_ps_3202((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 3, 2):                      \
+            ret = _mm_movehl_ps((b), (a));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 1, 3, 3):                      \
+            ret = _mm_shuffle_ps_1133((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 1, 0):                      \
+            ret = _mm_shuffle_ps_2010((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_2001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_2032((a), (b));           \
+            break;                                         \
+        default:                                           \
+            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
+            break;                                         \
+        }                                                  \
+        ret;                                               \
+    })
+#endif
+
+// Compute the square root of packed single-precision (32-bit) floating-point
+// elements in a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+{
+#if SSE2NEON_PRECISE_SQRT
+    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+    // Test for vrsqrteq_f32(0) -> positive infinity case.
+    // Change to zero, so that s * 1/sqrt(s) result is zero too.
+    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+    const uint32x4_t div_by_zero =
+        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
+    recip = vreinterpretq_f32_u32(
+        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
+
+    // Additional Netwon-Raphson iteration for accuracy
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+
+    // sqrt(s) = s * 1/sqrt(s)
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
+#elif defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#else
+    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+    float32x4_t sq = vrecpeq_f32(recipsq);
+    return vreinterpretq_m128_f32(sq);
+#endif
+}
+
+// Compute the square root of the lower single-precision (32-bit) floating-point
+// element in a, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
+FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    vst1q_f32(p, vdupq_n_f32(a0));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
+#define _mm_store1_ps _mm_store_ps1
+
+// Store the upper 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
+FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_high_f32(a));
+}
+
+// Store the lower 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
+FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_low_f32(a));
+}
+
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
+FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
+{
+    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
+    float32x4_t rev = vextq_f32(tmp, tmp, 2);
+    vst1q_f32(p, rev);
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Stores 16-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
+FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
+{
+    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
+}
+
+// Stores 64-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
+FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
+{
+    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
+}
+
+// Store 64-bits of integer data from a into memory using a non-temporal memory
+// hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
+FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
+{
+    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
+// point elements) from a into memory using a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
+FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
+#else
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+#endif
+}
+
+// Subtract packed single-precision (32-bit) floating-point elements in b from
+// packed single-precision (32-bit) floating-point elements in a, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_sub_ps(a, b));
+}
+
+// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
+// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
+// transposed matrix in these vectors (row0 now contains column 0, etc.).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
+    do {                                                  \
+        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
+        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
+        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
+                            vget_low_f32(ROW23.val[0]));  \
+        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
+                            vget_low_f32(ROW23.val[1]));  \
+        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
+                            vget_high_f32(ROW23.val[0])); \
+        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
+                            vget_high_f32(ROW23.val[1])); \
+    } while (0)
+
+// according to the documentation, these intrinsics behave the same as the
+// non-'u' versions.  We'll just alias them here.
+#define _mm_ucomieq_ss _mm_comieq_ss
+#define _mm_ucomige_ss _mm_comige_ss
+#define _mm_ucomigt_ss _mm_comigt_ss
+#define _mm_ucomile_ss _mm_comile_ss
+#define _mm_ucomilt_ss _mm_comilt_ss
+#define _mm_ucomineq_ss _mm_comineq_ss
+
+// Return vector of type __m128i with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
+FORCE_INLINE __m128i _mm_undefined_si128(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128i a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Return vector of type __m128 with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
+FORCE_INLINE __m128 _mm_undefined_ps(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128 a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Unpack and interleave single-precision (32-bit) floating-point elements from
+// the high half a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
+FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave single-precision (32-bit) floating-point elements from
+// the low half of a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
+FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
+FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_s32(
+        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+/* SSE2 */
+
+// Add packed 16-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Add packed 32-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Add packed 64-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Add packed 8-bit integers in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
+FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Add packed double-precision (64-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
+FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1] + db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add the lower double-precision (64-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper element from
+// a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
+FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_add_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add 64-bit integers a and b, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
+FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Add packed signed 16-bit integers in a and b using saturation, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
+FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Add packed signed 8-bit integers in a and b using saturation, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
+FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Add packed unsigned 16-bit integers in a and b using saturation, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
+FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Add packed unsigned 8-bit integers in a and b using saturation, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
+FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compute the bitwise AND of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
+FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
+FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
+FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
+{
+    // *NOTE* argument swap
+    return vreinterpretq_m128d_s64(
+        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
+}
+
+// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
+// AND with b, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
+FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vbicq_s32(vreinterpretq_s32_m128i(b),
+                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
+FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
+{
+    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
+                                 vreinterpretq_u16_m128i(b));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
+FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
+#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
+
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
+#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
+
+// Cast vector of type __m128d to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
+FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+{
+    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
+FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
+{
+    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
+FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
+{
+    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
+#else
+    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
+#endif
+}
+
+// Cast vector of type __m128i to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Invalidate and flush the cache line that contains p from all levels of the
+// cache hierarchy.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
+#if defined(__APPLE__)
+#include <libkern/OSCacheControl.h>
+#endif
+FORCE_INLINE void _mm_clflush(void const *p)
+{
+    (void) p;
+
+    /* sys_icache_invalidate is supported since macOS 10.5.
+     * However, it does not work on non-jailbroken iOS devices, although the
+     * compilation is successful.
+     */
+#if defined(__APPLE__)
+    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
+#elif defined(__GNUC__) || defined(__clang__)
+    uintptr_t ptr = (uintptr_t) p;
+    __builtin___clear_cache((char *) ptr,
+                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
+#else
+    /* FIXME: MSVC support */
+#endif
+}
+
+// Compare packed 16-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed 8-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
+FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
+FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
+FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than-or-equal, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
+FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed signed 16-bit integers in a and b for greater-than, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
+FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b for greater-than, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
+FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b for greater-than, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
+FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
+FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
+FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
+FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
+FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmple_pd(a, b));
+#else
+    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed signed 16-bit integers in a and b for less-than, and store the
+// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
+// order of the operands switched.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
+FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b for less-than, and store the
+// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
+// order of the operands switched.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
+FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b for less-than, and store the
+// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
+// order of the operands switched.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
+FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
+FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
+FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
+FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
+FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
+FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
+FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
+FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
+FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
+FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-less-than-or-equal, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
+FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-less-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
+FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] =
+        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] =
+        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-less-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
+FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// to see if neither is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
+FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    // Excluding NaNs, any two floating point numbers can be compared.
+    uint64x2_t not_nan_a =
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
+    uint64x2_t not_nan_b =
+        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
+            (*(double *) &b1) == (*(double *) &b1))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b to see if neither is NaN, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
+FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? ~UINT64_C(0)
+               : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// to see if either is NaN, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
+FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    // Two NaNs are not equal in comparison operation.
+    uint64x2_t not_nan_a =
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
+    uint64x2_t not_nan_b =
+        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_s32(
+        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
+            (*(double *) &b1) == (*(double *) &b1))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b to see if either is NaN, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
+FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t d[2];
+    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+            (*(double *) &b0) == (*(double *) &b0))
+               ? UINT64_C(0)
+               : ~UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for greater-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
+FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 >= *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for greater-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
+FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 > *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for less-than-or-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
+FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 <= *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for less-than, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
+FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+    return (*(double *) &a0 < *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for equality, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
+FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
+#else
+    uint32x4_t a_not_nan =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
+    uint32x4_t b_not_nan =
+        vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_eq_b =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
+                                       vreinterpretq_u64_u32(a_eq_b));
+    return vgetq_lane_u64(and_results, 0) & 0x1;
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for not-equal, and return the boolean result (0 or 1).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
+FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
+{
+    return !_mm_comieq_sd(a, b);
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
+FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
+#else
+    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
+FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
+{
+// vrnd32xq_f64 not supported on clang
+#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
+    float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
+    int64x2_t integers = vcvtq_s64_f64(rounded);
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double d0 = ((double *) &rnd)[0];
+    double d1 = ((double *) &rnd)[1];
+    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
+#endif
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
+FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
+{
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double d0 = ((double *) &rnd)[0];
+    double d1 = ((double *) &rnd)[1];
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
+    return vreinterpret_m64_s32(vld1_s32(data));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed single-precision (32-bit) floating-point elements, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
+FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+{
+#if defined(__aarch64__)
+    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
+    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+#else
+    float a0 = (float) ((double *) &a)[0];
+    float a1 = (float) ((double *) &a)[1];
+    return _mm_set_ps(0, 0, a1, a0);
+#endif
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
+FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
+#else
+    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
+    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
+// does not support! It is supported on ARMv8-A however.
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
+{
+#if defined(__ARM_FEATURE_FRINT)
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
+#elif defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    switch (_MM_GET_ROUNDING_MODE()) {
+    case _MM_ROUND_NEAREST:
+        return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
+    case _MM_ROUND_DOWN:
+        return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
+    case _MM_ROUND_UP:
+        return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
+    default:  // _MM_ROUND_TOWARD_ZERO
+        return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
+    }
+#else
+    float *f = (float *) &a;
+    switch (_MM_GET_ROUNDING_MODE()) {
+    case _MM_ROUND_NEAREST: {
+        uint32x4_t signmask = vdupq_n_u32(0x80000000);
+        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
+        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+        int32x4_t r_trunc = vcvtq_s32_f32(
+            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+        float32x4_t delta = vsubq_f32(
+            vreinterpretq_f32_m128(a),
+            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+        uint32x4_t is_delta_half =
+            vceqq_f32(delta, half); /* delta == +/- 0.5 */
+        return vreinterpretq_m128i_s32(
+            vbslq_s32(is_delta_half, r_even, r_normal));
+    }
+    case _MM_ROUND_DOWN:
+        return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
+                             floorf(f[0]));
+    case _MM_ROUND_UP:
+        return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
+                             ceilf(f[0]));
+    default:  // _MM_ROUND_TOWARD_ZERO
+        return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
+                             (int32_t) f[0]);
+    }
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed double-precision (64-bit) floating-point elements, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
+FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
+#else
+    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Copy the lower double-precision (64-bit) floating-point element of a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
+FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
+{
+#if defined(__aarch64__)
+    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
+#else
+    return ((double *) &a)[0];
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
+FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
+{
+#if defined(__aarch64__)
+    return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double ret = ((double *) &rnd)[0];
+    return (int32_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
+FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
+{
+#if defined(__aarch64__)
+    return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+    double ret = ((double *) &rnd)[0];
+    return (int64_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
+#define _mm_cvtsd_si64x _mm_cvtsd_si64
+
+// Convert the lower double-precision (64-bit) floating-point element in b to a
+// single-precision (32-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper 3 packed elements from a to the
+// upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
+FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsetq_lane_f32(
+        vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
+        vreinterpretq_f32_m128(a), 0));
+#else
+    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
+                                                 vreinterpretq_f32_m128(a), 0));
+#endif
+}
+
+// Copy the lower 32-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
+FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
+{
+    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
+FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
+{
+    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Convert the signed 32-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
+FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+    double bf = (double) b;
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Copy 32-bit integer a to the lower elements of dst, and zero the upper
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
+FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+{
+    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
+}
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
+FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+    double bf = (double) b;
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
+FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
+{
+    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
+}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
+#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
+#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
+
+// Convert the lower single-precision (32-bit) floating-point element in b to a
+// double-precision (64-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
+FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
+{
+    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
+FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
+{
+    double a0 = ((double *) &a)[0];
+    double a1 = ((double *) &a)[1];
+    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
+FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
+{
+    double a0 = ((double *) &a)[0];
+    double a1 = ((double *) &a)[1];
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
+    return vreinterpret_m64_s32(vld1_s32(data));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
+FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
+FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
+{
+    double ret = *((double *) &a);
+    return (int32_t) ret;
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
+FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+    double ret = *((double *) &a);
+    return (int64_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
+#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
+
+// Divide packed double-precision (64-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
+FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] / db[0];
+    c[1] = da[1] / db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Divide the lower double-precision (64-bit) floating-point element in a by the
+// lower double-precision (64-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper element from a to the upper
+// element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
+FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    float64x2_t tmp =
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
+#else
+    return _mm_move_sd(a, _mm_div_pd(a, b));
+#endif
+}
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
+// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
+#define _mm_extract_epi16(a, imm) \
+    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
+// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
+//                                       __constrange(0,8) int imm)
+#define _mm_insert_epi16(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s16(                                     \
+            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
+    })
+
+// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
+FORCE_INLINE __m128d _mm_load_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64(p));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
+#define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower of dst, and zero the upper element. mem_addr does not need to be
+// aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
+FORCE_INLINE __m128d _mm_load_sd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
+// on a 16-byte boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
+FORCE_INLINE __m128d _mm_load1_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
+FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
+#else
+    return vreinterpretq_m128d_f32(vcombine_f32(
+        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
+#endif
+}
+
+// Load 64-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
+{
+    /* Load the lower 64 bits of the value pointed to by p into the
+     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
+     */
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
+FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
+#else
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vld1_f32((const float *) p),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+#endif
+}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
+{
+#if defined(__aarch64__)
+    float64x2_t v = vld1q_f64(p);
+    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+#else
+    int64x2_t v = vld1q_s64((const int64_t *) p);
+    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+#endif
+}
+
+// Loads two double-precision from unaligned memory, floating-point values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+    return _mm_load_pd(p);
+}
+
+// Load 128-bits of integer data from memory into dst. mem_addr does not need to
+// be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+    return vreinterpretq_m128i_s32(
+        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
+// 32-bit integers, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+#if defined(__aarch64__)
+    int32x4_t high =
+        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
+
+    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
+#else
+    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                               vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+#endif
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint. mem_addr does not need to be aligned
+// on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
+FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
+{
+    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
+    __m128 b = _mm_load_ps((const float *) mem_addr);
+    int8x16_t masked =
+        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
+                 vreinterpretq_s8_m128(b));
+    vst1q_s8((int8_t *) mem_addr, masked);
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
+FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
+FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed maximum values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
+FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+#if SSE2NEON_PRECISE_MINMAX
+    float64x2_t _a = vreinterpretq_f64_m128d(a);
+    float64x2_t _b = vreinterpretq_f64_m128d(b);
+    return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128d_f64(
+        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#endif
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the maximum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
+FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_max_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
+#endif
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
+FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
+FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed minimum values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
+FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+#if SSE2NEON_PRECISE_MINMAX
+    float64x2_t _a = vreinterpretq_f64_m128d(a);
+    float64x2_t _b = vreinterpretq_f64_m128d(b);
+    return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
+#else
+    return vreinterpretq_m128d_f64(
+        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#endif
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the minimum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
+FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_min_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
+#endif
+}
+
+// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
+// upper element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
+FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
+}
+
+// Move the lower double-precision (64-bit) floating-point element from b to the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
+FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+}
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
+FORCE_INLINE int _mm_movemask_epi8(__m128i a)
+{
+    // Use increasingly wide shifts+adds to collect the sign bits
+    // together.
+    // Since the widening shifts would be rather confusing to follow in little
+    // endian, everything will be illustrated in big endian order instead. This
+    // has a different result - the bits would actually be reversed on a big
+    // endian machine.
+
+    // Starting input (only half the elements are shown):
+    // 89 ff 1d c0 00 10 99 33
+    uint8x16_t input = vreinterpretq_u8_m128i(a);
+
+    // Shift out everything but the sign bits with an unsigned shift right.
+    //
+    // Bytes of the vector::
+    // 89 ff 1d c0 00 10 99 33
+    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
+    //  |  |  |  |  |  |  |  |
+    // 01 01 00 01 00 00 01 00
+    //
+    // Bits of first important lane(s):
+    // 10001001 (89)
+    // \______
+    //        |
+    // 00000001 (01)
+    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+    // Merge the even lanes together with a 16-bit unsigned shift right + add.
+    // 'xx' represents garbage data which will be ignored in the final result.
+    // In the important bytes, the add functions like a binary OR.
+    //
+    // 01 01 00 01 00 00 01 00
+    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
+    //    \|    \|    \|    \|
+    // xx 03 xx 01 xx 00 xx 02
+    //
+    // 00000001 00000001 (01 01)
+    //        \_______ |
+    //                \|
+    // xxxxxxxx xxxxxx11 (xx 03)
+    uint32x4_t paired16 =
+        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+    // Repeat with a wider 32-bit shift + add.
+    // xx 03 xx 01 xx 00 xx 02
+    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
+    //     14))
+    //          \|          \|
+    // xx xx xx 0d xx xx xx 02
+    //
+    // 00000011 00000001 (03 01)
+    //        \\_____ ||
+    //         '----.\||
+    // xxxxxxxx xxxx1101 (xx 0d)
+    uint64x2_t paired32 =
+        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
+    // lanes. xx xx xx 0d xx xx xx 02
+    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
+    //            28))
+    //                      \|
+    // xx xx xx xx xx xx xx d2
+    //
+    // 00001101 00000010 (0d 02)
+    //     \   \___ |  |
+    //      '---.  \|  |
+    // xxxxxxxx 11010010 (xx d2)
+    uint8x16_t paired64 =
+        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+    // xx xx xx xx xx xx xx d2
+    //                      ||  return paired64[0]
+    //                      d2
+    // Note: Little endian would return the correct value 4b (01001011) instead.
+    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed double-precision (64-bit) floating-point element in a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
+FORCE_INLINE int _mm_movemask_pd(__m128d a)
+{
+    uint64x2_t input = vreinterpretq_u64_m128d(a);
+    uint64x2_t high_bits = vshrq_n_u64(input, 63);
+    return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+{
+    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+}
+
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+}
+
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
+// a and b, and store the unsigned 64-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+{
+    // vmull_u32 upcasts instead of masking, so we downcast.
+    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
+    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
+    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
+}
+
+// Multiply packed double-precision (64-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
+FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] * db[0];
+    c[1] = da[1] * db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Multiply the lower double-precision (64-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper element
+// from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
+FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_mul_pd(a, b));
+}
+
+// Multiply the low unsigned 32-bit integers from a and b, and store the
+// unsigned 64-bit result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
+FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u64(vget_low_u64(
+        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
+}
+
+// Multiply the packed signed 16-bit integers in a and b, producing intermediate
+// 32-bit integers, and store the high 16 bits of the intermediate integers in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+    /* FIXME: issue with large values because of result saturation */
+    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
+    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
+    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
+    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
+FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
+{
+    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
+#if defined(__aarch64__)
+    uint32x4_t ab7654 =
+        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
+                              vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r);
+#else
+    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+#endif
+}
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
+// integers, and store the low 16 bits of the intermediate integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
+FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compute the bitwise OR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
+FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
+FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using signed saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
+// using signed saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using unsigned saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Pause the processor. This is typically used in spin-wait loops and depending
+// on the x86 processor typical values are in the 40-100 cycle range. The
+// 'yield' instruction isn't a good fit because it's effectively a nop on most
+// Arm cores. Experience with several databases has shown has shown an 'isb' is
+// a reasonable approximation.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
+FORCE_INLINE void _mm_pause()
+{
+    __asm__ __volatile__("isb\n");
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+{
+    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
+}
+
+// Set packed 16-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+                                   short i6,
+                                   short i5,
+                                   short i4,
+                                   short i3,
+                                   short i2,
+                                   short i1,
+                                   short i0)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    return vreinterpretq_m128i_s16(vld1q_s16(data));
+}
+
+// Set packed 32-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+}
+
+// Set packed 8-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
+FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
+                                  signed char b14,
+                                  signed char b13,
+                                  signed char b12,
+                                  signed char b11,
+                                  signed char b10,
+                                  signed char b9,
+                                  signed char b8,
+                                  signed char b7,
+                                  signed char b6,
+                                  signed char b5,
+                                  signed char b4,
+                                  signed char b3,
+                                  signed char b2,
+                                  signed char b1,
+                                  signed char b0)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
+FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
+{
+    double ALIGN_STRUCT(16) data[2] = {e0, e1};
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
+#else
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
+#endif
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
+#define _mm_set_pd1 _mm_set1_pd
+
+// Copy double-precision (64-bit) floating-point element a to the lower element
+// of dst, and zero the upper element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
+FORCE_INLINE __m128d _mm_set_sd(double a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
+#else
+    return _mm_set_pd(0, a);
+#endif
+}
+
+// Broadcast 16-bit integer a to all all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
+FORCE_INLINE __m128i _mm_set1_epi16(short w)
+{
+    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
+}
+
+// Broadcast 32-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
+}
+
+// Broadcast 64-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
+FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
+}
+
+// Broadcast 64-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
+}
+
+// Broadcast 8-bit integer a to all elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
+FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
+{
+    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
+FORCE_INLINE __m128d _mm_set1_pd(double d)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vdupq_n_f64(d));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
+#endif
+}
+
+// Set packed 16-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
+FORCE_INLINE __m128i _mm_setr_epi16(short w0,
+                                    short w1,
+                                    short w2,
+                                    short w3,
+                                    short w4,
+                                    short w5,
+                                    short w6,
+                                    short w7)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
+    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
+}
+
+// Set packed 32-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
+FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
+FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
+{
+    return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
+}
+
+// Set packed 8-bit integers in dst with the supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
+FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
+                                   signed char b1,
+                                   signed char b2,
+                                   signed char b3,
+                                   signed char b4,
+                                   signed char b5,
+                                   signed char b6,
+                                   signed char b7,
+                                   signed char b8,
+                                   signed char b9,
+                                   signed char b10,
+                                   signed char b11,
+                                   signed char b12,
+                                   signed char b13,
+                                   signed char b14,
+                                   signed char b15)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
+FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
+{
+    return _mm_set_pd(e0, e1);
+}
+
+// Return vector of type __m128d with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
+FORCE_INLINE __m128d _mm_setzero_pd(void)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vdupq_n_f64(0));
+#else
+    return vreinterpretq_m128d_f32(vdupq_n_f32(0));
+#endif
+}
+
+// Return vector of type __m128i with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
+FORCE_INLINE __m128i _mm_setzero_si128(void)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
+}
+
+// Shuffle 32-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
+// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
+//                                        __constrange(0,255) int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_epi32(a, imm)                                            \
+    __extension__({                                                          \
+        int32x4_t _input = vreinterpretq_s32_m128i(a);                       \
+        int32x4_t _shuf =                                                    \
+            vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+                          ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
+        vreinterpretq_m128i_s32(_shuf);                                      \
+    })
+#else  // generic
+#define _mm_shuffle_epi32(a, imm)                        \
+    __extension__({                                      \
+        __m128i ret;                                     \
+        switch (imm) {                                   \
+        case _MM_SHUFFLE(1, 0, 3, 2):                    \
+            ret = _mm_shuffle_epi_1032((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 3, 0, 1):                    \
+            ret = _mm_shuffle_epi_2301((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 3, 2, 1):                    \
+            ret = _mm_shuffle_epi_0321((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 1, 0, 3):                    \
+            ret = _mm_shuffle_epi_2103((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 1, 0):                    \
+            ret = _mm_shuffle_epi_1010((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 0, 1):                    \
+            ret = _mm_shuffle_epi_1001((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 0, 1):                    \
+            ret = _mm_shuffle_epi_0101((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 1, 1):                    \
+            ret = _mm_shuffle_epi_2211((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 2, 2):                    \
+            ret = _mm_shuffle_epi_0122((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 2):                    \
+            ret = _mm_shuffle_epi_3332((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 0, 0, 0):                    \
+            ret = _mm_shuffle_epi32_splat((a), 0);       \
+            break;                                       \
+        case _MM_SHUFFLE(1, 1, 1, 1):                    \
+            ret = _mm_shuffle_epi32_splat((a), 1);       \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 2, 2):                    \
+            ret = _mm_shuffle_epi32_splat((a), 2);       \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 3):                    \
+            ret = _mm_shuffle_epi32_splat((a), 3);       \
+            break;                                       \
+        default:                                         \
+            ret = _mm_shuffle_epi32_default((a), (imm)); \
+            break;                                       \
+        }                                                \
+        ret;                                             \
+    })
+#endif
+
+// Shuffle double-precision (64-bit) floating-point elements using the control
+// in imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
+#ifdef _sse2neon_shuffle
+#define _mm_shuffle_pd(a, b, imm8)                                            \
+    vreinterpretq_m128d_s64(                                                  \
+        vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
+                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
+#else
+#define _mm_shuffle_pd(a, b, imm8)                                     \
+    _mm_castsi128_pd(_mm_set_epi64x(                                   \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
+        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
+#endif
+
+// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shufflehi_epi16(a, imm)                                           \
+    __extension__({                                                           \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);                        \
+        int16x8_t _shuf =                                                     \
+            vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
+                          (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
+                          (((imm) >> 6) & 0x3) + 4);                          \
+        vreinterpretq_m128i_s16(_shuf);                                       \
+    })
+#else  // generic
+#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
+#endif
+
+// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#ifdef _sse2neon_shuffle
+#define _mm_shufflelo_epi16(a, imm)                                  \
+    __extension__({                                                  \
+        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
+        int16x8_t _shuf = vshuffleq_s16(                             \
+            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
+            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
+        vreinterpretq_m128i_s16(_shuf);                              \
+    })
+#else  // generic
+#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
+#endif
+
+// Shift packed 16-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
+FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16((int16_t) c);
+    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
+}
+
+// Shift packed 32-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
+FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32((int32_t) c);
+    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
+}
+
+// Shift packed 64-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
+FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~63))
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64((int64_t) c);
+    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
+}
+
+// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
+FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
+{
+    if (_sse2neon_unlikely(imm & ~15))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s16(
+        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
+}
+
+// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
+FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
+{
+    if (_sse2neon_unlikely(imm & ~31))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s32(
+        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
+}
+
+// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
+FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
+{
+    if (_sse2neon_unlikely(imm & ~63))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s64(
+        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
+}
+
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
+#define _mm_slli_si128(a, imm)                                         \
+    __extension__({                                                    \
+        int8x16_t ret;                                                 \
+        if (_sse2neon_unlikely(imm == 0))                              \
+            ret = vreinterpretq_s8_m128i(a);                           \
+        else if (_sse2neon_unlikely((imm) & ~15))                      \
+            ret = vdupq_n_s8(0);                                       \
+        else                                                           \
+            ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a),   \
+                           ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
+        vreinterpretq_m128i_s8(ret);                                   \
+    })
+
+// Compute the square root of packed double-precision (64-bit) floating-point
+// elements in a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
+FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double a0 = sqrt(((double *) &a)[0]);
+    double a1 = sqrt(((double *) &a)[1]);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Compute the square root of the lower double-precision (64-bit) floating-point
+// element in b, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
+FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_sqrt_pd(b));
+#else
+    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
+#endif
+}
+
+// Shift packed 16-bit integers in a right by count while shifting in sign bits,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
+FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_cmplt_epi16(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
+}
+
+// Shift packed 32-bit integers in a right by count while shifting in sign bits,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
+FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_cmplt_epi32(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
+}
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in sign
+// bits, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
+FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
+{
+    const int count = (imm & ~15) ? 15 : imm;
+    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
+}
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
+// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srai_epi32(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) == 0)) {                                \
+            ret = a;                                                         \
+        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) {              \
+            ret = vreinterpretq_m128i_s32(                                   \
+                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_s32(                                   \
+                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));                \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift packed 16-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
+FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
+    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
+}
+
+// Shift packed 32-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
+FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
+    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
+}
+
+// Shift packed 64-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
+FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (_sse2neon_unlikely(c & ~63))
+        return _mm_setzero_si128();
+
+    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
+    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
+}
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
+#define _mm_srli_epi16(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~15)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u16(                                   \
+                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
+// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_epi32(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~31)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u32(                                   \
+                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
+#define _mm_srli_epi64(a, imm)                                               \
+    __extension__({                                                          \
+        __m128i ret;                                                         \
+        if (_sse2neon_unlikely((imm) & ~63)) {                               \
+            ret = _mm_setzero_si128();                                       \
+        } else {                                                             \
+            ret = vreinterpretq_m128i_u64(                                   \
+                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
+        }                                                                    \
+        ret;                                                                 \
+    })
+
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
+#define _mm_srli_si128(a, imm)                                       \
+    __extension__({                                                  \
+        int8x16_t ret;                                               \
+        if (_sse2neon_unlikely((imm) & ~15))                         \
+            ret = vdupq_n_s8(0);                                     \
+        else                                                         \
+            ret = vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), \
+                           (imm > 15 ? 0 : imm));                    \
+        vreinterpretq_m128i_s8(ret);                                 \
+    })
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
+FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
+FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
+    vst1q_f64((float64_t *) mem_addr,
+              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
+#else
+    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
+    vst1q_f32((float32_t *) mem_addr,
+              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
+FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
+#endif
+}
+
+// Store 128-bits of integer data from a into memory. mem_addr must be aligned
+// on a 16-byte boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
+FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
+#define _mm_store1_pd _mm_store_pd1
+
+// Store the upper double-precision (64-bit) floating-point element from a into
+// memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
+FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store 64-bit integer from the first element of a into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
+FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
+{
+    vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
+FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store 2 double-precision (64-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
+FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
+{
+    float32x4_t f = vreinterpretq_f32_m128d(a);
+    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
+FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
+{
+    _mm_store_pd(mem_addr, a);
+}
+
+// Store 128-bits of integer data from a into memory. mem_addr does not need to
+// be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
+FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Store 32-bit integer from the first element of a into memory. mem_addr does
+// not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
+FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
+{
+    vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory using a non-temporal memory hint. mem_addr must
+// be aligned on a 16-byte boundary or a general-protection exception may be
+// generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
+FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
+#elif defined(__aarch64__)
+    vst1q_f64(p, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
+#endif
+}
+
+// Store 128-bits of integer data from a into memory using a non-temporal memory
+// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
+// exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
+FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, p);
+#else
+    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
+#endif
+}
+
+// Store 32-bit integer a into memory using a non-temporal hint to minimize
+// cache pollution. If the cache line containing address mem_addr is already in
+// the cache, the cache will be updated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
+FORCE_INLINE void _mm_stream_si32(int *p, int a)
+{
+    vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
+}
+
+// Store 64-bit integer a into memory using a non-temporal hint to minimize
+// cache pollution. If the cache line containing address mem_addr is already in
+// the cache, the cache will be updated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
+FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
+{
+    vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
+}
+
+// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
+FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
+FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
+FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
+FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract packed double-precision (64-bit) floating-point elements in b from
+// packed double-precision (64-bit) floating-point elements in a, and store the
+// results in dst.
+//  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
+FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] - db[0];
+    c[1] = da[1] - db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Subtract the lower double-precision (64-bit) floating-point element in b from
+// the lower double-precision (64-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
+FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_sub_pd(a, b));
+}
+
+// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
+FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
+// using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
+FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
+// using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
+FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
+// integers in a using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
+FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
+// integers in a using saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
+FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+#define _mm_ucomieq_sd _mm_comieq_sd
+#define _mm_ucomige_sd _mm_comige_sd
+#define _mm_ucomigt_sd _mm_comigt_sd
+#define _mm_ucomile_sd _mm_comile_sd
+#define _mm_ucomilt_sd _mm_comilt_sd
+#define _mm_ucomineq_sd _mm_comineq_sd
+
+// Return vector of type __m128d with undefined elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
+FORCE_INLINE __m128d _mm_undefined_pd(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+    __m128d a;
+    return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Unpack and interleave 16-bit integers from the high half of a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
+FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 32-bit integers from the high half of a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
+FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 64-bit integers from the high half of a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
+FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s64(
+        vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
+#endif
+}
+
+// Unpack and interleave 8-bit integers from the high half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
+FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 =
+        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the high half of a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
+FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    return vreinterpretq_m128d_s64(
+        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
+                     vget_high_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Unpack and interleave 16-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
+FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 32-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
+FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave 64-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
+FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s64(
+        vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
+#endif
+}
+
+// Unpack and interleave 8-bit integers from the low half of a and b, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
+FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the low half of a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
+FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    return vreinterpretq_m128d_s64(
+        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
+                     vget_low_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
+FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
+{
+    return vreinterpretq_m128d_s64(
+        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
+// and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
+FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+/* SSE3 */
+
+// Alternatively add and subtract packed double-precision (64-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
+FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
+{
+    _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
+                                             vreinterpretq_f64_m128d(b),
+                                             vreinterpretq_f64_m128d(mask)));
+#else
+    return _mm_add_pd(_mm_mul_pd(b, mask), a);
+#endif
+}
+
+// Alternatively add and subtract packed single-precision (32-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
+FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
+{
+    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
+#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
+    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
+                                            vreinterpretq_f32_m128(mask),
+                                            vreinterpretq_f32_m128(b)));
+#else
+    return _mm_add_ps(_mm_mul_ps(b, mask), a);
+#endif
+}
+
+// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
+FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[] = {da[0] + da[1], db[0] + db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of double-precision (64-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
+FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
+{
+#if defined(__aarch64__)
+    float64x2_t a = vreinterpretq_f64_m128d(_a);
+    float64x2_t b = vreinterpretq_f64_m128d(_b);
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
+#else
+    double *da = (double *) &_a;
+    double *db = (double *) &_b;
+    double c[] = {da[0] - da[1], db[0] - db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
+FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+{
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
+#else
+    float32x4x2_t c = vuzpq_f32(a, b);
+    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
+#endif
+}
+
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
+#define _mm_loaddup_pd _mm_load1_pd
+
+// Duplicate the low double-precision (64-bit) floating-point element from a,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
+FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_u64(
+        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
+#endif
+}
+
+// Duplicate odd-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
+FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
+#elif defined(_sse2neon_shuffle)
+    return vreinterpretq_m128_f32(vshuffleq_s32(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
+#else
+    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
+    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+// Duplicate even-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
+FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
+#elif defined(_sse2neon_shuffle)
+    return vreinterpretq_m128_f32(vshuffleq_s32(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
+#else
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
+    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+/* SSSE3 */
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
+FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
+{
+    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
+FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
+FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
+{
+    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
+FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
+{
+    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
+FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
+{
+    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
+FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
+{
+    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
+}
+
+// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 16 bytes in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
+#define _mm_alignr_epi8(a, b, imm)                                            \
+    __extension__({                                                           \
+        uint8x16_t _a = vreinterpretq_u8_m128i(a);                            \
+        uint8x16_t _b = vreinterpretq_u8_m128i(b);                            \
+        __m128i ret;                                                          \
+        if (_sse2neon_unlikely((imm) & ~31))                                  \
+            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));                      \
+        else if (imm >= 16)                                                   \
+            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0);                \
+        else                                                                  \
+            ret =                                                             \
+                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
+        ret;                                                                  \
+    })
+
+// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 8 bytes in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
+#define _mm_alignr_pi8(a, b, imm)                                           \
+    __extension__({                                                         \
+        __m64 ret;                                                          \
+        if (_sse2neon_unlikely((imm) >= 16)) {                              \
+            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
+        } else {                                                            \
+            uint8x8_t tmp_low, tmp_high;                                    \
+            if ((imm) >= 8) {                                               \
+                const int idx = (imm) -8;                                   \
+                tmp_low = vreinterpret_u8_m64(a);                           \
+                tmp_high = vdup_n_u8(0);                                    \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            } else {                                                        \
+                const int idx = (imm);                                      \
+                tmp_low = vreinterpret_u8_m64(b);                           \
+                tmp_high = vreinterpret_u8_m64(a);                          \
+                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+            }                                                               \
+        }                                                                   \
+        ret;                                                                \
+    })
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
+#else
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
+#endif
+}
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s32(
+        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
+
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_s64_s16(
+        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
+#endif
+}
+
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
+FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t res = vuzp_s16(a, b);
+    return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int16x8x2_t c = vuzpq_s16(a, b);
+    return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
+FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(
+        vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
+#else
+    int32x4x2_t c = vuzpq_s32(a, b);
+    return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
+FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t c = vuzp_s16(a, b);
+    return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
+FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
+#else
+    int32x2x2_t c = vuzp_s32(a, b);
+    return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
+// using saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
+FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int16x8x2_t c = vuzpq_s16(a, b);
+    return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
+// using saturation, and pack the signed 16-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
+FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+    return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+    int16x4x2_t c = vuzp_s16(a, b);
+    return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
+// and pack the saturated results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
+FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+                             vmovl_s8(vget_low_s8(b)));
+    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
+                             vmovl_s8(vget_high_s8(b)));
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
+#else
+    // This would be much simpler if x86 would choose to zero extend OR sign
+    // extend, not both. This could probably be optimized better.
+    uint16x8_t a = vreinterpretq_u16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // Zero extend a
+    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
+    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
+    int16x8_t b_odd = vshrq_n_s16(b, 8);
+
+    // multiply
+    int16x8_t prod1 = vmulq_s16(a_even, b_even);
+    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
+#endif
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
+// pack the saturated results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
+FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
+{
+    uint16x4_t a = vreinterpret_u16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // Zero extend a
+    int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
+    int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
+    int16x4_t b_odd = vshr_n_s16(b, 8);
+
+    // multiply
+    int16x4_t prod1 = vmul_s16(a_even, b_even);
+    int16x4_t prod2 = vmul_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
+// the packed 16-bit integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
+FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
+{
+    // Has issues due to saturation
+    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
+
+    // Multiply
+    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    // Rounding narrowing shift right
+    // narrow = (int16_t)((mul + 16384) >> 15);
+    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+    // Join together
+    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Truncate each intermediate integer to the 18 most
+// significant bits, round by adding 1, and store bits [16:1] to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
+FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
+{
+    int32x4_t mul_extend =
+        vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
+
+    // Rounding narrowing shift right
+    return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
+FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
+{
+    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
+    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
+    uint8x16_t idx_masked =
+        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
+#elif defined(__GNUC__)
+    int8x16_t ret;
+    // %e and %f represent the even and odd D registers
+    // respectively.
+    __asm__ __volatile__(
+        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+        : [ret] "=&w"(ret)
+        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+    return vreinterpretq_m128i_s8(ret);
+#else
+    // use this line if testing on aarch64
+    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
+                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
+FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
+{
+    const int8x8_t controlMask =
+        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
+    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
+    return vreinterpret_m64_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
+#else
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
+    // 'a') based on ltMask
+    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x8_t res = vbicq_s16(masked, zeroMask);
+    return vreinterpretq_m128i_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
+    // 'a') based on ltMask
+    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x4_t res = vbicq_s32(masked, zeroMask);
+    return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Element in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+{
+    int8x16_t a = vreinterpretq_s8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
+#else
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
+    // based on ltMask
+    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x16_t res = vbicq_s8(masked, zeroMask);
+
+    return vreinterpretq_m128i_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+#else
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
+    // based on ltMask
+    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x4_t res = vbic_s16(masked, zeroMask);
+
+    return vreinterpret_m64_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Element in dst are
+// zeroed out when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
+    // based on ltMask
+    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x2_t res = vbic_s32(masked, zeroMask);
+
+    return vreinterpret_m64_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
+// in b is negative, and store the results in dst. Element in dst are zeroed out
+// when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
+{
+    int8x8_t a = vreinterpret_s8_m64(_a);
+    int8x8_t b = vreinterpret_s8_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
+    // based on ltMask
+    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x8_t res = vbic_s8(masked, zeroMask);
+
+    return vreinterpret_m64_s8(res);
+}
+
+/* SSE4.1 */
+
+// Blend packed 16-bit integers from a and b using control mask imm8, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
+// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
+//                                      __constrange(0,255) int imm)
+#define _mm_blend_epi16(a, b, imm)                                            \
+    __extension__({                                                           \
+        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,  \
+                                   ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
+        uint16x8_t _mask_vec = vld1q_u16(_mask);                              \
+        uint16x8_t _a = vreinterpretq_u16_m128i(a);                           \
+        uint16x8_t _b = vreinterpretq_u16_m128i(b);                           \
+        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));                \
+    })
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using control mask imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
+#define _mm_blend_pd(a, b, imm)                                \
+    __extension__({                                            \
+        const uint64_t _mask[2] = {                            \
+            ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),   \
+            ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)};  \
+        uint64x2_t _mask_vec = vld1q_u64(_mask);               \
+        uint64x2_t _a = vreinterpretq_u64_m128d(a);            \
+        uint64x2_t _b = vreinterpretq_u64_m128d(b);            \
+        vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
+    })
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
+FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
+{
+    const uint32_t ALIGN_STRUCT(16)
+        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+    uint32x4_t mask = vld1q_u32(data);
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Blend packed 8-bit integers from a and b using mask, and store the results in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
+FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint8x16_t mask =
+        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    uint8x16_t b = vreinterpretq_u8_m128i(_b);
+    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
+}
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
+FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
+{
+    uint64x2_t mask =
+        vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
+#if defined(__aarch64__)
+    float64x2_t a = vreinterpretq_f64_m128d(_a);
+    float64x2_t b = vreinterpretq_f64_m128d(_b);
+    return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
+#else
+    uint64x2_t a = vreinterpretq_u64_m128d(_a);
+    uint64x2_t b = vreinterpretq_u64_m128d(_b);
+    return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
+#endif
+}
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
+FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint32x4_t mask =
+        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Round the packed double-precision (64-bit) floating-point elements in a up
+// to an integer value, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double *f = (double *) &a;
+    return _mm_set_pd(ceil(f[1]), ceil(f[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a up to
+// an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
+FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
+#else
+    float *f = (float *) &a;
+    return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b up to
+// an integer value, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
+FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_ceil_pd(b));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b up to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
+FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_ceil_ps(b));
+}
+
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+    // ARMv7 lacks vceqq_u64
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
+
+// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_s32(s32x4);
+}
+
+// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
+// integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
+FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_u32(
+        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
+}
+
+// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
+{
+    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
+FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_u64(
+        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
+}
+
+// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
+    return vreinterpretq_m128i_u16(u16x8);
+}
+
+// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
+FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_u32(u32x4);
+}
+
+// Zero extend packed unsigned 8-bit integers in the low 8 byte sof a to packed
+// 64-bit integers, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Conditionally multiply the packed double-precision (64-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products, and
+// conditionally store the sum in dst using the low 4 bits of imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
+FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
+{
+    // Generate mask value from constant immediate bit value
+    const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
+    const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
+#if !SSE2NEON_PRECISE_DP
+    const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
+    const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
+#endif
+    // Conditional multiplication
+#if !SSE2NEON_PRECISE_DP
+    __m128d mul = _mm_mul_pd(a, b);
+    const __m128d mulMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
+    __m128d tmp = _mm_and_pd(mul, mulMask);
+#else
+#if defined(__aarch64__)
+    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
+                             : 0;
+    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
+                             : 0;
+#else
+    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
+    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
+#endif
+    __m128d tmp = _mm_set_pd(d1, d0);
+#endif
+    // Sum the products
+#if defined(__aarch64__)
+    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
+#else
+    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
+#endif
+    // Conditionally store the sum
+    const __m128d sumMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
+    __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
+    return res;
+}
+
+// Conditionally multiply the packed single-precision (32-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products,
+// and conditionally store the sum in dst using the low 4 bits of imm.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
+FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
+{
+#if defined(__aarch64__)
+    /* shortcuts */
+    if (imm == 0xFF) {
+        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
+    }
+    if (imm == 0x7F) {
+        float32x4_t m = _mm_mul_ps(a, b);
+        m[3] = 0;
+        return _mm_set1_ps(vaddvq_f32(m));
+    }
+#endif
+
+    float s = 0, c = 0;
+    float32x4_t f32a = vreinterpretq_f32_m128(a);
+    float32x4_t f32b = vreinterpretq_f32_m128(b);
+
+    /* To improve the accuracy of floating-point summation, Kahan algorithm
+     * is used for each operation.
+     */
+    if (imm & (1 << 4))
+        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
+    if (imm & (1 << 5))
+        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
+    if (imm & (1 << 6))
+        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
+    if (imm & (1 << 7))
+        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
+    s += c;
+
+    float32x4_t res = {
+        (imm & 0x1) ? s : 0,
+        (imm & 0x2) ? s : 0,
+        (imm & 0x4) ? s : 0,
+        (imm & 0x8) ? s : 0,
+    };
+    return vreinterpretq_m128_f32(res);
+}
+
+// Extract a 32-bit integer from a, selected with imm8, and store the result in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
+// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
+#define _mm_extract_epi32(a, imm) \
+    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
+
+// Extract a 64-bit integer from a, selected with imm8, and store the result in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
+// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
+#define _mm_extract_epi64(a, imm) \
+    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
+
+// Extract an 8-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
+// __constrange(0,16) int imm)
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
+#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
+
+// Extracts the selected single-precision (32-bit) floating-point from a.
+// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
+#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
+
+// Round the packed double-precision (64-bit) floating-point elements in a down
+// to an integer value, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
+FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double *f = (double *) &a;
+    return _mm_set_pd(floor(f[1]), floor(f[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a down
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
+FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
+#else
+    float *f = (float *) &a;
+    return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b down to
+// an integer value, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
+FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_floor_pd(b));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b down to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
+FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_floor_ps(b));
+}
+
+// Copy a to dst, and insert the 32-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
+// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
+//                                       __constrange(0,4) int imm)
+#define _mm_insert_epi32(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s32(                                     \
+            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
+    })
+
+// Copy a to dst, and insert the 64-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
+// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
+//                                       __constrange(0,2) int imm)
+#define _mm_insert_epi64(a, b, imm)                                  \
+    __extension__({                                                  \
+        vreinterpretq_m128i_s64(                                     \
+            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
+    })
+
+// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
+// location specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
+// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
+//                                      __constrange(0,16) int imm)
+#define _mm_insert_epi8(a, b, imm)                                 \
+    __extension__({                                                \
+        vreinterpretq_m128i_s8(                                    \
+            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
+    })
+
+// Copy a to tmp, then insert a single-precision (32-bit) floating-point
+// element from b into tmp using the control in imm8. Store tmp to dst using
+// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
+#define _mm_insert_ps(a, b, imm8)                                              \
+    __extension__({                                                            \
+        float32x4_t tmp1 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3),               \
+                           vreinterpretq_f32_m128(a), 0);                      \
+        float32x4_t tmp2 =                                                     \
+            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
+                           ((imm8 >> 4) & 0x3));                               \
+        const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 1)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 2)) ? UINT32_MAX : 0,        \
+                                  ((imm8) & (1 << 3)) ? UINT32_MAX : 0};       \
+        uint32x4_t mask = vld1q_u32(data);                                     \
+        float32x4_t all_zeros = vdupq_n_f32(0);                                \
+                                                                               \
+        vreinterpretq_m128_f32(                                                \
+            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2)));         \
+    })
+
+// Compare packed signed 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
+FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
+FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
+FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
+FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
+// in a, store the minimum and index in dst, and zero the remaining bits in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
+FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
+{
+    __m128i dst;
+    uint16_t min, idx = 0;
+#if defined(__aarch64__)
+    // Find the minimum value
+    min = vminvq_u16(vreinterpretq_u16_m128i(a));
+
+    // Get the index of the minimum value
+    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint16x8_t minv = vdupq_n_u16(min);
+    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
+    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
+#else
+    // Find the minimum value
+    __m64 tmp;
+    tmp = vreinterpret_m64_u16(
+        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
+                 vget_high_u16(vreinterpretq_u16_m128i(a))));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
+    // Get the index of the minimum value
+    int i;
+    for (i = 0; i < 8; i++) {
+        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
+            idx = (uint16_t) i;
+            break;
+        }
+        a = _mm_srli_si128(a, 2);
+    }
+#endif
+    // Generate result
+    dst = _mm_setzero_si128();
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
+    return dst;
+}
+
+// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
+// 8-bit integers in a compared to those in b, and store the 16-bit results in
+// dst. Eight SADs are performed using one quadruplet from b and eight
+// quadruplets from a. One quadruplet is selected from b starting at on the
+// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
+// integers selected from a starting at the offset specified in imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
+FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
+{
+    uint8x16_t _a, _b;
+
+    switch (imm & 0x4) {
+    case 0:
+        // do nothing
+        _a = vreinterpretq_u8_m128i(a);
+        break;
+    case 4:
+        _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
+                                            vreinterpretq_u32_m128i(a), 1));
+        break;
+    default:
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_unreachable();
+#endif
+        break;
+    }
+
+    switch (imm & 0x3) {
+    case 0:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
+        break;
+    case 1:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
+        break;
+    case 2:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
+        break;
+    case 3:
+        _b = vreinterpretq_u8_u32(
+            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
+        break;
+    default:
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_unreachable();
+#endif
+        break;
+    }
+
+    int16x8_t c04, c15, c26, c37;
+    uint8x8_t low_b = vget_low_u8(_b);
+    c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
+    uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
+    c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
+    uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
+    c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
+    uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
+    c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
+#if defined(__aarch64__)
+    // |0|4|2|6|
+    c04 = vpaddq_s16(c04, c26);
+    // |1|5|3|7|
+    c15 = vpaddq_s16(c15, c37);
+
+    int32x4_t trn1_c =
+        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
+    int32x4_t trn2_c =
+        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
+    return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
+                                              vreinterpretq_s16_s32(trn2_c)));
+#else
+    int16x4_t c01, c23, c45, c67;
+    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
+    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
+    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
+    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
+
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
+#endif
+}
+
+// Multiply the low signed 32-bit integers from each packed 64-bit element in
+// a and b, and store the signed 64-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
+{
+    // vmull_s32 upcasts instead of masking, so we downcast.
+    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
+    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
+}
+
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
+// integers, and store the low 32 bits of the intermediate integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
+FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
+// using unsigned saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Round the packed double-precision (64-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
+FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
+{
+#if defined(__aarch64__)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return _mm_floor_pd(a);
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return _mm_ceil_pd(a);
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
+    }
+#else
+    double *v_double = (double *) &a;
+
+    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
+        (rounding == _MM_FROUND_CUR_DIRECTION &&
+         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
+        double res[2], tmp;
+        for (int i = 0; i < 2; i++) {
+            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
+            double roundDown = floor(tmp);  // Round down value
+            double roundUp = ceil(tmp);     // Round up value
+            double diffDown = tmp - roundDown;
+            double diffUp = roundUp - tmp;
+            if (diffDown < diffUp) {
+                /* If it's closer to the round down value, then use it */
+                res[i] = roundDown;
+            } else if (diffDown > diffUp) {
+                /* If it's closer to the round up value, then use it */
+                res[i] = roundUp;
+            } else {
+                /* If it's equidistant between round up and round down value,
+                 * pick the one which is an even number */
+                double half = roundDown / 2;
+                if (half != floor(half)) {
+                    /* If the round down value is odd, return the round up value
+                     */
+                    res[i] = roundUp;
+                } else {
+                    /* If the round up value is odd, return the round down value
+                     */
+                    res[i] = roundDown;
+                }
+            }
+            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
+        }
+        return _mm_set_pd(res[1], res[0]);
+    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
+        return _mm_floor_pd(a);
+    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
+        return _mm_ceil_pd(a);
+    }
+    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
+                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed single-precision
+// floating-point elements in dst.
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
+FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return _mm_floor_ps(a);
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return _mm_ceil_ps(a);
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
+    }
+#else
+    float *v_float = (float *) &a;
+
+    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
+        (rounding == _MM_FROUND_CUR_DIRECTION &&
+         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
+        uint32x4_t signmask = vdupq_n_u32(0x80000000);
+        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
+        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+        int32x4_t r_trunc = vcvtq_s32_f32(
+            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+        float32x4_t delta = vsubq_f32(
+            vreinterpretq_f32_m128(a),
+            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+        uint32x4_t is_delta_half =
+            vceqq_f32(delta, half); /* delta == +/- 0.5 */
+        return vreinterpretq_m128_f32(
+            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
+    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
+        return _mm_floor_ps(a);
+    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
+               (rounding == _MM_FROUND_CUR_DIRECTION &&
+                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
+        return _mm_ceil_ps(a);
+    }
+    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
+                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
+                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
+                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b using
+// the rounding parameter, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
+FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
+{
+    return _mm_move_sd(a, _mm_round_pd(b, rounding));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b using
+// the rounding parameter, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst. Rounding is done according to the
+// rounding[3:0] parameter, which can be one of:
+//     (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
+//     suppress exceptions
+//     (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and
+//     suppress exceptions
+//     (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress
+//     exceptions
+//     (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress
+//     exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
+//     _MM_SET_ROUNDING_MODE
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
+FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
+{
+    return _mm_move_ss(a, _mm_round_ps(b, rounding));
+}
+
+// Load 128-bits of integer data from memory into dst using a non-temporal
+// memory hint. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
+FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    return __builtin_nontemporal_load(p);
+#else
+    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
+#endif
+}
+
+// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
+// all 1's, and return 1 if the result is zero, otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
+FORCE_INLINE int _mm_test_all_ones(__m128i a)
+{
+    return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
+           ~(uint64_t) 0;
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and return 1 if the result is zero, otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
+FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
+{
+    int64x2_t a_and_mask =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
+    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
+// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
+// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
+// otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
+FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
+{
+    uint64x2_t zf =
+        vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
+    uint64x2_t cf =
+        vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
+    uint64x2_t result = vandq_u64(zf, cf);
+    return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the CF value.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
+FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
+{
+    int64x2_t s64 =
+        vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
+// otherwise return 0.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
+#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the ZF value.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
+FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
+{
+    int64x2_t s64 =
+        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
+    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+/* SSE4.2 */
+
+const static uint16_t _sse2neon_cmpestr_mask16b[8] ALIGN_STRUCT(16) = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+};
+const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+};
+
+/* specify the source data format */
+#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
+#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
+#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
+#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
+
+/* specify the comparison operation */
+#define _SIDD_CMP_EQUAL_ANY 0x00     /* compare equal any: strchr */
+#define _SIDD_CMP_RANGES 0x04        /* compare ranges */
+#define _SIDD_CMP_EQUAL_EACH 0x08    /* compare equal each: strcmp */
+#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
+
+/* specify the polarity */
+#define _SIDD_POSITIVE_POLARITY 0x00
+#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
+#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
+#define _SIDD_MASKED_NEGATIVE_POLARITY \
+    0x30 /* negate results only before end of string */
+
+/* specify the output selection in _mm_cmpXstri */
+#define _SIDD_LEAST_SIGNIFICANT 0x00
+#define _SIDD_MOST_SIGNIFICANT 0x40
+
+/* specify the output selection in _mm_cmpXstrm */
+#define _SIDD_BIT_MASK 0x00
+#define _SIDD_UNIT_MASK 0x40
+
+/* Pattern Matching for C macros.
+ * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
+ */
+
+/* catenate */
+#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
+#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
+
+#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
+/* run the 2nd parameter */
+#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
+/* run the 1st parameter */
+#define SSE2NEON_IIF_1(t, ...) t
+
+#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
+#define SSE2NEON_COMPL_0 1
+#define SSE2NEON_COMPL_1 0
+
+#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
+#define SSE2NEON_DEC_1 0
+#define SSE2NEON_DEC_2 1
+#define SSE2NEON_DEC_3 2
+#define SSE2NEON_DEC_4 3
+#define SSE2NEON_DEC_5 4
+#define SSE2NEON_DEC_6 5
+#define SSE2NEON_DEC_7 6
+#define SSE2NEON_DEC_8 7
+#define SSE2NEON_DEC_9 8
+#define SSE2NEON_DEC_10 9
+#define SSE2NEON_DEC_11 10
+#define SSE2NEON_DEC_12 11
+#define SSE2NEON_DEC_13 12
+#define SSE2NEON_DEC_14 13
+#define SSE2NEON_DEC_15 14
+#define SSE2NEON_DEC_16 15
+
+/* detection */
+#define SSE2NEON_CHECK_N(x, n, ...) n
+#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
+#define SSE2NEON_PROBE(x) x, 1,
+
+#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
+#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
+
+#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
+#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
+
+#define SSE2NEON_EAT(...)
+#define SSE2NEON_EXPAND(...) __VA_ARGS__
+#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
+
+/* recursion */
+/* deferred expression */
+#define SSE2NEON_EMPTY()
+#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
+#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
+#define SSE2NEON_EXPAND(...) __VA_ARGS__
+
+#define SSE2NEON_EVAL(...) \
+    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
+#define SSE2NEON_EVAL1(...) \
+    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
+#define SSE2NEON_EVAL2(...) \
+    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
+#define SSE2NEON_EVAL3(...) __VA_ARGS__
+
+#define SSE2NEON_REPEAT(count, macro, ...)                         \
+    SSE2NEON_WHEN(count)                                           \
+    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()(                \
+        SSE2NEON_DEC(count), macro,                                \
+        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
+                                              __VA_ARGS__))
+#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
+
+#define SSE2NEON_SIZE_OF_byte 8
+#define SSE2NEON_NUMBER_OF_LANES_byte 16
+#define SSE2NEON_SIZE_OF_word 16
+#define SSE2NEON_NUMBER_OF_LANES_word 8
+
+#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type)                         \
+    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type(                          \
+        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
+        vreinterpretq_##type##_m128i(a)));
+
+#define SSE2NEON_FILL_LANE(i, type) \
+    vec_b[i] =                      \
+        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
+
+#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size,        \
+                       number_of_lanes, byte_or_word)                         \
+    do {                                                                      \
+        SSE2NEON_CAT(                                                         \
+            data_type_prefix,                                                 \
+            SSE2NEON_CAT(size,                                                \
+                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
+        vec_b[number_of_lanes];                                               \
+        __m128i mask = SSE2NEON_IIF(byte_or_word)(                            \
+            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)),                       \
+            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff)));                    \
+        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE,    \
+                                      SSE2NEON_CAT(type_prefix, size)))       \
+        for (int i = 0; i < number_of_lanes; i++) {                           \
+            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u,                      \
+                                  size)(SSE2NEON_CAT(vbslq_u, size)(          \
+                SSE2NEON_CAT(vreinterpretq_u,                                 \
+                             SSE2NEON_CAT(size, _m128i))(mask),               \
+                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))(        \
+                    vec_b[i],                                                 \
+                    SSE2NEON_CAT(                                             \
+                        vreinterpretq_,                                       \
+                        SSE2NEON_CAT(type_prefix,                             \
+                                     SSE2NEON_CAT(size, _m128i(a))))),        \
+                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))(        \
+                    vec_b[i],                                                 \
+                    SSE2NEON_CAT(                                             \
+                        vreinterpretq_,                                       \
+                        SSE2NEON_CAT(type_prefix,                             \
+                                     SSE2NEON_CAT(size, _m128i(a)))))));      \
+        }                                                                     \
+    } while (0)
+
+#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes)                         \
+    do {                                                                     \
+        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes,                       \
+                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
+                                      SSE2NEON_CAT(u, size)))                \
+    } while (0)
+
+#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
+    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
+                                                int lb)                       \
+    {                                                                         \
+        __m128i mtx[16];                                                      \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
+        return SSE2NEON_CAT(                                                  \
+            _sse2neon_aggregate_equal_any_,                                   \
+            SSE2NEON_CAT(                                                     \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
+                                             type))))(la, lb, mtx);           \
+    }
+
+#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
+    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
+                                                 int lb)                       \
+    {                                                                          \
+        __m128i mtx[16];                                                       \
+        PCMPSTR_RANGES(                                                        \
+            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
+        return SSE2NEON_CAT(                                                   \
+            _sse2neon_aggregate_ranges_,                                       \
+            SSE2NEON_CAT(                                                      \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
+                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
+                                             type))))(la, lb, mtx);            \
+    }
+
+#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                  \
+    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,         \
+                                                    __m128i b, int lb)         \
+    {                                                                          \
+        __m128i mtx[16];                                                       \
+        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),           \
+                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));             \
+        return SSE2NEON_CAT(                                                   \
+            _sse2neon_aggregate_equal_ordered_,                                \
+            SSE2NEON_CAT(                                                      \
+                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
+                SSE2NEON_CAT(x,                                                \
+                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
+            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);       \
+    }
+
+static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u8(
+            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u8(
+            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
+        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
+        res |= (tmp << j);
+    }
+    return res;
+}
+
+static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint16x8_t vec =
+        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u16(
+            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
+        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
+        res |= (tmp << j);
+    }
+    return res;
+}
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
+    prefix##IMPL(byte) \
+    prefix##IMPL(word)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
+
+static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint16x8_t vec =
+        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u16(
+            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
+        __m128i tmp = vreinterpretq_m128i_u32(
+            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
+        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
+                                       vreinterpretq_u32_m128i(tmp));
+#if defined(__aarch64__)
+        int t = vaddvq_u32(vec_res) ? 1 : 0;
+#else
+        uint64x2_t sumh = vpaddlq_u32(vec_res);
+        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+        res |= (t << j);
+    }
+    return res;
+}
+
+static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u8(
+            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u8(
+            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
+        __m128i tmp = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
+        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
+                                       vreinterpretq_u16_m128i(tmp));
+        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
+        res |= (t << j);
+    }
+    return res;
+}
+
+#define SSE2NEON_CMP_RANGES_IS_BYTE 1
+#define SSE2NEON_CMP_RANGES_IS_WORD 0
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_RANGES(prefix)             \
+    prefix##IMPL(byte, uint, u, prefix##IS_BYTE)         \
+    prefix##IMPL(byte, int, s, prefix##IS_BYTE)          \
+    prefix##IMPL(word, uint, u, prefix##IS_WORD)         \
+    prefix##IMPL(word, int, s, prefix##IS_WORD)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
+
+#undef SSE2NEON_CMP_RANGES_IS_BYTE
+#undef SSE2NEON_CMP_RANGES_IS_WORD
+
+static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
+{
+    uint8x16_t mtx =
+        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
+    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    int m1 = 0x10000 - (1 << la);
+    int tb = 0x10000 - (1 << lb);
+    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
+    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
+    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
+    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
+    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
+    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
+    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
+    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
+
+    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
+    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
+    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
+    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
+    res_lo = vand_u8(res_lo, vec_mask);
+    res_hi = vand_u8(res_hi, vec_mask);
+
+    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
+    return res;
+}
+
+static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
+{
+    uint16x8_t mtx =
+        vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
+    int m1 = 0x100 - (1 << la);
+    int tb = 0x100 - (1 << lb);
+    uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
+    uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
+    uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
+    uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
+    mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
+    mtx = vbslq_u16(vec1, tmp, mtx);
+    mtx = vandq_u16(mtx, vec_mask);
+    return _sse2neon_vaddvq_u16(mtx);
+}
+
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
+
+#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
+    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
+        int bound, int la, int lb, __m128i mtx[16])                            \
+    {                                                                          \
+        int res = 0;                                                           \
+        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
+        uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
+            vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
+            vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
+        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
+            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
+                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
+            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
+        uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
+        uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
+        for (int j = 0; j < lb; j++) {                                         \
+            mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size(                \
+                vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j])));   \
+        }                                                                      \
+        for (int j = lb; j < bound; j++) {                                     \
+            mtx[j] = vreinterpretq_m128i_u##size(                              \
+                vbslq_u##size(vec1, vec_minusone, vec_zero));                  \
+        }                                                                      \
+        unsigned SSE2NEON_IIF(data_type)(char, short) *ptr =                   \
+            (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx;             \
+        for (int i = 0; i < bound; i++) {                                      \
+            int val = 1;                                                       \
+            for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
+                val &= ptr[k * bound + j];                                     \
+            res += val << i;                                                   \
+        }                                                                      \
+        return res;                                                            \
+    }
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
+    prefix##IMPL(8, 16, prefix##IS_UBYTE)               \
+    prefix##IMPL(16, 8, prefix##IS_UWORD)
+/* clang-format on */
+
+SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
+
+#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
+#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
+    prefix##IMPL(byte)                              \
+    prefix##IMPL(word)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
+
+#define SSE2NEON_CMPESTR_LIST                          \
+    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
+    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any)         \
+    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
+    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any)         \
+    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges)              \
+    _(CMP_UWORD_RANGES, cmp_uword_ranges)              \
+    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges)              \
+    _(CMP_SWORD_RANGES, cmp_sword_ranges)              \
+    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
+    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each)       \
+    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
+    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each)       \
+    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
+    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
+
+enum {
+#define _(name, func_suffix) name,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
+static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
+#define _(name, func_suffix) _sse2neon_##func_suffix,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+
+FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
+{
+    switch (imm8 & 0x30) {
+    case _SIDD_NEGATIVE_POLARITY:
+        res ^= 0xffffffff;
+        break;
+    case _SIDD_MASKED_NEGATIVE_POLARITY:
+        res ^= (1 << lb) - 1;
+        break;
+    default:
+        break;
+    }
+
+    return res & ((bound == 8) ? 0xFF : 0xFFFF);
+}
+
+FORCE_INLINE int _sse2neon_clz(unsigned int x)
+{
+#if _MSC_VER
+    DWORD cnt = 0;
+    if (_BitScanForward(&cnt, x))
+        return cnt;
+    return 32;
+#else
+    return x != 0 ? __builtin_clz(x) : 32;
+#endif
+}
+
+FORCE_INLINE int _sse2neon_ctz(unsigned int x)
+{
+#if _MSC_VER
+    DWORD cnt = 0;
+    if (_BitScanReverse(&cnt, x))
+        return 31 - cnt;
+    return 32;
+#else
+    return x != 0 ? __builtin_ctz(x) : 32;
+#endif
+}
+
+FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
+{
+#if _MSC_VER
+    unsigned long cnt;
+#ifdef defined(SSE2NEON_HAS_BITSCAN64)
+    (defined(_M_AMD64) || defined(__x86_64__))
+        if((_BitScanForward64(&cnt, x))
+            return (int)(cnt);
+#else
+    if (_BitScanForward(&cnt, (unsigned long) (x)))
+        return (int) cnt;
+    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
+        return (int) (cnt + 32);
+#endif
+    return 64;
+#else
+    return x != 0 ? __builtin_ctzll(x) : 64;
+#endif
+}
+
+#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
+
+#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
+    const int var = (imm & 0x01) ? 8 : 16
+
+#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
+    int tmp1 = la ^ (la >> 31);                  \
+    la = tmp1 - (la >> 31);                      \
+    int tmp2 = lb ^ (lb >> 31);                  \
+    lb = tmp2 - (lb >> 31);                      \
+    la = SSE2NEON_MIN(la, bound);                \
+    lb = SSE2NEON_MIN(lb, bound)
+
+// Compare all pairs of character in string a and b,
+// then aggregate the result.
+// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
+// length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
+// string a and b.
+#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
+    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
+    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
+    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
+
+#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
+    return (r2 == 0) ? bound                                     \
+                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
+                                      : _sse2neon_ctz(r2))
+
+#define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
+    __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
+    if (imm8 & 0x40) {                                                         \
+        if (bound == 8) {                                                      \
+            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
+                                       vld1q_u16(_sse2neon_cmpestr_mask16b));  \
+            dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
+                tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
+        } else {                                                               \
+            uint8x16_t vec_r2 =                                                \
+                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
+            uint8x16_t tmp =                                                   \
+                vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
+            dst = vreinterpretq_m128i_u8(                                      \
+                vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst)));   \
+        }                                                                      \
+    } else {                                                                   \
+        if (bound == 16) {                                                     \
+            dst = vreinterpretq_m128i_u16(                                     \
+                vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
+        } else {                                                               \
+            dst = vreinterpretq_m128i_u8(                                      \
+                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
+        }                                                                      \
+    }                                                                          \
+    return dst
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and returns 1 if b did not contain a null character and the
+// resulting mask was zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
+FORCE_INLINE int _mm_cmpestra(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    int lb_cpy = lb;
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return !r2 & (lb_cpy > bound);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
+FORCE_INLINE int _mm_cmpestrc(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return r2 != 0;
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and store the generated index in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
+FORCE_INLINE int _mm_cmpestri(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control
+// in imm8, and store the generated mask in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
+FORCE_INLINE __m128i
+_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns bit 0 of the resulting bit mask.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
+FORCE_INLINE int _mm_cmpestro(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
+    return r2 & 1;
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns 1 if any character in a was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
+FORCE_INLINE int _mm_cmpestrs(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    return la <= (bound - 1);
+}
+
+// Compare packed strings in a and b with lengths la and lb using the control in
+// imm8, and returns 1 if any character in b was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
+FORCE_INLINE int _mm_cmpestrz(__m128i a,
+                              int la,
+                              __m128i b,
+                              int lb,
+                              const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    return lb <= (bound - 1);
+}
+
+#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
+    do {                                                                 \
+        if (imm8 & 0x01) {                                               \
+            uint16x8_t equal_mask_##str =                                \
+                vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
+            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
+            uint64_t matches_##str =                                     \
+                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
+            len = _sse2neon_ctzll(matches_##str) >> 3;                   \
+        } else {                                                         \
+            uint16x8_t equal_mask_##str = vreinterpretq_u16_u8(          \
+                vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0)));   \
+            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
+            uint64_t matches_##str =                                     \
+                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
+            len = _sse2neon_ctzll(matches_##str) >> 2;                   \
+        }                                                                \
+    } while (0)
+
+#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
+    int la, lb;                                  \
+    do {                                         \
+        SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);   \
+        SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);   \
+    } while (0)
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if b did not contain a null character and the resulting
+// mask was zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
+FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return !r2 & (lb >= bound);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
+FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return r2 != 0;
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and store the generated index in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
+FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and store the generated mask in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
+FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns bit 0 of the resulting bit mask.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
+FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
+    return r2 & 1;
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if any character in a was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
+FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    int la;
+    SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
+    return la <= (bound - 1);
+}
+
+// Compare packed strings with implicit lengths in a and b using the control in
+// imm8, and returns 1 if any character in b was null, and 0 otherwise.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
+FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
+{
+    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
+    int lb;
+    SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
+    return lb <= (bound - 1);
+}
+
+// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
+// in b for greater than.
+FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    return vreinterpretq_m128i_s64(vshrq_n_s64(
+        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
+        63));
+#endif
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 16-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
+FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32ch(crc, v);
+#else
+    crc = _mm_crc32_u8(crc, v & 0xff);
+    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 32-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
+FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32cw(crc, v);
+#else
+    crc = _mm_crc32_u16(crc, v & 0xffff);
+    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 64-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
+FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#else
+    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+    crc = __crc32cb(crc, v);
+#else
+    crc ^= v;
+    for (int bit = 0; bit < 8; bit++) {
+        if (crc & 1)
+            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
+        else
+            crc = (crc >> 1);
+    }
+#endif
+    return crc;
+}
+
+/* AES */
+
+#if !defined(__ARM_FEATURE_CRYPTO)
+/* clang-format off */
+#define SSE2NEON_AES_SBOX(w)                                           \
+    {                                                                  \
+        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
+        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
+        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
+        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
+        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
+        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
+        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
+        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
+        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
+        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
+        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
+        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
+        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
+        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
+        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
+        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
+        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
+        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
+        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
+        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
+        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
+        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
+        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
+        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
+        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
+        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
+        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
+        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
+        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
+        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
+        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
+        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
+        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
+        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
+        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
+        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
+        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
+    }
+#define SSE2NEON_AES_RSBOX(w)                                          \
+    {                                                                  \
+        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
+        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
+        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
+        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
+        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
+        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
+        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
+        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
+        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
+        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
+        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
+        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
+        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
+        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
+        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
+        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
+        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
+        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
+        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
+        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
+        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
+        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
+        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
+        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
+        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
+        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
+        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
+        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
+        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
+        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
+        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
+        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
+        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
+        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
+        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
+        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
+        w(0x55), w(0x21), w(0x0c), w(0x7d)                             \
+    }
+/* clang-format on */
+
+/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
+#define SSE2NEON_AES_H0(x) (x)
+static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
+static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
+#undef SSE2NEON_AES_H0
+
+/* x_time function and matrix multiply function */
+#if !defined(__aarch64__)
+#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
+#define SSE2NEON_MULTIPLY(x, y)                                  \
+    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
+     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
+     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
+     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
+#endif
+
+// In the absence of crypto extensions, implement aesenc using regular NEON
+// intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
+// for more information.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t shift_rows[] = {
+        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+    };
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    /* shift rows */
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    /* sub bytes */
+    // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
+    // look up each of the table. After each lookup, we load the next table
+    // which locates at the next 64-bytes. In the meantime, the index in the
+    // table would be smaller than it was, so the index parameters of
+    // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
+    // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
+
+    /* mix columns */
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    /* add round key */
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A implementation for a table-based AES */
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
+    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
+     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
+// muliplying 'x' by 2 in GF(2^8)
+#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
+// muliplying 'x' by 3 in GF(2^8)
+#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
+#define SSE2NEON_AES_U0(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
+#define SSE2NEON_AES_U1(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
+#define SSE2NEON_AES_U2(p) \
+    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
+#define SSE2NEON_AES_U3(p) \
+    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
+
+    // this generates a table containing every possible permutation of
+    // shift_rows() and sub_bytes() with mix_columns().
+    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
+        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
+    };
+#undef SSE2NEON_AES_B2W
+#undef SSE2NEON_AES_F2
+#undef SSE2NEON_AES_F3
+#undef SSE2NEON_AES_U0
+#undef SSE2NEON_AES_U1
+#undef SSE2NEON_AES_U2
+#undef SSE2NEON_AES_U3
+
+    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
+    uint32_t x1 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
+    uint32_t x2 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
+    uint32_t x3 =
+        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]
+
+    // finish the modulo addition step in mix_columns()
+    __m128i out = _mm_set_epi32(
+        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
+         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
+        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
+         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
+        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
+         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
+        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
+         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
+
+    return _mm_xor_si128(out, RoundKey);
+#endif
+}
+
+// Perform one round of an AES decryption flow on data (state) in a using the
+// round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
+FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t inv_shift_rows[] = {
+        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
+        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
+    };
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // inverse shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
+
+    // inverse sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
+
+    // inverse mix columns
+    // muliplying 'v' by 4 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
+    v ^= w;
+    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
+
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
+                                 0x1b);  // muliplying 'v' by 2 in GF(2^8)
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    // add round key
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+    /* FIXME: optimized for NEON */
+    uint8_t i, e, f, g, h, v[4][4];
+    uint8_t *_a = (uint8_t *) &a;
+    for (i = 0; i < 16; ++i) {
+        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
+    }
+
+    // inverse mix columns
+    for (i = 0; i < 4; ++i) {
+        e = v[i][0];
+        f = v[i][1];
+        g = v[i][2];
+        h = v[i][3];
+
+        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
+                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
+        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
+                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
+        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
+                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
+        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
+                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+#endif
+}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t shift_rows[] = {
+        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    // sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
+
+    // add round key
+    return vreinterpretq_m128i_u8(v) ^ RoundKey;
+
+#else /* ARMv7-A implementation */
+    uint8_t v[16] = {
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
+    };
+
+    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
+#endif
+}
+
+// Perform the last round of an AES decryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+    static const uint8_t inv_shift_rows[] = {
+        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
+        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    // inverse shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
+
+    // inverse sub bytes
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
+
+    // add round key
+    return vreinterpretq_m128i_u8(v) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+    /* FIXME: optimized for NEON */
+    uint8_t v[4][4];
+    uint8_t *_a = (uint8_t *) &a;
+    for (int i = 0; i < 16; ++i) {
+        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
+#endif
+}
+
+// Perform the InvMixColumns transformation on a and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
+{
+#if defined(__aarch64__)
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+    uint8x16_t v = vreinterpretq_u8_m128i(a);
+    uint8x16_t w;
+
+    // multiplying 'v' by 4 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
+    v ^= w;
+    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
+
+    // multiplying 'v' by 2 in GF(2^8)
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+    return vreinterpretq_m128i_u8(w);
+
+#else /* ARMv7-A NEON implementation */
+    uint8_t i, e, f, g, h, v[4][4];
+    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
+    for (i = 0; i < 4; ++i) {
+        e = v[i][0];
+        f = v[i][1];
+        g = v[i][2];
+        h = v[i][3];
+
+        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
+                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
+        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
+                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
+        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
+                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
+        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
+                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
+    }
+
+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
+#endif
+}
+
+// Assist in expanding the AES cipher key by computing steps towards generating
+// a round key for encryption cipher using data from a and an 8-bit round
+// constant specified in imm8, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
+//
+// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
+// This instruction generates a round key for AES encryption. See
+// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
+// for details.
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+#if defined(__aarch64__)
+    uint8x16_t _a = vreinterpretq_u8_m128i(a);
+    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
+
+    uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
+    uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
+    uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
+
+    return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
+
+#else /* ARMv7-A NEON implementation */
+    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
+    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
+    for (int i = 0; i < 4; ++i) {
+        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
+        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
+    }
+    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
+                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
+#endif
+}
+#undef SSE2NEON_AES_SBOX
+#undef SSE2NEON_AES_RSBOX
+
+#if defined(__aarch64__)
+#undef SSE2NEON_XT
+#undef SSE2NEON_MULTIPLY
+#endif
+
+#else /* __ARM_FEATURE_CRYPTO */
+// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
+// AESMC and then manually applying the real key as an xor operation. This
+// unfortunately means an additional xor op; the compiler should be able to
+// optimize this away for repeated calls however. See
+// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
+// for more details.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
+        vreinterpretq_u8_m128i(b));
+}
+
+// Perform one round of an AES decryption flow on data (state) in a using the
+// round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
+FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
+{
+    return vreinterpretq_m128i_u8(veorq_u8(
+        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+        vreinterpretq_u8_m128i(RoundKey)));
+}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
+                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+                         RoundKey);
+}
+
+// Perform the last round of an AES decryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
+{
+    return vreinterpretq_m128i_u8(
+        vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^
+        vreinterpretq_u8_m128i(RoundKey));
+}
+
+// Perform the InvMixColumns transformation on a and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
+{
+    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
+}
+
+// Assist in expanding the AES cipher key by computing steps towards generating
+// a round key for encryption cipher using data from a and an 8-bit round
+// constant specified in imm8, and store the result in dst."
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+    // AESE does ShiftRows and SubBytes on A
+    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
+
+    uint8x16_t dest = {
+        // Undo ShiftRows step from AESE and extract X1 and X3
+        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
+        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
+        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
+        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
+    };
+    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
+    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
+}
+#endif
+
+/* Others */
+
+// Perform a carry-less multiplication of two 64-bit integers, selected from a
+// and b according to imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
+FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
+{
+    uint64x2_t a = vreinterpretq_u64_m128i(_a);
+    uint64x2_t b = vreinterpretq_u64_m128i(_b);
+    switch (imm & 0x11) {
+    case 0x00:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
+    case 0x01:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
+    case 0x10:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
+    case 0x11:
+        return vreinterpretq_m128i_u64(
+            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
+    default:
+        abort();
+    }
+}
+
+FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
+}
+
+// Count the number of bits set to 1 in unsigned 32-bit integer a, and
+// return that count in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
+FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcount)
+    return __builtin_popcount(a);
+#else
+    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
+#endif
+#else
+    uint32_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+
+    vst1_u32(&count, count32x2_val);
+    return count;
+#endif
+}
+
+// Count the number of bits set to 1 in unsigned 64-bit integer a, and
+// return that count in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
+FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcountll)
+    return __builtin_popcountll(a);
+#else
+    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
+#endif
+#else
+    uint64_t count = 0;
+    uint8x8_t input_val, count8x8_val;
+    uint16x4_t count16x4_val;
+    uint32x2_t count32x2_val;
+    uint64x1_t count64x1_val;
+
+    input_val = vld1_u8((uint8_t *) &a);
+    count8x8_val = vcnt_u8(input_val);
+    count16x4_val = vpaddl_u8(count8x8_val);
+    count32x2_val = vpaddl_u16(count16x4_val);
+    count64x1_val = vpaddl_u32(count32x2_val);
+    vst1_u64(&count, count64x1_val);
+    return count;
+#endif
+}
+
+FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
+{
+    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+    // regardless of the value of the FZ bit.
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
+
+#if defined(__aarch64__)
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
+#endif
+}
+
+// Return the current 64-bit value of the processor's time-stamp counter.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
+FORCE_INLINE uint64_t _rdtsc(void)
+{
+#if defined(__aarch64__)
+    uint64_t val;
+
+    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
+     * system counter is at least 56 bits wide; from Armv8.6, the counter
+     * must be 64 bits wide.  So the system counter could be less than 64
+     * bits wide and it is attributed with the flag 'cap_user_time_short'
+     * is true.
+     */
+    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
+
+    return val;
+#else
+    uint32_t pmccntr, pmuseren, pmcntenset;
+    // Read the user mode Performance Monitoring Unit (PMU)
+    // User Enable Register (PMUSERENR) access permissions.
+    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
+    if (pmuseren & 1) {  // Allows reading PMUSERENR for user mode code.
+        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
+        if (pmcntenset & 0x80000000UL) {  // Is it counting?
+            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
+            // The counter is set up to count every 64th cycle
+            return (uint64_t) (pmccntr) << 6;
+        }
+    }
+
+    // Fallback to syscall as we can't enable PMUSERENR in user mode.
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
+#endif
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma pop_macro("ALIGN_STRUCT")
+#pragma pop_macro("FORCE_INLINE")
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
+
+#endif
\ No newline at end of file
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index d9aa05d43f..3346c22a01 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -23,6 +23,20 @@ endif()
 ###############################################################################
 # Compile flags
 
+###############################################################################
+# Define if SSE2 can be used.
+# Check for SSE2 first since some compile flags need to be set on Apple ARM.
+
+include(CheckSupportSSE2)
+
+if(NOT HAVE_SSE2)
+    message(STATUS "Disabling SSE optimizations, as the target doesn't support them")
+    set(OCIO_USE_SSE OFF)
+endif(NOT HAVE_SSE2)
+
+###############################################################################
+# Compile flags
+
 if(USE_MSVC)
 
     set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/DUSE_MSVC")

From 9e2347ff2c3fc7320c9a73eac2d057d3a1b6075e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 10 Feb 2023 14:36:26 -0500
Subject: [PATCH 71/81] Changed how we handle sse2neon to match other
 dependencies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 ext/sse2neon/CMakeLists.txt            |    9 -
 ext/sse2neon/src/include/sse2neon.h    | 9079 ------------------------
 share/cmake/modules/Findsse2neon.cmake |   58 +
 3 files changed, 58 insertions(+), 9088 deletions(-)
 delete mode 100644 ext/sse2neon/CMakeLists.txt
 delete mode 100644 ext/sse2neon/src/include/sse2neon.h
 create mode 100644 share/cmake/modules/Findsse2neon.cmake

diff --git a/ext/sse2neon/CMakeLists.txt b/ext/sse2neon/CMakeLists.txt
deleted file mode 100644
index 7fd5fcb302..0000000000
--- a/ext/sse2neon/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright Contributors to the OpenColorIO Project.
-
-# sse2neon (modified)
-# https://github.com/DLTcollab/sse2neon
-add_library(sse2neon INTERFACE IMPORTED GLOBAL)
-set_target_properties(sse2neon PROPERTIES
-    INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_SOURCE_DIR}/src/include"
-)
diff --git a/ext/sse2neon/src/include/sse2neon.h b/ext/sse2neon/src/include/sse2neon.h
deleted file mode 100644
index 164c6c3387..0000000000
--- a/ext/sse2neon/src/include/sse2neon.h
+++ /dev/null
@@ -1,9079 +0,0 @@
-#ifndef SSE2NEON_H
-#define SSE2NEON_H
-
-// This header file provides a simple API translation layer
-// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
-//
-// Contributors to this work are:
-//   John W. Ratcliff <jratcliffscarab@gmail.com>
-//   Brandon Rowlett <browlett@nvidia.com>
-//   Ken Fast <kfast@gdeb.com>
-//   Eric van Beurden <evanbeurden@nvidia.com>
-//   Alexander Potylitsin <apotylitsin@nvidia.com>
-//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
-//   Jim Huang <jserv@ccns.ncku.edu.tw>
-//   Mark Cheng <marktwtn@gmail.com>
-//   Malcolm James MacLeod <malcolm@gulden.com>
-//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
-//   Sebastian Pop <spop@amazon.com>
-//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
-//   Danila Kutenin <danilak@google.com>
-//   François Turban (JishinMaster) <francois.turban@gmail.com>
-//   Pei-Hsuan Hung <afcidk@gmail.com>
-//   Yang-Hao Yuan <yuanyanghau@gmail.com>
-//   Syoyo Fujita <syoyo@lighttransport.com>
-//   Brecht Van Lommel <brecht@blender.org>
-//   Jonathan Hue <jhue@adobe.com>
-//   Cuda Chen <clh960524@gmail.com>
-//   Aymen Qader <aymen.qader@arm.com>
-
-/*
- * sse2neon is freely redistributable under the MIT License.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/* Tunable configurations */
-
-/* Enable precise implementation of math operations
- * This would slow down the computation a bit, but gives consistent result with
- * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
- */
-/* _mm_min|max_ps|ss|pd|sd */
-#ifndef SSE2NEON_PRECISE_MINMAX
-#define SSE2NEON_PRECISE_MINMAX (0)
-#endif
-/* _mm_rcp_ps and _mm_div_ps */
-#ifndef SSE2NEON_PRECISE_DIV
-#define SSE2NEON_PRECISE_DIV (0)
-#endif
-/* _mm_sqrt_ps and _mm_rsqrt_ps */
-#ifndef SSE2NEON_PRECISE_SQRT
-#define SSE2NEON_PRECISE_SQRT (0)
-#endif
-/* _mm_dp_pd */
-#ifndef SSE2NEON_PRECISE_DP
-#define SSE2NEON_PRECISE_DP (0)
-#endif
-
-/* compiler specific definitions */
-#if defined(__GNUC__) || defined(__clang__)
-#pragma push_macro("FORCE_INLINE")
-#pragma push_macro("ALIGN_STRUCT")
-#define FORCE_INLINE static inline __attribute__((always_inline))
-#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
-#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
-#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
-#else /* non-GNU / non-clang compilers */
-#warning "Macro name collisions may happen with unsupported compiler."
-#ifndef FORCE_INLINE
-#define FORCE_INLINE static inline
-#endif
-#ifndef ALIGN_STRUCT
-#define ALIGN_STRUCT(x) __declspec(align(x))
-#endif
-#define _sse2neon_likely(x) (x)
-#define _sse2neon_unlikely(x) (x)
-#endif
-
-/* C language does not allow initializing a variable with a function call. */
-#ifdef __cplusplus
-#define _sse2neon_const static const
-#else
-#define _sse2neon_const const
-#endif
-
-#include <stdint.h>
-#include <stdlib.h>
-
-#if defined(_WIN32)
-/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
- * from both MinGW-w64 and MSVC.
- */
-#define SSE2NEON_ALLOC_DEFINED
-#endif
-
-/* If using MSVC */
-#ifdef _MSC_VER
-#include <intrin.h>
-#if (defined(_M_AMD64) || defined(__x86_64__)) || \
-    (defined(_M_ARM) || defined(__arm__))
-#define SSE2NEON_HAS_BITSCAN64
-#endif
-#endif
-
-/* Compiler barrier */
-#define SSE2NEON_BARRIER()                     \
-    do {                                       \
-        __asm__ __volatile__("" ::: "memory"); \
-        (void) 0;                              \
-    } while (0)
-
-/* Memory barriers
- * __atomic_thread_fence does not include a compiler barrier; instead,
- * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
- * semantics.
- */
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
-#include <stdatomic.h>
-#endif
-
-FORCE_INLINE void _sse2neon_smp_mb(void)
-{
-    SSE2NEON_BARRIER();
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
-    !defined(__STDC_NO_ATOMICS__)
-    atomic_thread_fence(memory_order_seq_cst);
-#elif defined(__GNUC__) || defined(__clang__)
-    __atomic_thread_fence(__ATOMIC_SEQ_CST);
-#else
-    /* FIXME: MSVC support */
-#endif
-}
-
-/* Architecture-specific build options */
-/* FIXME: #pragma GCC push_options is only available on GCC */
-#if defined(__GNUC__)
-#if defined(__arm__) && __ARM_ARCH == 7
-/* According to ARM C Language Extensions Architecture specification,
- * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
- * architecture supported.
- */
-#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
-#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
-#endif
-#if !defined(__clang__)
-#pragma GCC push_options
-#pragma GCC target("fpu=neon")
-#endif
-#elif defined(__aarch64__)
-#if !defined(__clang__)
-#pragma GCC push_options
-#pragma GCC target("+simd")
-#endif
-#elif __ARM_ARCH == 8
-#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
-#error \
-    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
-#endif
-#if !defined(__clang__)
-#pragma GCC push_options
-#endif
-#else
-#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
-#endif
-#endif
-
-#include <arm_neon.h>
-#if !defined(__aarch64__) && (__ARM_ARCH == 8)
-#if defined __has_include && __has_include(<arm_acle.h>)
-#include <arm_acle.h>
-#endif
-#endif
-
-/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD
- * and other Arm microarchtectures use.
- * From sysctl -a on Apple M1:
- * hw.cachelinesize: 128
- */
-#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
-#define SSE2NEON_CACHELINE_SIZE 128
-#else
-#define SSE2NEON_CACHELINE_SIZE 64
-#endif
-
-/* Rounding functions require either Aarch64 instructions or libm failback */
-#if !defined(__aarch64__)
-#include <math.h>
-#endif
-
-/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
- * or even not accessible in user mode.
- * To write or access to these registers in user mode,
- * we have to perform syscall instead.
- */
-#if !defined(__aarch64__)
-#include <sys/time.h>
-#endif
-
-/* "__has_builtin" can be used to query support for built-in functions
- * provided by gcc/clang and other compilers that support it.
- */
-#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
-/* Compatibility with gcc <= 9 */
-#if defined(__GNUC__) && (__GNUC__ <= 9)
-#define __has_builtin(x) HAS##x
-#define HAS__builtin_popcount 1
-#define HAS__builtin_popcountll 1
-
-// __builtin_shuffle introduced in GCC 4.7.0
-#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
-#define HAS__builtin_shuffle 1
-#else
-#define HAS__builtin_shuffle 0
-#endif
-
-#define HAS__builtin_shufflevector 0
-#define HAS__builtin_nontemporal_store 0
-#else
-#define __has_builtin(x) 0
-#endif
-#endif
-
-/**
- * MACRO for shuffle parameter for _mm_shuffle_ps().
- * Argument fp3 is a digit[0123] that represents the fp from argument "b"
- * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
- * for fp2 in result. fp1 is a digit[0123] that represents the fp from
- * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
- * fp0 is the same for fp0 of result.
- */
-#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
-    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
-
-#if __has_builtin(__builtin_shufflevector)
-#define _sse2neon_shuffle(type, a, b, ...) \
-    __builtin_shufflevector(a, b, __VA_ARGS__)
-#elif __has_builtin(__builtin_shuffle)
-#define _sse2neon_shuffle(type, a, b, ...) \
-    __extension__({                        \
-        type tmp = {__VA_ARGS__};          \
-        __builtin_shuffle(a, b, tmp);      \
-    })
-#endif
-
-#ifdef _sse2neon_shuffle
-#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
-#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
-#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
-#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
-#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
-#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
-#endif
-
-/* Rounding mode macros. */
-#define _MM_FROUND_TO_NEAREST_INT 0x00
-#define _MM_FROUND_TO_NEG_INF 0x01
-#define _MM_FROUND_TO_POS_INF 0x02
-#define _MM_FROUND_TO_ZERO 0x03
-#define _MM_FROUND_CUR_DIRECTION 0x04
-#define _MM_FROUND_NO_EXC 0x08
-#define _MM_FROUND_RAISE_EXC 0x00
-#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
-#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
-#define _MM_ROUND_NEAREST 0x0000
-#define _MM_ROUND_DOWN 0x2000
-#define _MM_ROUND_UP 0x4000
-#define _MM_ROUND_TOWARD_ZERO 0x6000
-/* Flush zero mode macros. */
-#define _MM_FLUSH_ZERO_MASK 0x8000
-#define _MM_FLUSH_ZERO_ON 0x8000
-#define _MM_FLUSH_ZERO_OFF 0x0000
-/* Denormals are zeros mode macros. */
-#define _MM_DENORMALS_ZERO_MASK 0x0040
-#define _MM_DENORMALS_ZERO_ON 0x0040
-#define _MM_DENORMALS_ZERO_OFF 0x0000
-
-/* indicate immediate constant argument in a given range */
-#define __constrange(a, b) const
-
-/* A few intrinsics accept traditional data types like ints or floats, but
- * most operate on data types that are specific to SSE.
- * If a vector type ends in d, it contains doubles, and if it does not have
- * a suffix, it contains floats. An integer vector type can contain any type
- * of integer, from chars to shorts to unsigned long longs.
- */
-typedef int64x1_t __m64;
-typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
-// On ARM 32-bit architecture, the float64x2_t is not supported.
-// The data type __m128d should be represented in a different way for related
-// intrinsic conversion.
-#if defined(__aarch64__)
-typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
-#else
-typedef float32x4_t __m128d;
-#endif
-typedef int64x2_t __m128i; /* 128-bit vector containing integers */
-
-// __int64 is defined in the Intrinsics Guide which maps to different datatype
-// in different data model
-#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
-#if (defined(__x86_64__) || defined(__i386__))
-#define __int64 long long
-#else
-#define __int64 int64_t
-#endif
-#endif
-
-/* type-safe casting between types */
-
-#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
-#define vreinterpretq_m128_f32(x) (x)
-#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
-
-#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
-#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
-#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
-#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
-
-#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
-#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
-#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
-
-#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
-#define vreinterpretq_f32_m128(x) (x)
-#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
-
-#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
-#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
-#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
-#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
-
-#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
-#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
-#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
-#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
-
-#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
-#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
-#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
-#define vreinterpretq_m128i_s64(x) (x)
-
-#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
-#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
-#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
-#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
-
-#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
-#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
-
-#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
-#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
-#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
-#define vreinterpretq_s64_m128i(x) (x)
-
-#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
-#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
-#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
-#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
-
-#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
-#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
-#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
-#define vreinterpret_m64_s64(x) (x)
-
-#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
-#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
-#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
-#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
-
-#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
-#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
-#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
-
-#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
-#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
-#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
-#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
-
-#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
-#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
-#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
-#define vreinterpret_s64_m64(x) (x)
-
-#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
-
-#if defined(__aarch64__)
-#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
-#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
-
-#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
-
-#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
-#define vreinterpretq_m128d_f64(x) (x)
-
-#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
-
-#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
-#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
-
-#define vreinterpretq_f64_m128d(x) (x)
-#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
-#else
-#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
-
-#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
-#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
-
-#define vreinterpretq_m128d_f32(x) (x)
-
-#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
-
-#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
-#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
-
-#define vreinterpretq_f32_m128d(x) (x)
-#endif
-
-// A struct is defined in this header file called 'SIMDVec' which can be used
-// by applications which attempt to access the contents of an __m128 struct
-// directly.  It is important to note that accessing the __m128 struct directly
-// is bad coding practice by Microsoft: @see:
-// https://learn.microsoft.com/en-us/cpp/cpp/m128
-//
-// However, some legacy source code may try to access the contents of an __m128
-// struct directly so the developer can use the SIMDVec as an alias for it.  Any
-// casting must be done manually by the developer, as you cannot cast or
-// otherwise alias the base NEON data type for intrinsic operations.
-//
-// union intended to allow direct access to an __m128 variable using the names
-// that the MSVC compiler provides.  This union should really only be used when
-// trying to access the members of the vector as integer values.  GCC/clang
-// allow native access to the float members through a simple array access
-// operator (in C since 4.6, in C++ since 4.8).
-//
-// Ideally direct accesses to SIMD vectors should not be used since it can cause
-// a performance hit.  If it really is needed however, the original __m128
-// variable can be aliased with a pointer to this union and used to access
-// individual components.  The use of this union should be hidden behind a macro
-// that is used throughout the codebase to access the members instead of always
-// declaring this type of variable.
-typedef union ALIGN_STRUCT(16) SIMDVec {
-    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
-    int8_t m128_i8[16];    // as signed 8-bit integers.
-    int16_t m128_i16[8];   // as signed 16-bit integers.
-    int32_t m128_i32[4];   // as signed 32-bit integers.
-    int64_t m128_i64[2];   // as signed 64-bit integers.
-    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
-    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
-    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
-    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
-} SIMDVec;
-
-// casting using SIMDVec
-#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
-#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
-#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
-
-/* SSE macros */
-#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
-#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
-#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
-#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
-
-// Function declaration
-// SSE
-FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
-FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
-FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
-FORCE_INLINE __m128 _mm_set_ps1(float);
-FORCE_INLINE __m128 _mm_setzero_ps(void);
-// SSE2
-FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
-FORCE_INLINE __m128i _mm_castps_si128(__m128);
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
-FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
-FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
-FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
-FORCE_INLINE __m128d _mm_set_pd(double, double);
-FORCE_INLINE __m128i _mm_set1_epi32(int);
-FORCE_INLINE __m128i _mm_setzero_si128();
-// SSE4.1
-FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
-FORCE_INLINE __m128 _mm_ceil_ps(__m128);
-FORCE_INLINE __m128d _mm_floor_pd(__m128d);
-FORCE_INLINE __m128 _mm_floor_ps(__m128);
-FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
-FORCE_INLINE __m128 _mm_round_ps(__m128, int);
-// SSE4.2
-FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
-
-/* Backwards compatibility for compilers with lack of specific type support */
-
-// Older gcc does not define vld1q_u8_x4 type
-#if defined(__GNUC__) && !defined(__clang__) &&                        \
-    ((__GNUC__ <= 12 && defined(__arm__)) ||                           \
-     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
-     (__GNUC__ <= 9 && defined(__aarch64__)))
-FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
-{
-    uint8x16x4_t ret;
-    ret.val[0] = vld1q_u8(p + 0);
-    ret.val[1] = vld1q_u8(p + 16);
-    ret.val[2] = vld1q_u8(p + 32);
-    ret.val[3] = vld1q_u8(p + 48);
-    return ret;
-}
-#else
-// Wraps vld1q_u8_x4
-FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
-{
-    return vld1q_u8_x4(p);
-}
-#endif
-
-#if !defined(__aarch64__)
-/* emulate vaddv u8 variant */
-FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
-{
-    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
-    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
-}
-#else
-// Wraps vaddv_u8
-FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
-{
-    return vaddv_u8(v8);
-}
-#endif
-
-#if !defined(__aarch64__)
-/* emulate vaddvq u8 variant */
-FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
-{
-    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
-    uint8_t res = 0;
-    for (int i = 0; i < 8; ++i)
-        res += tmp[i];
-    return res;
-}
-#else
-// Wraps vaddvq_u8
-FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
-{
-    return vaddvq_u8(a);
-}
-#endif
-
-#if !defined(__aarch64__)
-/* emulate vaddvq u16 variant */
-FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
-{
-    uint32x4_t m = vpaddlq_u16(a);
-    uint64x2_t n = vpaddlq_u32(m);
-    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
-
-    return vget_lane_u32((uint32x2_t) o, 0);
-}
-#else
-// Wraps vaddvq_u16
-FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
-{
-    return vaddvq_u16(a);
-}
-#endif
-
-/* Function Naming Conventions
- * The naming convention of SSE intrinsics is straightforward. A generic SSE
- * intrinsic function is given as follows:
- *   _mm_<name>_<data_type>
- *
- * The parts of this format are given as follows:
- * 1. <name> describes the operation performed by the intrinsic
- * 2. <data_type> identifies the data type of the function's primary arguments
- *
- * This last part, <data_type>, is a little complicated. It identifies the
- * content of the input values, and can be set to any of the following values:
- * + ps - vectors contain floats (ps stands for packed single-precision)
- * + pd - vectors cantain doubles (pd stands for packed double-precision)
- * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- *                            signed integers
- * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- *                            unsigned integers
- * + si128 - unspecified 128-bit vector or 256-bit vector
- * + m128/m128i/m128d - identifies input vector types when they are different
- *                      than the type of the returned vector
- *
- * For example, _mm_setzero_ps. The _mm implies that the function returns
- * a 128-bit vector. The _ps at the end implies that the argument vectors
- * contain floats.
- *
- * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
- *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
- *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
- *   // Set packed 8-bit integers
- *   // 128 bits, 16 chars, per 8 bits
- *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
- *                                  4, 5, 12, 13, 6, 7, 14, 15);
- *   // Shuffle packed 8-bit integers
- *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
- */
-
-/* Constants for use with _mm_prefetch. */
-enum _mm_hint {
-    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
-    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
-    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
-    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
-};
-
-// The bit field mapping to the FPCR(floating-point control register)
-typedef struct {
-    uint16_t res0;
-    uint8_t res1 : 6;
-    uint8_t bit22 : 1;
-    uint8_t bit23 : 1;
-    uint8_t bit24 : 1;
-    uint8_t res2 : 7;
-#if defined(__aarch64__)
-    uint32_t res3;
-#endif
-} fpcr_bitfield;
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of b and places it into the high end of the result.
-FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
-{
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in high
-// end of result takes the higher two 32 bit values from b and swaps them and
-// places in low end of result.
-FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
-{
-    float32x2_t a21 = vget_high_f32(
-        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b03 = vget_low_f32(
-        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
-{
-    float32x2_t a03 = vget_low_f32(
-        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-    float32x2_t b21 = vget_high_f32(
-        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
-}
-
-// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
-// high
-FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
-{
-    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
-{
-    float32x2_t a22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
-{
-    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t b22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
-    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
-{
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32x2_t a22 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
-{
-    float32x2_t a33 =
-        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
-    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
-{
-    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
-{
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32_t b2 = vgetq_lane_f32(b, 2);
-    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
-}
-
-// Kahan summation for accurate summation of floating-point numbers.
-// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
-FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
-{
-    y -= *c;
-    float t = *sum + y;
-    *c = (t - *sum) - y;
-    *sum = t;
-}
-
-#if defined(__ARM_FEATURE_CRYPTO) && \
-    (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
-// Wraps vmull_p64
-FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
-    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
-    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
-    return vreinterpretq_u64_p128(vmull_p64(a, b));
-}
-#else  // ARMv7 polyfill
-// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
-//
-// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
-// 64-bit->128-bit polynomial multiply.
-//
-// It needs some work and is somewhat slow, but it is still faster than all
-// known scalar methods.
-//
-// Algorithm adapted to C from
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
-// from "Fast Software Polynomial Multiplication on ARM Processors Using the
-// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
-// (https://hal.inria.fr/hal-01506572)
-static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
-    poly8x8_t a = vreinterpret_p8_u64(_a);
-    poly8x8_t b = vreinterpret_p8_u64(_b);
-
-    // Masks
-    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
-                                    vcreate_u8(0x00000000ffffffff));
-    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
-                                    vcreate_u8(0x0000000000000000));
-
-    // Do the multiplies, rotating with vext to get all combinations
-    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
-    uint8x16_t e =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
-    uint8x16_t f =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
-    uint8x16_t g =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
-    uint8x16_t h =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
-    uint8x16_t i =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
-    uint8x16_t j =
-        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
-    uint8x16_t k =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
-
-    // Add cross products
-    uint8x16_t l = veorq_u8(e, f);  // L = E + F
-    uint8x16_t m = veorq_u8(g, h);  // M = G + H
-    uint8x16_t n = veorq_u8(i, j);  // N = I + J
-
-    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
-    // instructions.
-#if defined(__aarch64__)
-    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
-        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
-        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
-        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
-        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-#else
-    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
-    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
-    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
-    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
-#endif
-    // t0 = (L) (P0 + P1) << 8
-    // t1 = (M) (P2 + P3) << 16
-    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
-    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
-    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
-
-    // t2 = (N) (P4 + P5) << 24
-    // t3 = (K) (P6 + P7) << 32
-    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
-    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
-    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
-
-    // De-interleave
-#if defined(__aarch64__)
-    uint8x16_t t0 = vreinterpretq_u8_u64(
-        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-    uint8x16_t t1 = vreinterpretq_u8_u64(
-        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-    uint8x16_t t2 = vreinterpretq_u8_u64(
-        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-    uint8x16_t t3 = vreinterpretq_u8_u64(
-        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-#else
-    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
-    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
-    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
-    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
-#endif
-    // Shift the cross products
-    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
-    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
-    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
-    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
-
-    // Accumulate the products
-    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
-    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
-    uint8x16_t mix = veorq_u8(d, cross1);
-    uint8x16_t r = veorq_u8(mix, cross2);
-    return vreinterpretq_u64_u8(r);
-}
-#endif  // ARMv7 polyfill
-
-// C equivalent:
-//   __m128i _mm_shuffle_epi32_default(__m128i a,
-//                                     __constrange(0, 255) int imm) {
-//       __m128i ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
-//       return ret;
-//   }
-#define _mm_shuffle_epi32_default(a, imm)                                   \
-    __extension__({                                                         \
-        int32x4_t ret;                                                      \
-        ret = vmovq_n_s32(                                                  \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
-            ret, 1);                                                        \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
-            ret, 2);                                                        \
-        ret = vsetq_lane_s32(                                               \
-            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
-            ret, 3);                                                        \
-        vreinterpretq_m128i_s32(ret);                                       \
-    })
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of a and places it into the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
-{
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in low end
-// of result takes the higher two 32 bit values from a and swaps them and places
-// in high end of result.
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
-}
-
-// rotates the least significant 32 bits into the most significant 32 bits, and
-// shifts the rest down
-FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
-{
-    return vreinterpretq_m128i_s32(
-        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
-}
-
-// rotates the most significant 32 bits into the least significant 32 bits, and
-// shifts the rest up
-FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
-{
-    return vreinterpretq_m128i_s32(
-        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
-}
-
-// gets the lower 64 bits of a, and places it in the upper 64 bits
-// gets the lower 64 bits of a and places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
-{
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
-// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
-// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
-// places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
-{
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
-{
-    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
-{
-    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
-{
-    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
-    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
-}
-
-// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
-// int imm)
-#if defined(__aarch64__)
-#define _mm_shuffle_epi32_splat(a, imm)                          \
-    __extension__({                                              \
-        vreinterpretq_m128i_s32(                                 \
-            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
-    })
-#else
-#define _mm_shuffle_epi32_splat(a, imm)                                      \
-    __extension__({                                                          \
-        vreinterpretq_m128i_s32(                                             \
-            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
-    })
-#endif
-
-// NEON does not support a general purpose permute intrinsic.
-// Shuffle single-precision (32-bit) floating-point elements in a using the
-// control in imm8, and store the results in dst.
-//
-// C equivalent:
-//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
-//                                 __constrange(0, 255) int imm) {
-//       __m128 ret;
-//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
-//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
-//       return ret;
-//   }
-//
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
-#define _mm_shuffle_ps_default(a, b, imm)                                  \
-    __extension__({                                                        \
-        float32x4_t ret;                                                   \
-        ret = vmovq_n_f32(                                                 \
-            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
-            ret, 1);                                                       \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
-            ret, 2);                                                       \
-        ret = vsetq_lane_f32(                                              \
-            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
-            ret, 3);                                                       \
-        vreinterpretq_m128_f32(ret);                                       \
-    })
-
-// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
-// Store the results in the low 64 bits of dst, with the high 64 bits being
-// copied from from a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
-#define _mm_shufflelo_epi16_function(a, imm)                                  \
-    __extension__({                                                           \
-        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
-        int16x4_t lowBits = vget_low_s16(ret);                                \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
-                             1);                                              \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
-                             2);                                              \
-        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
-                             3);                                              \
-        vreinterpretq_m128i_s16(ret);                                         \
-    })
-
-// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
-// Store the results in the high 64 bits of dst, with the low 64 bits being
-// copied from from a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
-#define _mm_shufflehi_epi16_function(a, imm)                                   \
-    __extension__({                                                            \
-        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
-        int16x4_t highBits = vget_high_s16(ret);                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
-                             5);                                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
-                             6);                                               \
-        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
-                             7);                                               \
-        vreinterpretq_m128i_s16(ret);                                          \
-    })
-
-/* MMX */
-
-//_mm_empty is a no-op on arm
-FORCE_INLINE void _mm_empty(void) {}
-
-/* SSE */
-
-// Add packed single-precision (32-bit) floating-point elements in a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
-FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Add the lower single-precision (32-bit) floating-point element in a and b,
-// store the result in the lower element of dst, and copy the upper 3 packed
-// elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
-FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
-{
-    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
-    // the upper values in the result must be the remnants of <a>.
-    return vreinterpretq_m128_f32(vaddq_f32(a, value));
-}
-
-// Compute the bitwise AND of packed single-precision (32-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
-FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
-// elements in a and then AND with b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
-FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        vbicq_s32(vreinterpretq_s32_m128(b),
-                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
-}
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
-FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u16(
-        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
-FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u8(
-        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for equality, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
-FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for equality, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
-FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
-FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for greater-than-or-equal, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
-FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
-FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for greater-than, store the result in the lower element of dst, and copy
-// the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
-FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
-FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for less-than-or-equal, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
-FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmple_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
-FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for less-than, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
-FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
-FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-equal, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
-FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
-FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-greater-than-or-equal, store the result in the lower element of
-// dst, and copy the upper 3 packed elements from a to the upper elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
-FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
-FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-greater-than, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
-FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
-FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-less-than-or-equal, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
-FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// for not-less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
-FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_u32(vmvnq_u32(
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b for not-less-than, store the result in the lower element of dst, and copy
-// the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
-FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// to see if neither is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
-//
-// See also:
-// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
-// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
-FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
-{
-    // Note: NEON does not have ordered compare builtin
-    // Need to compare a eq a and b eq b to check for NaN
-    // Do AND of results to get final
-    uint32x4_t ceqaa =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t ceqbb =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b to see if neither is NaN, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
-FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b
-// to see if either is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
-FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
-{
-    uint32x4_t f32a =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-    uint32x4_t f32b =
-        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b to see if either is NaN, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
-FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for equality, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
-FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_eq_b =
-        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_eq_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for greater-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
-FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_ge_b =
-        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_ge_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for greater-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
-FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_gt_b =
-        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_gt_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for less-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
-FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_le_b =
-        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_le_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for less-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
-FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
-{
-    uint32x4_t a_lt_b =
-        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-    return vgetq_lane_u32(a_lt_b, 0) & 0x1;
-}
-
-// Compare the lower single-precision (32-bit) floating-point element in a and b
-// for not-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
-FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
-{
-    return !_mm_comieq_ss(a, b);
-}
-
-// Convert packed signed 32-bit integers in b to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, and copy the upper 2 packed elements from a to the upper elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
-FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
-                     vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
-FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vreinterpret_m64_s32(
-        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
-#else
-    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
-        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
-#endif
-}
-
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
-FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
-{
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
-FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
-                          0);
-#else
-    float32_t data = vgetq_lane_f32(
-        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
-    return (int32_t) data;
-#endif
-}
-
-// Convert packed 16-bit integers in a to packed single-precision (32-bit)
-// floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
-FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(
-        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
-}
-
-// Convert packed 32-bit integers in b to packed single-precision (32-bit)
-// floating-point elements, store the results in the lower 2 elements of dst,
-// and copy the upper 2 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
-FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
-                     vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert packed signed 32-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, then convert the packed signed 32-bit integers in b to
-// single-precision (32-bit) floating-point element, and store the results in
-// the upper 2 elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
-FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(
-        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
-}
-
-// Convert the lower packed 8-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
-FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(
-        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 16-bit integers, and store the results in dst. Note: this intrinsic
-// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
-// 0x7FFFFFFF.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
-FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
-{
-    return vreinterpret_m64_s16(
-        vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
-#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 8-bit integers, and store the results in lower 4 elements of dst.
-// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
-// between 0x7F and 0x7FFFFFFF.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
-FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
-{
-    return vreinterpret_m64_s8(vqmovn_s16(
-        vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
-}
-
-// Convert packed unsigned 16-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
-FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(
-        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
-}
-
-// Convert the lower packed unsigned 8-bit integers in a to packed
-// single-precision (32-bit) floating-point elements, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
-FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_u32(
-        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
-}
-
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
-#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
-
-// Convert the signed 64-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
-FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
-{
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
-}
-
-// Copy the lower single-precision (32-bit) floating-point element of a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
-FORCE_INLINE float _mm_cvtss_f32(__m128 a)
-{
-    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
-#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
-FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
-#else
-    float32_t data = vgetq_lane_f32(
-        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
-    return (int64_t) data;
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
-FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
-{
-    return vreinterpret_m64_s32(
-        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
-FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
-{
-    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
-#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
-#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
-FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
-{
-    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-}
-
-// Divide packed single-precision (32-bit) floating-point elements in a by
-// packed elements in b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
-FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
-    return vreinterpretq_m128_f32(
-        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
-#if SSE2NEON_PRECISE_DIV
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
-#endif
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
-#endif
-}
-
-// Divide the lower single-precision (32-bit) floating-point element in a by the
-// lower single-precision (32-bit) floating-point element in b, store the result
-// in the lower element of dst, and copy the upper 3 packed elements from a to
-// the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
-FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-{
-    float32_t value =
-        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Extract a 16-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
-#define _mm_extract_pi16(a, imm) \
-    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
-
-// Free aligned memory that was allocated with _mm_malloc.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
-#if !defined(SSE2NEON_ALLOC_DEFINED)
-FORCE_INLINE void _mm_free(void *addr)
-{
-    free(addr);
-}
-#endif
-
-// Macro: Get the flush zero bits from the MXCSR control and status register.
-// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
-// _MM_FLUSH_ZERO_OFF
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
-FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
-}
-
-// Macro: Get the rounding mode bits from the MXCSR control and status register.
-// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
-// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
-FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    if (r.field.bit22) {
-        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
-    } else {
-        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
-    }
-}
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
-#define _mm_insert_pi16(a, b, imm)                               \
-    __extension__({                                              \
-        vreinterpret_m64_s16(                                    \
-            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
-    })
-
-// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
-FORCE_INLINE __m128 _mm_load_ps(const float *p)
-{
-    return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Load a single-precision (32-bit) floating-point element from memory into all
-// elements of dst.
-//
-//   dst[31:0] := MEM[mem_addr+31:mem_addr]
-//   dst[63:32] := MEM[mem_addr+31:mem_addr]
-//   dst[95:64] := MEM[mem_addr+31:mem_addr]
-//   dst[127:96] := MEM[mem_addr+31:mem_addr]
-//
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
-#define _mm_load_ps1 _mm_load1_ps
-
-// Load a single-precision (32-bit) floating-point element from memory into the
-// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
-// aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
-FORCE_INLINE __m128 _mm_load_ss(const float *p)
-{
-    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
-}
-
-// Load a single-precision (32-bit) floating-point element from memory into all
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
-FORCE_INLINE __m128 _mm_load1_ps(const float *p)
-{
-    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
-}
-
-// Load 2 single-precision (32-bit) floating-point elements from memory into the
-// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
-// mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
-FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
-}
-
-// Load 2 single-precision (32-bit) floating-point elements from memory into the
-// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
-// mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
-FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
-{
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
-}
-
-// Load 4 single-precision (32-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
-FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
-{
-    float32x4_t v = vrev64q_f32(vld1q_f32(p));
-    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
-}
-
-// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from memory into dst. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
-FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
-{
-    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
-    // equivalent for neon
-    return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Load unaligned 16-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
-FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
-{
-    return vreinterpretq_m128i_s16(
-        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
-}
-
-// Load unaligned 64-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
-FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
-{
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
-}
-
-// Allocate size bytes of memory, aligned to the alignment specified in align,
-// and return a pointer to the allocated memory. _mm_free should be used to free
-// memory that is allocated with _mm_malloc.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
-#if !defined(SSE2NEON_ALLOC_DEFINED)
-FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
-{
-    void *ptr;
-    if (align == 1)
-        return malloc(size);
-    if (align == 2 || (sizeof(void *) == 8 && align == 4))
-        align = sizeof(void *);
-    if (!posix_memalign(&ptr, align, size))
-        return ptr;
-    return NULL;
-}
-#endif
-
-// Conditionally store 8-bit integer elements from a into memory using mask
-// (elements are not stored when the highest bit is not set in the corresponding
-// element) and a non-temporal memory hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
-FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
-{
-    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
-    __m128 b = _mm_load_ps((const float *) mem_addr);
-    int8x8_t masked =
-        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
-                vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
-    vst1_s8((int8_t *) mem_addr, masked);
-}
-
-// Conditionally store 8-bit integer elements from a into memory using mask
-// (elements are not stored when the highest bit is not set in the corresponding
-// element) and a non-temporal memory hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
-#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
-FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s16(
-        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b,
-// and store packed maximum values in dst. dst does not follow the IEEE Standard
-// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
-// signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
-FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
-{
-#if SSE2NEON_PRECISE_MINMAX
-    float32x4_t _a = vreinterpretq_f32_m128(a);
-    float32x4_t _b = vreinterpretq_f32_m128(b);
-    return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128_f32(
-        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
-FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u8(
-        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b, store the maximum value in the lower element of dst, and copy the upper 3
-// packed elements from a to the upper element of dst. dst does not follow the
-// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
-// inputs are NaN or signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
-FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
-FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s16(
-        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Compare packed single-precision (32-bit) floating-point elements in a and b,
-// and store packed minimum values in dst. dst does not follow the IEEE Standard
-// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
-// signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
-FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
-{
-#if SSE2NEON_PRECISE_MINMAX
-    float32x4_t _a = vreinterpretq_f32_m128(a);
-    float32x4_t _b = vreinterpretq_f32_m128(b);
-    return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128_f32(
-        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
-FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u8(
-        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare the lower single-precision (32-bit) floating-point elements in a and
-// b, store the minimum value in the lower element of dst, and copy the upper 3
-// packed elements from a to the upper element of dst. dst does not follow the
-// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
-// inputs are NaN or signed-zero values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
-FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
-{
-    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Move the lower single-precision (32-bit) floating-point element from b to the
-// lower element of dst, and copy the upper 3 packed elements from a to the
-// upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
-FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
-                       vreinterpretq_f32_m128(a), 0));
-}
-
-// Move the upper 2 single-precision (32-bit) floating-point elements from b to
-// the lower 2 elements of dst, and copy the upper 2 elements from a to the
-// upper 2 elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
-FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
-{
-#if defined(aarch64__)
-    return vreinterpretq_m128_u64(
-        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
-#else
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
-#endif
-}
-
-// Move the lower 2 single-precision (32-bit) floating-point elements from b to
-// the upper 2 elements of dst, and copy the lower 2 elements from a to the
-// lower 2 elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
-FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
-{
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
-    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-// Create mask from the most significant bit of each 8-bit element in a, and
-// store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
-FORCE_INLINE int _mm_movemask_pi8(__m64 a)
-{
-    uint8x8_t input = vreinterpret_u8_m64(a);
-#if defined(__aarch64__)
-    static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
-    uint8x8_t tmp = vshr_n_u8(input, 7);
-    return vaddv_u8(vshl_u8(tmp, shift));
-#else
-    // Refer the implementation of `_mm_movemask_epi8`
-    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
-    uint32x2_t paired16 =
-        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
-    uint8x8_t paired32 =
-        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
-    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
-#endif
-}
-
-// Set each bit of mask dst based on the most significant bit of the
-// corresponding packed single-precision (32-bit) floating-point element in a.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
-FORCE_INLINE int _mm_movemask_ps(__m128 a)
-{
-    uint32x4_t input = vreinterpretq_u32_m128(a);
-#if defined(__aarch64__)
-    static const int32x4_t shift = {0, 1, 2, 3};
-    uint32x4_t tmp = vshrq_n_u32(input, 31);
-    return vaddvq_u32(vshlq_u32(tmp, shift));
-#else
-    // Uses the exact same method as _mm_movemask_epi8, see that for details.
-    // Shift out everything but the sign bits with a 32-bit unsigned shift
-    // right.
-    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
-    // Merge the two pairs together with a 64-bit unsigned shift right + add.
-    uint8x16_t paired =
-        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
-    // Extract the result.
-    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
-#endif
-}
-
-// Multiply packed single-precision (32-bit) floating-point elements in a and b,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Multiply the lower single-precision (32-bit) floating-point element in a and
-// b, store the result in the lower element of dst, and copy the upper 3 packed
-// elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
-FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_mul_ps(a, b));
-}
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
-FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u16(vshrn_n_u32(
-        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
-}
-
-// Compute the bitwise OR of packed single-precision (32-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
-FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
-#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
-#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
-
-// Extract a 16-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
-#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
-#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
-#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
-#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
-#define _m_pminsw(a, b) _mm_min_pi16(a, b)
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
-#define _m_pminub(a, b) _mm_min_pu8(a, b)
-
-// Create mask from the most significant bit of each 8-bit element in a, and
-// store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
-#define _m_pmovmskb(a) _mm_movemask_pi8(a)
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
-#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
-
-// Fetch the line of data from memory that contains address p to a location in
-// the cache heirarchy specified by the locality hint i.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
-FORCE_INLINE void _mm_prefetch(char const *p, int i)
-{
-    switch (i) {
-    case _MM_HINT_NTA:
-        __builtin_prefetch(p, 0, 0);
-        break;
-    case _MM_HINT_T0:
-        __builtin_prefetch(p, 0, 3);
-        break;
-    case _MM_HINT_T1:
-        __builtin_prefetch(p, 0, 2);
-        break;
-    case _MM_HINT_T2:
-        __builtin_prefetch(p, 0, 1);
-        break;
-    }
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
-#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
-
-// Shuffle 16-bit integers in a using the control in imm8, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
-#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
-
-// Compute the approximate reciprocal of packed single-precision (32-bit)
-// floating-point elements in a, and store the results in dst. The maximum
-// relative error for this approximation is less than 1.5*2^-12.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
-FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
-{
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-#if SSE2NEON_PRECISE_DIV
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-#endif
-    return vreinterpretq_m128_f32(recip);
-}
-
-// Compute the approximate reciprocal of the lower single-precision (32-bit)
-// floating-point element in a, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst. The
-// maximum relative error for this approximation is less than 1.5*2^-12.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
-FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
-{
-    return _mm_move_ss(a, _mm_rcp_ps(a));
-}
-
-// Compute the approximate reciprocal square root of packed single-precision
-// (32-bit) floating-point elements in a, and store the results in dst. The
-// maximum relative error for this approximation is less than 1.5*2^-12.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
-FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
-{
-    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-#if SSE2NEON_PRECISE_SQRT
-    // Additional Netwon-Raphson iteration for accuracy
-    out = vmulq_f32(
-        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
-    out = vmulq_f32(
-        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
-#endif
-    return vreinterpretq_m128_f32(out);
-}
-
-// Compute the approximate reciprocal square root of the lower single-precision
-// (32-bit) floating-point element in a, store the result in the lower element
-// of dst, and copy the upper 3 packed elements from a to the upper elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
-FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
-{
-    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
-FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
-{
-    uint64x1_t t = vpaddl_u32(vpaddl_u16(
-        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
-    return vreinterpret_m64_u16(
-        vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
-}
-
-// Macro: Set the flush zero bits of the MXCSR control and status register to
-// the value in unsigned 32-bit integer a. The flush zero may contain any of the
-// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
-FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
-{
-    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
-    // regardless of the value of the FZ bit.
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
-}
-
-// Set packed single-precision (32-bit) floating-point elements in dst with the
-// supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
-FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
-{
-    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Broadcast single-precision (32-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
-FORCE_INLINE __m128 _mm_set_ps1(float _w)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Macro: Set the rounding mode bits of the MXCSR control and status register to
-// the value in unsigned 32-bit integer a. The rounding mode may contain any of
-// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
-// _MM_ROUND_TOWARD_ZERO
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
-FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    switch (rounding) {
-    case _MM_ROUND_TOWARD_ZERO:
-        r.field.bit22 = 1;
-        r.field.bit23 = 1;
-        break;
-    case _MM_ROUND_DOWN:
-        r.field.bit22 = 0;
-        r.field.bit23 = 1;
-        break;
-    case _MM_ROUND_UP:
-        r.field.bit22 = 1;
-        r.field.bit23 = 0;
-        break;
-    default:  //_MM_ROUND_NEAREST
-        r.field.bit22 = 0;
-        r.field.bit23 = 0;
-    }
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
-}
-
-// Copy single-precision (32-bit) floating-point element a to the lower element
-// of dst, and zero the upper 3 elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
-FORCE_INLINE __m128 _mm_set_ss(float a)
-{
-    return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
-}
-
-// Broadcast single-precision (32-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
-FORCE_INLINE __m128 _mm_set1_ps(float _w)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Set the MXCSR control and status register with the value in unsigned 32-bit
-// integer a.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
-// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
-FORCE_INLINE void _mm_setcsr(unsigned int a)
-{
-    _MM_SET_ROUNDING_MODE(a);
-}
-
-// Get the unsigned 32-bit value of the MXCSR control and status register.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
-// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
-FORCE_INLINE unsigned int _mm_getcsr()
-{
-    return _MM_GET_ROUNDING_MODE();
-}
-
-// Set packed single-precision (32-bit) floating-point elements in dst with the
-// supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
-FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
-{
-    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Return vector of type __m128 with all elements set to zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
-FORCE_INLINE __m128 _mm_setzero_ps(void)
-{
-    return vreinterpretq_m128_f32(vdupq_n_f32(0));
-}
-
-// Shuffle 16-bit integers in a using the control in imm8, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_pi16(a, imm)                                           \
-    __extension__({                                                        \
-        vreinterpret_m64_s16(vshuffle_s16(                                 \
-            vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
-            ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)));  \
-    })
-#else
-#define _mm_shuffle_pi16(a, imm)                                               \
-    __extension__({                                                            \
-        int16x4_t ret;                                                         \
-        ret =                                                                  \
-            vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
-        ret = vset_lane_s16(                                                   \
-            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret,   \
-            1);                                                                \
-        ret = vset_lane_s16(                                                   \
-            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret,   \
-            2);                                                                \
-        ret = vset_lane_s16(                                                   \
-            vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret,   \
-            3);                                                                \
-        vreinterpret_m64_s16(ret);                                             \
-    })
-#endif
-
-// Perform a serializing operation on all store-to-memory instructions that were
-// issued prior to this instruction. Guarantees that every store instruction
-// that precedes, in program order, is globally visible before any store
-// instruction which follows the fence in program order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
-FORCE_INLINE void _mm_sfence(void)
-{
-    _sse2neon_smp_mb();
-}
-
-// Perform a serializing operation on all load-from-memory and store-to-memory
-// instructions that were issued prior to this instruction. Guarantees that
-// every memory access that precedes, in program order, the memory fence
-// instruction is globally visible before any memory instruction which follows
-// the fence in program order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
-FORCE_INLINE void _mm_mfence(void)
-{
-    _sse2neon_smp_mb();
-}
-
-// Perform a serializing operation on all load-from-memory instructions that
-// were issued prior to this instruction. Guarantees that every load instruction
-// that precedes, in program order, is globally visible before any load
-// instruction which follows the fence in program order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
-FORCE_INLINE void _mm_lfence(void)
-{
-    _sse2neon_smp_mb();
-}
-
-// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
-// int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_ps(a, b, imm)                                              \
-    __extension__({                                                            \
-        float32x4_t _input1 = vreinterpretq_f32_m128(a);                       \
-        float32x4_t _input2 = vreinterpretq_f32_m128(b);                       \
-        float32x4_t _shuf =                                                    \
-            vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
-                          (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
-        vreinterpretq_m128_f32(_shuf);                                         \
-    })
-#else  // generic
-#define _mm_shuffle_ps(a, b, imm)                          \
-    __extension__({                                        \
-        __m128 ret;                                        \
-        switch (imm) {                                     \
-        case _MM_SHUFFLE(1, 0, 3, 2):                      \
-            ret = _mm_shuffle_ps_1032((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 3, 0, 1):                      \
-            ret = _mm_shuffle_ps_2301((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 3, 2, 1):                      \
-            ret = _mm_shuffle_ps_0321((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 1, 0, 3):                      \
-            ret = _mm_shuffle_ps_2103((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(1, 0, 1, 0):                      \
-            ret = _mm_movelh_ps((a), (b));                 \
-            break;                                         \
-        case _MM_SHUFFLE(1, 0, 0, 1):                      \
-            ret = _mm_shuffle_ps_1001((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 1, 0, 1):                      \
-            ret = _mm_shuffle_ps_0101((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(3, 2, 1, 0):                      \
-            ret = _mm_shuffle_ps_3210((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 0, 1, 1):                      \
-            ret = _mm_shuffle_ps_0011((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(0, 0, 2, 2):                      \
-            ret = _mm_shuffle_ps_0022((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 2, 0, 0):                      \
-            ret = _mm_shuffle_ps_2200((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(3, 2, 0, 2):                      \
-            ret = _mm_shuffle_ps_3202((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(3, 2, 3, 2):                      \
-            ret = _mm_movehl_ps((b), (a));                 \
-            break;                                         \
-        case _MM_SHUFFLE(1, 1, 3, 3):                      \
-            ret = _mm_shuffle_ps_1133((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 0, 1, 0):                      \
-            ret = _mm_shuffle_ps_2010((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 0, 0, 1):                      \
-            ret = _mm_shuffle_ps_2001((a), (b));           \
-            break;                                         \
-        case _MM_SHUFFLE(2, 0, 3, 2):                      \
-            ret = _mm_shuffle_ps_2032((a), (b));           \
-            break;                                         \
-        default:                                           \
-            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
-            break;                                         \
-        }                                                  \
-        ret;                                               \
-    })
-#endif
-
-// Compute the square root of packed single-precision (32-bit) floating-point
-// elements in a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
-FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
-{
-#if SSE2NEON_PRECISE_SQRT
-    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-
-    // Test for vrsqrteq_f32(0) -> positive infinity case.
-    // Change to zero, so that s * 1/sqrt(s) result is zero too.
-    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
-    const uint32x4_t div_by_zero =
-        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
-    recip = vreinterpretq_f32_u32(
-        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
-
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-        recip);
-    recip = vmulq_f32(
-        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-        recip);
-
-    // sqrt(s) = s * 1/sqrt(s)
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
-#elif defined(__aarch64__)
-    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
-#else
-    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-    float32x4_t sq = vrecpeq_f32(recipsq);
-    return vreinterpretq_m128_f32(sq);
-#endif
-}
-
-// Compute the square root of the lower single-precision (32-bit) floating-point
-// element in a, store the result in the lower element of dst, and copy the
-// upper 3 packed elements from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
-FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
-{
-    float32_t value =
-        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
-    return vreinterpretq_m128_f32(
-        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-// or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
-FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
-{
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Store the lower single-precision (32-bit) floating-point element from a into
-// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
-FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
-{
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    vst1q_f32(p, vdupq_n_f32(a0));
-}
-
-// Store the lower single-precision (32-bit) floating-point element from a into
-// memory. mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
-FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
-{
-    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
-}
-
-// Store the lower single-precision (32-bit) floating-point element from a into
-// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
-#define _mm_store1_ps _mm_store_ps1
-
-// Store the upper 2 single-precision (32-bit) floating-point elements from a
-// into memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
-FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
-{
-    *p = vreinterpret_m64_f32(vget_high_f32(a));
-}
-
-// Store the lower 2 single-precision (32-bit) floating-point elements from a
-// into memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
-FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
-{
-    *p = vreinterpret_m64_f32(vget_low_f32(a));
-}
-
-// Store 4 single-precision (32-bit) floating-point elements from a into memory
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
-FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
-{
-    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
-    float32x4_t rev = vextq_f32(tmp, tmp, 2);
-    vst1q_f32(p, rev);
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-// elements) from a into memory. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
-FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
-{
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores 16-bits of integer data a at the address p.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
-FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
-{
-    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
-}
-
-// Stores 64-bits of integer data a at the address p.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
-FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
-{
-    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
-}
-
-// Store 64-bits of integer data from a into memory using a non-temporal memory
-// hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
-FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
-{
-    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
-// point elements) from a into memory using a non-temporal memory hint.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
-FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, (float32x4_t *) p);
-#else
-    vst1q_f32(p, vreinterpretq_f32_m128(a));
-#endif
-}
-
-// Subtract packed single-precision (32-bit) floating-point elements in b from
-// packed single-precision (32-bit) floating-point elements in a, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
-FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_f32(
-        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Subtract the lower single-precision (32-bit) floating-point element in b from
-// the lower single-precision (32-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper 3 packed elements from
-// a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
-FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_sub_ps(a, b));
-}
-
-// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
-// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
-// transposed matrix in these vectors (row0 now contains column 0, etc.).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
-#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
-    do {                                                  \
-        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
-        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
-        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
-                            vget_low_f32(ROW23.val[0]));  \
-        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
-                            vget_low_f32(ROW23.val[1]));  \
-        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
-                            vget_high_f32(ROW23.val[0])); \
-        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
-                            vget_high_f32(ROW23.val[1])); \
-    } while (0)
-
-// according to the documentation, these intrinsics behave the same as the
-// non-'u' versions.  We'll just alias them here.
-#define _mm_ucomieq_ss _mm_comieq_ss
-#define _mm_ucomige_ss _mm_comige_ss
-#define _mm_ucomigt_ss _mm_comigt_ss
-#define _mm_ucomile_ss _mm_comile_ss
-#define _mm_ucomilt_ss _mm_comilt_ss
-#define _mm_ucomineq_ss _mm_comineq_ss
-
-// Return vector of type __m128i with undefined elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
-FORCE_INLINE __m128i _mm_undefined_si128(void)
-{
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-    __m128i a;
-    return a;
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-// Return vector of type __m128 with undefined elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
-FORCE_INLINE __m128 _mm_undefined_ps(void)
-{
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-    __m128 a;
-    return a;
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-// Unpack and interleave single-precision (32-bit) floating-point elements from
-// the high half a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
-FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
-    float32x2x2_t result = vzip_f32(a1, b1);
-    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave single-precision (32-bit) floating-point elements from
-// the low half of a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
-FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
-    float32x2x2_t result = vzip_f32(a1, b1);
-    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
-FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
-{
-    return vreinterpretq_m128_s32(
-        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-/* SSE2 */
-
-// Add packed 16-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
-FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Add packed 32-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
-FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Add packed 64-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
-FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s64(
-        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-}
-
-// Add packed 8-bit integers in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
-FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Add packed double-precision (64-bit) floating-point elements in a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
-FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1] + db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Add the lower double-precision (64-bit) floating-point element in a and b,
-// store the result in the lower element of dst, and copy the upper element from
-// a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
-FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_add_pd(a, b));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] + db[0];
-    c[1] = da[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Add 64-bit integers a and b, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
-FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s64(
-        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
-}
-
-// Add packed signed 16-bit integers in a and b using saturation, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
-FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Add packed signed 8-bit integers in a and b using saturation, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
-FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Add packed unsigned 16-bit integers in a and b using saturation, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
-FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Add packed unsigned 8-bit integers in a and b using saturation, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
-FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Compute the bitwise AND of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
-FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_s64(
-        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
-FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
-// elements in a and then AND with b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
-FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
-{
-    // *NOTE* argument swap
-    return vreinterpretq_m128d_s64(
-        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
-}
-
-// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
-// AND with b, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
-FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vbicq_s32(vreinterpretq_s32_m128i(b),
-                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
-}
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
-FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
-{
-    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
-                                 vreinterpretq_u16_m128i(b));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
-FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Shift a left by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
-#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
-
-// Shift a right by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
-#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
-
-// Cast vector of type __m128d to type __m128. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
-FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
-{
-    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
-}
-
-// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
-FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
-{
-    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
-}
-
-// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
-FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
-{
-    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
-}
-
-// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
-FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
-{
-    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
-}
-
-// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
-FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
-#else
-    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
-#endif
-}
-
-// Cast vector of type __m128i to type __m128. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
-FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
-{
-    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
-}
-
-// Invalidate and flush the cache line that contains p from all levels of the
-// cache hierarchy.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
-#if defined(__APPLE__)
-#include <libkern/OSCacheControl.h>
-#endif
-FORCE_INLINE void _mm_clflush(void const *p)
-{
-    (void) p;
-
-    /* sys_icache_invalidate is supported since macOS 10.5.
-     * However, it does not work on non-jailbroken iOS devices, although the
-     * compilation is successful.
-     */
-#if defined(__APPLE__)
-    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
-#elif defined(__GNUC__) || defined(__clang__)
-    uintptr_t ptr = (uintptr_t) p;
-    __builtin___clear_cache((char *) ptr,
-                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
-#else
-    /* FIXME: MSVC support */
-#endif
-}
-
-// Compare packed 16-bit integers in a and b for equality, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
-FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed 32-bit integers in a and b for equality, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed 8-bit integers in a and b for equality, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
-FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for equality, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
-FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for equality, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
-FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
-FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for greater-than-or-equal, store the result in the lower element of dst,
-// and copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
-FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
-#else
-    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed signed 16-bit integers in a and b for greater-than, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
-FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed signed 32-bit integers in a and b for greater-than, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
-FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b for greater-than, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
-FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
-FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for greater-than, store the result in the lower element of dst, and copy
-// the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
-FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
-#else
-    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
-FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for less-than-or-equal, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
-FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmple_pd(a, b));
-#else
-    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed signed 16-bit integers in a and b for less-than, and store the
-// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
-// order of the operands switched.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
-FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed signed 32-bit integers in a and b for less-than, and store the
-// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
-// order of the operands switched.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
-FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b for less-than, and store the
-// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
-// order of the operands switched.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
-FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
-FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(
-        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for less-than, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
-FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
-FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
-#else
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-equal, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
-FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-greater-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
-FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-greater-than-or-equal, store the result in the lower element of
-// dst, and copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
-FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-greater-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
-FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-greater-than, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
-FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-less-than-or-equal, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
-FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-less-than-or-equal, store the result in the lower element of dst,
-// and copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
-FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// for not-less-than, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
-FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_u64(veorq_u64(
-        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
-        vdupq_n_u64(UINT64_MAX)));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b for not-less-than, store the result in the lower element of dst, and copy
-// the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
-FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// to see if neither is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
-FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    // Excluding NaNs, any two floating point numbers can be compared.
-    uint64x2_t not_nan_a =
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
-    uint64x2_t not_nan_b =
-        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
-    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b to see if neither is NaN, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
-FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b
-// to see if either is NaN, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
-FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    // Two NaNs are not equal in comparison operation.
-    uint64x2_t not_nan_a =
-        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
-    uint64x2_t not_nan_b =
-        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
-    return vreinterpretq_m128d_s32(
-        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b to see if either is NaN, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
-FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-    d[1] = a1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for greater-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
-FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 >= *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for greater-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
-FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 > *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for less-than-or-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
-FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 <= *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for less-than, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
-FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 < *(double *) &b0);
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for equality, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
-FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
-#else
-    uint32x4_t a_not_nan =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
-    uint32x4_t b_not_nan =
-        vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
-    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-    uint32x4_t a_eq_b =
-        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
-    uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
-                                       vreinterpretq_u64_u32(a_eq_b));
-    return vgetq_lane_u64(and_results, 0) & 0x1;
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point element in a and b
-// for not-equal, and return the boolean result (0 or 1).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
-FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
-{
-    return !_mm_comieq_sd(a, b);
-}
-
-// Convert packed signed 32-bit integers in a to packed double-precision
-// (64-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
-FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
-#else
-    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
-    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Convert packed signed 32-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
-FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
-{
-    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
-FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
-{
-// vrnd32xq_f64 not supported on clang
-#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
-    float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
-    int64x2_t integers = vcvtq_s64_f64(rounded);
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
-#else
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double d0 = ((double *) &rnd)[0];
-    double d1 = ((double *) &rnd)[1];
-    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
-#endif
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
-FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
-{
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double d0 = ((double *) &rnd)[0];
-    double d1 = ((double *) &rnd)[1];
-    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
-    return vreinterpret_m64_s32(vld1_s32(data));
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed single-precision (32-bit) floating-point elements, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
-FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
-{
-#if defined(__aarch64__)
-    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
-    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
-#else
-    float a0 = (float) ((double *) &a)[0];
-    float a1 = (float) ((double *) &a)[1];
-    return _mm_set_ps(0, 0, a1, a0);
-#endif
-}
-
-// Convert packed signed 32-bit integers in a to packed double-precision
-// (64-bit) floating-point elements, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
-FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
-#else
-    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
-    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
-// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
-// does not support! It is supported on ARMv8-A however.
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
-{
-#if defined(__ARM_FEATURE_FRINT)
-    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
-#elif defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    switch (_MM_GET_ROUNDING_MODE()) {
-    case _MM_ROUND_NEAREST:
-        return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
-    case _MM_ROUND_DOWN:
-        return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
-    case _MM_ROUND_UP:
-        return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
-    default:  // _MM_ROUND_TOWARD_ZERO
-        return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
-    }
-#else
-    float *f = (float *) &a;
-    switch (_MM_GET_ROUNDING_MODE()) {
-    case _MM_ROUND_NEAREST: {
-        uint32x4_t signmask = vdupq_n_u32(0x80000000);
-        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
-                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
-        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
-            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
-        int32x4_t r_trunc = vcvtq_s32_f32(
-            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
-        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
-            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
-        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
-                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
-        float32x4_t delta = vsubq_f32(
-            vreinterpretq_f32_m128(a),
-            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
-        uint32x4_t is_delta_half =
-            vceqq_f32(delta, half); /* delta == +/- 0.5 */
-        return vreinterpretq_m128i_s32(
-            vbslq_s32(is_delta_half, r_even, r_normal));
-    }
-    case _MM_ROUND_DOWN:
-        return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
-                             floorf(f[0]));
-    case _MM_ROUND_UP:
-        return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
-                             ceilf(f[0]));
-    default:  // _MM_ROUND_TOWARD_ZERO
-        return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
-                             (int32_t) f[0]);
-    }
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed double-precision (64-bit) floating-point elements, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
-FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
-#else
-    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Copy the lower double-precision (64-bit) floating-point element of a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
-FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
-{
-#if defined(__aarch64__)
-    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
-#else
-    return ((double *) &a)[0];
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
-FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
-{
-#if defined(__aarch64__)
-    return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
-    return (int32_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
-FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
-{
-#if defined(__aarch64__)
-    return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
-    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
-    return (int64_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
-#define _mm_cvtsd_si64x _mm_cvtsd_si64
-
-// Convert the lower double-precision (64-bit) floating-point element in b to a
-// single-precision (32-bit) floating-point element, store the result in the
-// lower element of dst, and copy the upper 3 packed elements from a to the
-// upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
-FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(vsetq_lane_f32(
-        vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
-        vreinterpretq_f32_m128(a), 0));
-#else
-    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
-                                                 vreinterpretq_f32_m128(a), 0));
-#endif
-}
-
-// Copy the lower 32-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
-FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
-{
-    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
-FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
-{
-    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
-#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
-// Convert the signed 32-bit integer b to a double-precision (64-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
-FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
-#else
-    double bf = (double) b;
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
-#endif
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
-#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
-// Copy 32-bit integer a to the lower elements of dst, and zero the upper
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
-FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
-{
-    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
-}
-
-// Convert the signed 64-bit integer b to a double-precision (64-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
-FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
-#else
-    double bf = (double) b;
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
-#endif
-}
-
-// Copy 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
-FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
-{
-    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
-}
-
-// Copy 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
-#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
-
-// Convert the signed 64-bit integer b to a double-precision (64-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
-#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
-
-// Convert the lower single-precision (32-bit) floating-point element in b to a
-// double-precision (64-bit) floating-point element, store the result in the
-// lower element of dst, and copy the upper element from a to the upper element
-// of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
-FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
-{
-    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
-#else
-    return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
-#endif
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
-FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
-{
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
-    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
-FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
-{
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
-    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
-    return vreinterpret_m64_s32(vld1_s32(data));
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
-FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
-{
-    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
-FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
-{
-    double ret = *((double *) &a);
-    return (int32_t) ret;
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
-FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
-{
-#if defined(__aarch64__)
-    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
-    double ret = *((double *) &a);
-    return (int64_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
-#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
-
-// Divide packed double-precision (64-bit) floating-point elements in a by
-// packed elements in b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
-FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] / db[0];
-    c[1] = da[1] / db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Divide the lower double-precision (64-bit) floating-point element in a by the
-// lower double-precision (64-bit) floating-point element in b, store the result
-// in the lower element of dst, and copy the upper element from a to the upper
-// element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
-FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    float64x2_t tmp =
-        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
-    return vreinterpretq_m128d_f64(
-        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
-#else
-    return _mm_move_sd(a, _mm_div_pd(a, b));
-#endif
-}
-
-// Extract a 16-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
-// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
-#define _mm_extract_epi16(a, imm) \
-    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
-// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
-//                                       __constrange(0,8) int imm)
-#define _mm_insert_epi16(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s16(                                     \
-            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
-    })
-
-// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
-FORCE_INLINE __m128d _mm_load_pd(const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_f64(p));
-#else
-    const float *fp = (const float *) p;
-    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
-    return vreinterpretq_m128d_f32(vld1q_f32(data));
-#endif
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
-#define _mm_load_pd1 _mm_load1_pd
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower of dst, and zero the upper element. mem_addr does not need to be
-// aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
-FORCE_INLINE __m128d _mm_load_sd(const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
-#else
-    const float *fp = (const float *) p;
-    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
-    return vreinterpretq_m128d_f32(vld1q_f32(data));
-#endif
-}
-
-// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
-// on a 16-byte boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
-FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
-{
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
-FORCE_INLINE __m128d _mm_load1_pd(const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
-#else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
-#endif
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// upper element of dst, and copy the lower element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
-FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
-#else
-    return vreinterpretq_m128d_f32(vcombine_f32(
-        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
-#endif
-}
-
-// Load 64-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
-FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
-{
-    /* Load the lower 64 bits of the value pointed to by p into the
-     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
-     */
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower element of dst, and copy the upper element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
-FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
-#else
-    return vreinterpretq_m128d_f32(
-        vcombine_f32(vld1_f32((const float *) p),
-                     vget_high_f32(vreinterpretq_f32_m128d(a))));
-#endif
-}
-
-// Load 2 double-precision (64-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
-FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
-{
-#if defined(__aarch64__)
-    float64x2_t v = vld1q_f64(p);
-    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
-#else
-    int64x2_t v = vld1q_s64((const int64_t *) p);
-    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
-#endif
-}
-
-// Loads two double-precision from unaligned memory, floating-point values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
-FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
-{
-    return _mm_load_pd(p);
-}
-
-// Load 128-bits of integer data from memory into dst. mem_addr does not need to
-// be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
-FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
-{
-    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-}
-
-// Load unaligned 32-bit integer from memory into the first element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
-FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
-{
-    return vreinterpretq_m128i_s32(
-        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
-// 32-bit integers, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
-FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
-{
-    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-                              vget_low_s16(vreinterpretq_s16_m128i(b)));
-#if defined(__aarch64__)
-    int32x4_t high =
-        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
-
-    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
-#else
-    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
-                               vget_high_s16(vreinterpretq_s16_m128i(b)));
-
-    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
-    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
-
-    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
-#endif
-}
-
-// Conditionally store 8-bit integer elements from a into memory using mask
-// (elements are not stored when the highest bit is not set in the corresponding
-// element) and a non-temporal memory hint. mem_addr does not need to be aligned
-// on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
-FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
-{
-    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
-    __m128 b = _mm_load_ps((const float *) mem_addr);
-    int8x16_t masked =
-        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
-                 vreinterpretq_s8_m128(b));
-    vst1q_s8((int8_t *) mem_addr, masked);
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
-FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
-FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b,
-// and store packed maximum values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
-FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-#if SSE2NEON_PRECISE_MINMAX
-    float64x2_t _a = vreinterpretq_f64_m128d(a);
-    float64x2_t _b = vreinterpretq_f64_m128d(b);
-    return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128d_f64(
-        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#endif
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
-
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b, store the maximum value in the lower element of dst, and copy the upper
-// element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
-FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_max_pd(a, b));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
-    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
-#endif
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
-FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
-FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Compare packed double-precision (64-bit) floating-point elements in a and b,
-// and store packed minimum values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
-FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-#if SSE2NEON_PRECISE_MINMAX
-    float64x2_t _a = vreinterpretq_f64_m128d(a);
-    float64x2_t _b = vreinterpretq_f64_m128d(b);
-    return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
-#else
-    return vreinterpretq_m128d_f64(
-        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#endif
-#else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
-    uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
-    return vreinterpretq_m128d_u64(vld1q_u64(d));
-#endif
-}
-
-// Compare the lower double-precision (64-bit) floating-point elements in a and
-// b, store the minimum value in the lower element of dst, and copy the upper
-// element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
-FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_min_pd(a, b));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
-    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
-#endif
-}
-
-// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
-// upper element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
-FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
-{
-    return vreinterpretq_m128i_s64(
-        vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
-}
-
-// Move the lower double-precision (64-bit) floating-point element from b to the
-// lower element of dst, and copy the upper element from a to the upper element
-// of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
-FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_f32(
-        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
-                     vget_high_f32(vreinterpretq_f32_m128d(a))));
-}
-
-// Create mask from the most significant bit of each 8-bit element in a, and
-// store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
-FORCE_INLINE int _mm_movemask_epi8(__m128i a)
-{
-    // Use increasingly wide shifts+adds to collect the sign bits
-    // together.
-    // Since the widening shifts would be rather confusing to follow in little
-    // endian, everything will be illustrated in big endian order instead. This
-    // has a different result - the bits would actually be reversed on a big
-    // endian machine.
-
-    // Starting input (only half the elements are shown):
-    // 89 ff 1d c0 00 10 99 33
-    uint8x16_t input = vreinterpretq_u8_m128i(a);
-
-    // Shift out everything but the sign bits with an unsigned shift right.
-    //
-    // Bytes of the vector::
-    // 89 ff 1d c0 00 10 99 33
-    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
-    //  |  |  |  |  |  |  |  |
-    // 01 01 00 01 00 00 01 00
-    //
-    // Bits of first important lane(s):
-    // 10001001 (89)
-    // \______
-    //        |
-    // 00000001 (01)
-    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
-
-    // Merge the even lanes together with a 16-bit unsigned shift right + add.
-    // 'xx' represents garbage data which will be ignored in the final result.
-    // In the important bytes, the add functions like a binary OR.
-    //
-    // 01 01 00 01 00 00 01 00
-    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
-    //    \|    \|    \|    \|
-    // xx 03 xx 01 xx 00 xx 02
-    //
-    // 00000001 00000001 (01 01)
-    //        \_______ |
-    //                \|
-    // xxxxxxxx xxxxxx11 (xx 03)
-    uint32x4_t paired16 =
-        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
-
-    // Repeat with a wider 32-bit shift + add.
-    // xx 03 xx 01 xx 00 xx 02
-    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
-    //     14))
-    //          \|          \|
-    // xx xx xx 0d xx xx xx 02
-    //
-    // 00000011 00000001 (03 01)
-    //        \\_____ ||
-    //         '----.\||
-    // xxxxxxxx xxxx1101 (xx 0d)
-    uint64x2_t paired32 =
-        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
-
-    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
-    // lanes. xx xx xx 0d xx xx xx 02
-    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
-    //            28))
-    //                      \|
-    // xx xx xx xx xx xx xx d2
-    //
-    // 00001101 00000010 (0d 02)
-    //     \   \___ |  |
-    //      '---.  \|  |
-    // xxxxxxxx 11010010 (xx d2)
-    uint8x16_t paired64 =
-        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
-
-    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
-    // xx xx xx xx xx xx xx d2
-    //                      ||  return paired64[0]
-    //                      d2
-    // Note: Little endian would return the correct value 4b (01001011) instead.
-    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
-}
-
-// Set each bit of mask dst based on the most significant bit of the
-// corresponding packed double-precision (64-bit) floating-point element in a.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
-FORCE_INLINE int _mm_movemask_pd(__m128d a)
-{
-    uint64x2_t input = vreinterpretq_u64_m128d(a);
-    uint64x2_t high_bits = vshrq_n_u64(input, 63);
-    return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
-FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
-{
-    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
-}
-
-// Copy the 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
-FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
-{
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
-}
-
-// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
-// a and b, and store the unsigned 64-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
-FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
-{
-    // vmull_u32 upcasts instead of masking, so we downcast.
-    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
-    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
-    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
-}
-
-// Multiply packed double-precision (64-bit) floating-point elements in a and b,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
-FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] * db[0];
-    c[1] = da[1] * db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Multiply the lower double-precision (64-bit) floating-point element in a and
-// b, store the result in the lower element of dst, and copy the upper element
-// from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
-FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_mul_pd(a, b));
-}
-
-// Multiply the low unsigned 32-bit integers from a and b, and store the
-// unsigned 64-bit result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
-FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_u64(vget_low_u64(
-        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
-}
-
-// Multiply the packed signed 16-bit integers in a and b, producing intermediate
-// 32-bit integers, and store the high 16 bits of the intermediate integers in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
-FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
-{
-    /* FIXME: issue with large values because of result saturation */
-    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
-    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
-    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
-    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
-    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
-    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
-    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
-    uint16x8x2_t r =
-        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
-    return vreinterpretq_m128i_u16(r.val[1]);
-}
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
-FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
-{
-    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
-    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
-    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
-#if defined(__aarch64__)
-    uint32x4_t ab7654 =
-        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
-    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
-                              vreinterpretq_u16_u32(ab7654));
-    return vreinterpretq_m128i_u16(r);
-#else
-    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
-    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
-    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
-    uint16x8x2_t r =
-        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
-    return vreinterpretq_m128i_u16(r.val[1]);
-#endif
-}
-
-// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
-// integers, and store the low 16 bits of the intermediate integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
-FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compute the bitwise OR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
-FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_s64(
-        vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
-// and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
-FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-// using signed saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
-FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
-                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
-}
-
-// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-// using signed saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
-FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
-                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
-}
-
-// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-// using unsigned saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
-FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
-                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
-}
-
-// Pause the processor. This is typically used in spin-wait loops and depending
-// on the x86 processor typical values are in the 40-100 cycle range. The
-// 'yield' instruction isn't a good fit because it's effectively a nop on most
-// Arm cores. Experience with several databases has shown has shown an 'isb' is
-// a reasonable approximation.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
-FORCE_INLINE void _mm_pause()
-{
-    __asm__ __volatile__("isb\n");
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce two
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of 64-bit elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
-FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
-{
-    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
-    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
-}
-
-// Set packed 16-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
-FORCE_INLINE __m128i _mm_set_epi16(short i7,
-                                   short i6,
-                                   short i5,
-                                   short i4,
-                                   short i3,
-                                   short i2,
-                                   short i1,
-                                   short i0)
-{
-    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
-    return vreinterpretq_m128i_s16(vld1q_s16(data));
-}
-
-// Set packed 32-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
-FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
-{
-    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Set packed 64-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
-FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
-{
-    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
-}
-
-// Set packed 64-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
-{
-    return vreinterpretq_m128i_s64(
-        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
-}
-
-// Set packed 8-bit integers in dst with the supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
-FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
-                                  signed char b14,
-                                  signed char b13,
-                                  signed char b12,
-                                  signed char b11,
-                                  signed char b10,
-                                  signed char b9,
-                                  signed char b8,
-                                  signed char b7,
-                                  signed char b6,
-                                  signed char b5,
-                                  signed char b4,
-                                  signed char b3,
-                                  signed char b2,
-                                  signed char b1,
-                                  signed char b0)
-{
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
-    return (__m128i) vld1q_s8(data);
-}
-
-// Set packed double-precision (64-bit) floating-point elements in dst with the
-// supplied values.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
-FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
-{
-    double ALIGN_STRUCT(16) data[2] = {e0, e1};
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
-#else
-    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
-#endif
-}
-
-// Broadcast double-precision (64-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
-#define _mm_set_pd1 _mm_set1_pd
-
-// Copy double-precision (64-bit) floating-point element a to the lower element
-// of dst, and zero the upper element.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
-FORCE_INLINE __m128d _mm_set_sd(double a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
-#else
-    return _mm_set_pd(0, a);
-#endif
-}
-
-// Broadcast 16-bit integer a to all all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
-FORCE_INLINE __m128i _mm_set1_epi16(short w)
-{
-    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
-}
-
-// Broadcast 32-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
-FORCE_INLINE __m128i _mm_set1_epi32(int _i)
-{
-    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
-}
-
-// Broadcast 64-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
-FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
-{
-    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
-}
-
-// Broadcast 64-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
-FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
-{
-    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
-}
-
-// Broadcast 8-bit integer a to all elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
-FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
-{
-    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
-}
-
-// Broadcast double-precision (64-bit) floating-point value a to all elements of
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
-FORCE_INLINE __m128d _mm_set1_pd(double d)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vdupq_n_f64(d));
-#else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
-#endif
-}
-
-// Set packed 16-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
-FORCE_INLINE __m128i _mm_setr_epi16(short w0,
-                                    short w1,
-                                    short w2,
-                                    short w3,
-                                    short w4,
-                                    short w5,
-                                    short w6,
-                                    short w7)
-{
-    int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
-    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
-}
-
-// Set packed 32-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
-FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
-{
-    int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
-    return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Set packed 64-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
-FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
-{
-    return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
-}
-
-// Set packed 8-bit integers in dst with the supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
-FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
-                                   signed char b1,
-                                   signed char b2,
-                                   signed char b3,
-                                   signed char b4,
-                                   signed char b5,
-                                   signed char b6,
-                                   signed char b7,
-                                   signed char b8,
-                                   signed char b9,
-                                   signed char b10,
-                                   signed char b11,
-                                   signed char b12,
-                                   signed char b13,
-                                   signed char b14,
-                                   signed char b15)
-{
-    int8_t ALIGN_STRUCT(16)
-        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
-                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
-                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
-                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
-    return (__m128i) vld1q_s8(data);
-}
-
-// Set packed double-precision (64-bit) floating-point elements in dst with the
-// supplied values in reverse order.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
-FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
-{
-    return _mm_set_pd(e0, e1);
-}
-
-// Return vector of type __m128d with all elements set to zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
-FORCE_INLINE __m128d _mm_setzero_pd(void)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vdupq_n_f64(0));
-#else
-    return vreinterpretq_m128d_f32(vdupq_n_f32(0));
-#endif
-}
-
-// Return vector of type __m128i with all elements set to zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
-FORCE_INLINE __m128i _mm_setzero_si128(void)
-{
-    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
-}
-
-// Shuffle 32-bit integers in a using the control in imm8, and store the results
-// in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
-// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
-//                                        __constrange(0,255) int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_epi32(a, imm)                                            \
-    __extension__({                                                          \
-        int32x4_t _input = vreinterpretq_s32_m128i(a);                       \
-        int32x4_t _shuf =                                                    \
-            vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
-                          ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
-        vreinterpretq_m128i_s32(_shuf);                                      \
-    })
-#else  // generic
-#define _mm_shuffle_epi32(a, imm)                        \
-    __extension__({                                      \
-        __m128i ret;                                     \
-        switch (imm) {                                   \
-        case _MM_SHUFFLE(1, 0, 3, 2):                    \
-            ret = _mm_shuffle_epi_1032((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 3, 0, 1):                    \
-            ret = _mm_shuffle_epi_2301((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 3, 2, 1):                    \
-            ret = _mm_shuffle_epi_0321((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 1, 0, 3):                    \
-            ret = _mm_shuffle_epi_2103((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(1, 0, 1, 0):                    \
-            ret = _mm_shuffle_epi_1010((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(1, 0, 0, 1):                    \
-            ret = _mm_shuffle_epi_1001((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 1, 0, 1):                    \
-            ret = _mm_shuffle_epi_0101((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(2, 2, 1, 1):                    \
-            ret = _mm_shuffle_epi_2211((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 1, 2, 2):                    \
-            ret = _mm_shuffle_epi_0122((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(3, 3, 3, 2):                    \
-            ret = _mm_shuffle_epi_3332((a));             \
-            break;                                       \
-        case _MM_SHUFFLE(0, 0, 0, 0):                    \
-            ret = _mm_shuffle_epi32_splat((a), 0);       \
-            break;                                       \
-        case _MM_SHUFFLE(1, 1, 1, 1):                    \
-            ret = _mm_shuffle_epi32_splat((a), 1);       \
-            break;                                       \
-        case _MM_SHUFFLE(2, 2, 2, 2):                    \
-            ret = _mm_shuffle_epi32_splat((a), 2);       \
-            break;                                       \
-        case _MM_SHUFFLE(3, 3, 3, 3):                    \
-            ret = _mm_shuffle_epi32_splat((a), 3);       \
-            break;                                       \
-        default:                                         \
-            ret = _mm_shuffle_epi32_default((a), (imm)); \
-            break;                                       \
-        }                                                \
-        ret;                                             \
-    })
-#endif
-
-// Shuffle double-precision (64-bit) floating-point elements using the control
-// in imm8, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
-#ifdef _sse2neon_shuffle
-#define _mm_shuffle_pd(a, b, imm8)                                            \
-    vreinterpretq_m128d_s64(                                                  \
-        vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
-                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
-#else
-#define _mm_shuffle_pd(a, b, imm8)                                     \
-    _mm_castsi128_pd(_mm_set_epi64x(                                   \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
-        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
-#endif
-
-// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
-//                                          __constrange(0,255) int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shufflehi_epi16(a, imm)                                           \
-    __extension__({                                                           \
-        int16x8_t _input = vreinterpretq_s16_m128i(a);                        \
-        int16x8_t _shuf =                                                     \
-            vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
-                          (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
-                          (((imm) >> 6) & 0x3) + 4);                          \
-        vreinterpretq_m128i_s16(_shuf);                                       \
-    })
-#else  // generic
-#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
-#endif
-
-// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
-//                                          __constrange(0,255) int imm)
-#ifdef _sse2neon_shuffle
-#define _mm_shufflelo_epi16(a, imm)                                  \
-    __extension__({                                                  \
-        int16x8_t _input = vreinterpretq_s16_m128i(a);               \
-        int16x8_t _shuf = vshuffleq_s16(                             \
-            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
-            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
-        vreinterpretq_m128i_s16(_shuf);                              \
-    })
-#else  // generic
-#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
-#endif
-
-// Shift packed 16-bit integers in a left by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
-FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~15))
-        return _mm_setzero_si128();
-
-    int16x8_t vc = vdupq_n_s16((int16_t) c);
-    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
-}
-
-// Shift packed 32-bit integers in a left by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
-FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~31))
-        return _mm_setzero_si128();
-
-    int32x4_t vc = vdupq_n_s32((int32_t) c);
-    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
-}
-
-// Shift packed 64-bit integers in a left by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
-FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~63))
-        return _mm_setzero_si128();
-
-    int64x2_t vc = vdupq_n_s64((int64_t) c);
-    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
-}
-
-// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
-FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~15))
-        return _mm_setzero_si128();
-    return vreinterpretq_m128i_s16(
-        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
-}
-
-// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
-FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~31))
-        return _mm_setzero_si128();
-    return vreinterpretq_m128i_s32(
-        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
-}
-
-// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
-FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
-{
-    if (_sse2neon_unlikely(imm & ~63))
-        return _mm_setzero_si128();
-    return vreinterpretq_m128i_s64(
-        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
-}
-
-// Shift a left by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
-#define _mm_slli_si128(a, imm)                                         \
-    __extension__({                                                    \
-        int8x16_t ret;                                                 \
-        if (_sse2neon_unlikely(imm == 0))                              \
-            ret = vreinterpretq_s8_m128i(a);                           \
-        else if (_sse2neon_unlikely((imm) & ~15))                      \
-            ret = vdupq_n_s8(0);                                       \
-        else                                                           \
-            ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a),   \
-                           ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
-        vreinterpretq_m128i_s8(ret);                                   \
-    })
-
-// Compute the square root of packed double-precision (64-bit) floating-point
-// elements in a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
-FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
-#else
-    double a0 = sqrt(((double *) &a)[0]);
-    double a1 = sqrt(((double *) &a)[1]);
-    return _mm_set_pd(a1, a0);
-#endif
-}
-
-// Compute the square root of the lower double-precision (64-bit) floating-point
-// element in b, store the result in the lower element of dst, and copy the
-// upper element from a to the upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
-FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return _mm_move_sd(a, _mm_sqrt_pd(b));
-#else
-    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
-#endif
-}
-
-// Shift packed 16-bit integers in a right by count while shifting in sign bits,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
-FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
-{
-    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-    if (_sse2neon_unlikely(c & ~15))
-        return _mm_cmplt_epi16(a, _mm_setzero_si128());
-    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
-}
-
-// Shift packed 32-bit integers in a right by count while shifting in sign bits,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
-FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
-{
-    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-    if (_sse2neon_unlikely(c & ~31))
-        return _mm_cmplt_epi32(a, _mm_setzero_si128());
-    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
-}
-
-// Shift packed 16-bit integers in a right by imm8 while shifting in sign
-// bits, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
-FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
-{
-    const int count = (imm & ~15) ? 15 : imm;
-    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
-}
-
-// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
-// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srai_epi32(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) == 0)) {                                \
-            ret = a;                                                         \
-        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) {              \
-            ret = vreinterpretq_m128i_s32(                                   \
-                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_s32(                                   \
-                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));                \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift packed 16-bit integers in a right by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
-FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~15))
-        return _mm_setzero_si128();
-
-    int16x8_t vc = vdupq_n_s16(-(int16_t) c);
-    return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
-}
-
-// Shift packed 32-bit integers in a right by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
-FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~31))
-        return _mm_setzero_si128();
-
-    int32x4_t vc = vdupq_n_s32(-(int32_t) c);
-    return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
-}
-
-// Shift packed 64-bit integers in a right by count while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
-FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-{
-    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-    if (_sse2neon_unlikely(c & ~63))
-        return _mm_setzero_si128();
-
-    int64x2_t vc = vdupq_n_s64(-(int64_t) c);
-    return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
-}
-
-// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) & ~15)) {                               \
-            ret = _mm_setzero_si128();                                       \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_u16(                                   \
-                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
-// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_epi32(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) & ~31)) {                               \
-            ret = _mm_setzero_si128();                                       \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_u32(                                   \
-                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
-#define _mm_srli_epi64(a, imm)                                               \
-    __extension__({                                                          \
-        __m128i ret;                                                         \
-        if (_sse2neon_unlikely((imm) & ~63)) {                               \
-            ret = _mm_setzero_si128();                                       \
-        } else {                                                             \
-            ret = vreinterpretq_m128i_u64(                                   \
-                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
-        }                                                                    \
-        ret;                                                                 \
-    })
-
-// Shift a right by imm8 bytes while shifting in zeros, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
-#define _mm_srli_si128(a, imm)                                       \
-    __extension__({                                                  \
-        int8x16_t ret;                                               \
-        if (_sse2neon_unlikely((imm) & ~15))                         \
-            ret = vdupq_n_s8(0);                                     \
-        else                                                         \
-            ret = vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), \
-                           (imm > 15 ? 0 : imm));                    \
-        vreinterpretq_m128i_s8(ret);                                 \
-    })
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-// or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
-FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
-#else
-    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
-#endif
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
-FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
-    vst1q_f64((float64_t *) mem_addr,
-              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
-#else
-    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
-    vst1q_f32((float32_t *) mem_addr,
-              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
-#endif
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// memory. mem_addr does not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
-FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
-#else
-    vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
-#endif
-}
-
-// Store 128-bits of integer data from a into memory. mem_addr must be aligned
-// on a 16-byte boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
-FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
-{
-    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
-#define _mm_store1_pd _mm_store_pd1
-
-// Store the upper double-precision (64-bit) floating-point element from a into
-// memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
-FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
-#else
-    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
-#endif
-}
-
-// Store 64-bit integer from the first element of a into memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
-FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
-{
-    vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// memory.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
-FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
-    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
-#else
-    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
-#endif
-}
-
-// Store 2 double-precision (64-bit) floating-point elements from a into memory
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
-FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
-{
-    float32x4_t f = vreinterpretq_f32_m128d(a);
-    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
-}
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
-FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
-{
-    _mm_store_pd(mem_addr, a);
-}
-
-// Store 128-bits of integer data from a into memory. mem_addr does not need to
-// be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
-FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
-{
-    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-}
-
-// Store 32-bit integer from the first element of a into memory. mem_addr does
-// not need to be aligned on any particular boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
-FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
-{
-    vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
-}
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory using a non-temporal memory hint. mem_addr must
-// be aligned on a 16-byte boundary or a general-protection exception may be
-// generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
-FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, (float32x4_t *) p);
-#elif defined(__aarch64__)
-    vst1q_f64(p, vreinterpretq_f64_m128d(a));
-#else
-    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
-#endif
-}
-
-// Store 128-bits of integer data from a into memory using a non-temporal memory
-// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
-// exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
-FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    __builtin_nontemporal_store(a, p);
-#else
-    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
-#endif
-}
-
-// Store 32-bit integer a into memory using a non-temporal hint to minimize
-// cache pollution. If the cache line containing address mem_addr is already in
-// the cache, the cache will be updated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
-FORCE_INLINE void _mm_stream_si32(int *p, int a)
-{
-    vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
-}
-
-// Store 64-bit integer a into memory using a non-temporal hint to minimize
-// cache pollution. If the cache line containing address mem_addr is already in
-// the cache, the cache will be updated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
-FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
-{
-    vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
-}
-
-// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
-FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
-FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
-FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s64(
-        vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-}
-
-// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
-FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Subtract packed double-precision (64-bit) floating-point elements in b from
-// packed double-precision (64-bit) floating-point elements in a, and store the
-// results in dst.
-//  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
-FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[2];
-    c[0] = da[0] - db[0];
-    c[1] = da[1] - db[1];
-    return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Subtract the lower double-precision (64-bit) floating-point element in b from
-// the lower double-precision (64-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
-FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_sub_pd(a, b));
-}
-
-// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
-FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s64(
-        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
-}
-
-// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
-// using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
-FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s16(
-        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
-// using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
-FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
-// integers in a using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
-FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
-// integers in a using saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
-FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-#define _mm_ucomieq_sd _mm_comieq_sd
-#define _mm_ucomige_sd _mm_comige_sd
-#define _mm_ucomigt_sd _mm_comigt_sd
-#define _mm_ucomile_sd _mm_comile_sd
-#define _mm_ucomilt_sd _mm_comilt_sd
-#define _mm_ucomineq_sd _mm_comineq_sd
-
-// Return vector of type __m128d with undefined elements.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
-FORCE_INLINE __m128d _mm_undefined_pd(void)
-{
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-    __m128d a;
-    return a;
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-// Unpack and interleave 16-bit integers from the high half of a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
-FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
-    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
-    int16x4x2_t result = vzip_s16(a1, b1);
-    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 32-bit integers from the high half of a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
-FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-#else
-    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
-    int32x2x2_t result = vzip_s32(a1, b1);
-    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 64-bit integers from the high half of a and b, and
-// store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
-FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s64(
-        vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
-    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
-    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
-#endif
-}
-
-// Unpack and interleave 8-bit integers from the high half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
-FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(
-        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
-    int8x8_t a1 =
-        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
-    int8x8_t b1 =
-        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
-    int8x8x2_t result = vzip_s8(a1, b1);
-    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave double-precision (64-bit) floating-point elements from
-// the high half of a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
-FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    return vreinterpretq_m128d_s64(
-        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
-                     vget_high_s64(vreinterpretq_s64_m128d(b))));
-#endif
-}
-
-// Unpack and interleave 16-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
-FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
-    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
-    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
-    int16x4x2_t result = vzip_s16(a1, b1);
-    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 32-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
-FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-#else
-    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
-    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
-    int32x2x2_t result = vzip_s32(a1, b1);
-    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave 64-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
-FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s64(
-        vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
-    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
-    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
-#endif
-}
-
-// Unpack and interleave 8-bit integers from the low half of a and b, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
-FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(
-        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
-    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
-    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
-    int8x8x2_t result = vzip_s8(a1, b1);
-    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Unpack and interleave double-precision (64-bit) floating-point elements from
-// the low half of a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
-FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    return vreinterpretq_m128d_s64(
-        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
-                     vget_low_s64(vreinterpretq_s64_m128d(b))));
-#endif
-}
-
-// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
-FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
-{
-    return vreinterpretq_m128d_s64(
-        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
-// and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
-FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-/* SSE3 */
-
-// Alternatively add and subtract packed double-precision (64-bit)
-// floating-point elements in a to/from packed elements in b, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
-FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
-{
-    _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
-                                             vreinterpretq_f64_m128d(b),
-                                             vreinterpretq_f64_m128d(mask)));
-#else
-    return _mm_add_pd(_mm_mul_pd(b, mask), a);
-#endif
-}
-
-// Alternatively add and subtract packed single-precision (32-bit)
-// floating-point elements in a to/from packed elements in b, and store the
-// results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
-FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
-{
-    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
-#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
-    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
-                                            vreinterpretq_f32_m128(mask),
-                                            vreinterpretq_f32_m128(b)));
-#else
-    return _mm_add_ps(_mm_mul_ps(b, mask), a);
-#endif
-}
-
-// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
-// elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
-FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
-    double c[] = {da[0] + da[1], db[0] + db[1]};
-    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
-#endif
-}
-
-// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
-// elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
-FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
-    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-    return vreinterpretq_m128_f32(
-        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of double-precision (64-bit)
-// floating-point elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
-FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
-{
-#if defined(__aarch64__)
-    float64x2_t a = vreinterpretq_f64_m128d(_a);
-    float64x2_t b = vreinterpretq_f64_m128d(_b);
-    return vreinterpretq_m128d_f64(
-        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
-#else
-    double *da = (double *) &_a;
-    double *db = (double *) &_b;
-    double c[] = {da[0] - da[1], db[0] - db[1]};
-    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of single-precision (32-bit)
-// floating-point elements in a and b, and pack the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
-FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
-{
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
-#else
-    float32x4x2_t c = vuzpq_f32(a, b);
-    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
-#endif
-}
-
-// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
-// may perform better than _mm_loadu_si128 when the data crosses a cache line
-// boundary.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
-#define _mm_lddqu_si128 _mm_loadu_si128
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
-#define _mm_loaddup_pd _mm_load1_pd
-
-// Duplicate the low double-precision (64-bit) floating-point element from a,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
-FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(
-        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
-#else
-    return vreinterpretq_m128d_u64(
-        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
-#endif
-}
-
-// Duplicate odd-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
-FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
-#elif defined(_sse2neon_shuffle)
-    return vreinterpretq_m128_f32(vshuffleq_s32(
-        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
-#else
-    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
-    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
-    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-#endif
-}
-
-// Duplicate even-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
-FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(
-        vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
-#elif defined(_sse2neon_shuffle)
-    return vreinterpretq_m128_f32(vshuffleq_s32(
-        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
-#else
-    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
-    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
-    return vreinterpretq_m128_f32(vld1q_f32(data));
-#endif
-}
-
-/* SSSE3 */
-
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
-FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
-{
-    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
-FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
-{
-    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
-FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
-{
-    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
-FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
-{
-    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
-}
-
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
-FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
-{
-    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
-}
-
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
-FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
-{
-    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
-}
-
-// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
-// the result right by imm8 bytes, and store the low 16 bytes in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
-#define _mm_alignr_epi8(a, b, imm)                                            \
-    __extension__({                                                           \
-        uint8x16_t _a = vreinterpretq_u8_m128i(a);                            \
-        uint8x16_t _b = vreinterpretq_u8_m128i(b);                            \
-        __m128i ret;                                                          \
-        if (_sse2neon_unlikely((imm) & ~31))                                  \
-            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));                      \
-        else if (imm >= 16)                                                   \
-            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0);                \
-        else                                                                  \
-            ret =                                                             \
-                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
-        ret;                                                                  \
-    })
-
-// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
-// the result right by imm8 bytes, and store the low 8 bytes in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
-#define _mm_alignr_pi8(a, b, imm)                                           \
-    __extension__({                                                         \
-        __m64 ret;                                                          \
-        if (_sse2neon_unlikely((imm) >= 16)) {                              \
-            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
-        } else {                                                            \
-            uint8x8_t tmp_low, tmp_high;                                    \
-            if ((imm) >= 8) {                                               \
-                const int idx = (imm) -8;                                   \
-                tmp_low = vreinterpret_u8_m64(a);                           \
-                tmp_high = vdup_n_u8(0);                                    \
-                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
-            } else {                                                        \
-                const int idx = (imm);                                      \
-                tmp_low = vreinterpret_u8_m64(b);                           \
-                tmp_high = vreinterpret_u8_m64(a);                          \
-                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
-            }                                                               \
-        }                                                                   \
-        ret;                                                                \
-    })
-
-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
-FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
-#else
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
-                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
-#endif
-}
-
-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
-FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
-{
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
-#else
-    return vreinterpretq_m128i_s32(
-        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
-                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
-#endif
-}
-
-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
-FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s16(
-        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
-FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
-{
-    return vreinterpret_m64_s32(
-        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
-}
-
-// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
-// saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
-FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-    return vreinterpretq_s64_s16(
-        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-    // Interleave using vshrn/vmovn
-    // [a0|a2|a4|a6|b0|b2|b4|b6]
-    // [a1|a3|a5|a7|b1|b3|b5|b7]
-    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
-    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
-    // Saturated add
-    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
-#endif
-}
-
-// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
-// saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
-FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
-#else
-    int16x4x2_t res = vuzp_s16(a, b);
-    return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
-// the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
-FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int16x8x2_t c = vuzpq_s16(a, b);
-    return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
-// the signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
-FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
-{
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s32(
-        vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
-#else
-    int32x4x2_t c = vuzpq_s32(a, b);
-    return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
-// the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
-FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
-#else
-    int16x4x2_t c = vuzp_s16(a, b);
-    return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
-// the signed 32-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
-FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
-{
-    int32x2_t a = vreinterpret_s32_m64(_a);
-    int32x2_t b = vreinterpret_s32_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
-#else
-    int32x2x2_t c = vuzp_s32(a, b);
-    return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
-// using saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
-FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s16(
-        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
-    int16x8x2_t c = vuzpq_s16(a, b);
-    return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
-// using saturation, and pack the signed 16-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
-FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-#if defined(__aarch64__)
-    return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
-#else
-    int16x4x2_t c = vuzp_s16(a, b);
-    return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
-#endif
-}
-
-// Vertically multiply each unsigned 8-bit integer from a with the corresponding
-// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
-// and pack the saturated results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
-FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
-    uint8x16_t a = vreinterpretq_u8_m128i(_a);
-    int8x16_t b = vreinterpretq_s8_m128i(_b);
-    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
-                             vmovl_s8(vget_low_s8(b)));
-    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
-                             vmovl_s8(vget_high_s8(b)));
-    return vreinterpretq_m128i_s16(
-        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
-#else
-    // This would be much simpler if x86 would choose to zero extend OR sign
-    // extend, not both. This could probably be optimized better.
-    uint16x8_t a = vreinterpretq_u16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-
-    // Zero extend a
-    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
-    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
-
-    // Sign extend by shifting left then shifting right.
-    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
-    int16x8_t b_odd = vshrq_n_s16(b, 8);
-
-    // multiply
-    int16x8_t prod1 = vmulq_s16(a_even, b_even);
-    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
-
-    // saturated add
-    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
-#endif
-}
-
-// Vertically multiply each unsigned 8-bit integer from a with the corresponding
-// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
-// pack the saturated results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
-FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
-{
-    uint16x4_t a = vreinterpret_u16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-
-    // Zero extend a
-    int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
-    int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
-
-    // Sign extend by shifting left then shifting right.
-    int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
-    int16x4_t b_odd = vshr_n_s16(b, 8);
-
-    // multiply
-    int16x4_t prod1 = vmul_s16(a_even, b_even);
-    int16x4_t prod2 = vmul_s16(a_odd, b_odd);
-
-    // saturated add
-    return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
-// the packed 16-bit integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
-FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
-{
-    // Has issues due to saturation
-    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
-
-    // Multiply
-    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
-    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
-                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
-
-    // Rounding narrowing shift right
-    // narrow = (int16_t)((mul + 16384) >> 15);
-    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
-    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
-
-    // Join together
-    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Truncate each intermediate integer to the 18 most
-// significant bits, round by adding 1, and store bits [16:1] to dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
-FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
-{
-    int32x4_t mul_extend =
-        vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
-
-    // Rounding narrowing shift right
-    return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
-}
-
-// Shuffle packed 8-bit integers in a according to shuffle control mask in the
-// corresponding 8-bit element of b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
-FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
-{
-    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
-    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
-    uint8x16_t idx_masked =
-        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
-#elif defined(__GNUC__)
-    int8x16_t ret;
-    // %e and %f represent the even and odd D registers
-    // respectively.
-    __asm__ __volatile__(
-        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
-        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
-        : [ret] "=&w"(ret)
-        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
-    return vreinterpretq_m128i_s8(ret);
-#else
-    // use this line if testing on aarch64
-    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
-    return vreinterpretq_m128i_s8(
-        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
-                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
-#endif
-}
-
-// Shuffle packed 8-bit integers in a according to shuffle control mask in the
-// corresponding 8-bit element of b, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
-FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
-{
-    const int8x8_t controlMask =
-        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
-    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
-    return vreinterpret_m64_s8(res);
-}
-
-// Negate packed 16-bit integers in a when the corresponding signed
-// 16-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
-FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
-{
-    int16x8_t a = vreinterpretq_s16_m128i(_a);
-    int16x8_t b = vreinterpretq_s16_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFF : 0
-    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
-    // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__)
-    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
-#else
-    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
-    // 'a') based on ltMask
-    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
-    // res = masked & (~zeroMask)
-    int16x8_t res = vbicq_s16(masked, zeroMask);
-    return vreinterpretq_m128i_s16(res);
-}
-
-// Negate packed 32-bit integers in a when the corresponding signed
-// 32-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
-FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
-{
-    int32x4_t a = vreinterpretq_s32_m128i(_a);
-    int32x4_t b = vreinterpretq_s32_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFFFFFF : 0
-    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
-
-    // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__)
-    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
-#else
-    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
-    // 'a') based on ltMask
-    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
-    // res = masked & (~zeroMask)
-    int32x4_t res = vbicq_s32(masked, zeroMask);
-    return vreinterpretq_m128i_s32(res);
-}
-
-// Negate packed 8-bit integers in a when the corresponding signed
-// 8-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
-FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
-{
-    int8x16_t a = vreinterpretq_s8_m128i(_a);
-    int8x16_t b = vreinterpretq_s8_m128i(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFF : 0
-    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
-
-    // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__)
-    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
-#else
-    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
-    // based on ltMask
-    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
-    // res = masked & (~zeroMask)
-    int8x16_t res = vbicq_s8(masked, zeroMask);
-
-    return vreinterpretq_m128i_s8(res);
-}
-
-// Negate packed 16-bit integers in a when the corresponding signed 16-bit
-// integer in b is negative, and store the results in dst. Element in dst are
-// zeroed out when the corresponding element in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
-FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
-{
-    int16x4_t a = vreinterpret_s16_m64(_a);
-    int16x4_t b = vreinterpret_s16_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFF : 0
-    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
-
-    // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__)
-    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
-#else
-    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
-    // based on ltMask
-    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
-    // res = masked & (~zeroMask)
-    int16x4_t res = vbic_s16(masked, zeroMask);
-
-    return vreinterpret_m64_s16(res);
-}
-
-// Negate packed 32-bit integers in a when the corresponding signed 32-bit
-// integer in b is negative, and store the results in dst. Element in dst are
-// zeroed out when the corresponding element in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
-FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
-{
-    int32x2_t a = vreinterpret_s32_m64(_a);
-    int32x2_t b = vreinterpret_s32_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFFFFFFFF : 0
-    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
-
-    // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__)
-    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
-#else
-    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
-    // based on ltMask
-    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
-    // res = masked & (~zeroMask)
-    int32x2_t res = vbic_s32(masked, zeroMask);
-
-    return vreinterpret_m64_s32(res);
-}
-
-// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
-// in b is negative, and store the results in dst. Element in dst are zeroed out
-// when the corresponding element in b is zero.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
-FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
-{
-    int8x8_t a = vreinterpret_s8_m64(_a);
-    int8x8_t b = vreinterpret_s8_m64(_b);
-
-    // signed shift right: faster than vclt
-    // (b < 0) ? 0xFF : 0
-    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
-
-    // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__)
-    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
-#else
-    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
-    // based on ltMask
-    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
-    // res = masked & (~zeroMask)
-    int8x8_t res = vbic_s8(masked, zeroMask);
-
-    return vreinterpret_m64_s8(res);
-}
-
-/* SSE4.1 */
-
-// Blend packed 16-bit integers from a and b using control mask imm8, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
-// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
-//                                      __constrange(0,255) int imm)
-#define _mm_blend_epi16(a, b, imm)                                            \
-    __extension__({                                                           \
-        const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,  \
-                                   ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
-        uint16x8_t _mask_vec = vld1q_u16(_mask);                              \
-        uint16x8_t _a = vreinterpretq_u16_m128i(a);                           \
-        uint16x8_t _b = vreinterpretq_u16_m128i(b);                           \
-        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));                \
-    })
-
-// Blend packed double-precision (64-bit) floating-point elements from a and b
-// using control mask imm8, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
-#define _mm_blend_pd(a, b, imm)                                \
-    __extension__({                                            \
-        const uint64_t _mask[2] = {                            \
-            ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),   \
-            ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)};  \
-        uint64x2_t _mask_vec = vld1q_u64(_mask);               \
-        uint64x2_t _a = vreinterpretq_u64_m128d(a);            \
-        uint64x2_t _b = vreinterpretq_u64_m128d(b);            \
-        vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
-    })
-
-// Blend packed single-precision (32-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
-FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
-{
-    const uint32_t ALIGN_STRUCT(16)
-        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
-                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
-    uint32x4_t mask = vld1q_u32(data);
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
-}
-
-// Blend packed 8-bit integers from a and b using mask, and store the results in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
-FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
-{
-    // Use a signed shift right to create a mask with the sign bit
-    uint8x16_t mask =
-        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
-    uint8x16_t a = vreinterpretq_u8_m128i(_a);
-    uint8x16_t b = vreinterpretq_u8_m128i(_b);
-    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
-}
-
-// Blend packed double-precision (64-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
-FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
-{
-    uint64x2_t mask =
-        vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
-#if defined(__aarch64__)
-    float64x2_t a = vreinterpretq_f64_m128d(_a);
-    float64x2_t b = vreinterpretq_f64_m128d(_b);
-    return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
-#else
-    uint64x2_t a = vreinterpretq_u64_m128d(_a);
-    uint64x2_t b = vreinterpretq_u64_m128d(_b);
-    return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
-#endif
-}
-
-// Blend packed single-precision (32-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
-FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
-{
-    // Use a signed shift right to create a mask with the sign bit
-    uint32x4_t mask =
-        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
-    float32x4_t a = vreinterpretq_f32_m128(_a);
-    float32x4_t b = vreinterpretq_f32_m128(_b);
-    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
-}
-
-// Round the packed double-precision (64-bit) floating-point elements in a up
-// to an integer value, and store the results as packed double-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
-FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
-#else
-    double *f = (double *) &a;
-    return _mm_set_pd(ceil(f[1]), ceil(f[0]));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a up to
-// an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
-FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
-#else
-    float *f = (float *) &a;
-    return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
-#endif
-}
-
-// Round the lower double-precision (64-bit) floating-point element in b up to
-// an integer value, store the result as a double-precision floating-point
-// element in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
-FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_ceil_pd(b));
-}
-
-// Round the lower single-precision (32-bit) floating-point element in b up to
-// an integer value, store the result as a single-precision floating-point
-// element in the lower element of dst, and copy the upper 3 packed elements
-// from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
-FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_ceil_ps(b));
-}
-
-// Compare packed 64-bit integers in a and b for equality, and store the results
-// in dst
-FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_u64(
-        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
-#else
-    // ARMv7 lacks vceqq_u64
-    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-    uint32x4_t cmp =
-        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
-    uint32x4_t swapped = vrev64q_u32(cmp);
-    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
-#endif
-}
-
-// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
-FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
-{
-    return vreinterpretq_m128i_s32(
-        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
-}
-
-// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
-FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
-{
-    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
-    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
-FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
-{
-    return vreinterpretq_m128i_s64(
-        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
-}
-
-// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
-FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
-{
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
-    return vreinterpretq_m128i_s16(s16x8);
-}
-
-// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
-// the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
-FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
-{
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
-    return vreinterpretq_m128i_s32(s32x4);
-}
-
-// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
-// integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
-FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
-{
-    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
-    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
-    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
-    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
-FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
-{
-    return vreinterpretq_m128i_u32(
-        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
-}
-
-// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
-FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
-{
-    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
-    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
-FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
-{
-    return vreinterpretq_m128i_u64(
-        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
-}
-
-// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
-FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
-{
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
-    return vreinterpretq_m128i_u16(u16x8);
-}
-
-// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
-// and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
-FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
-{
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
-    return vreinterpretq_m128i_u32(u32x4);
-}
-
-// Zero extend packed unsigned 8-bit integers in the low 8 byte sof a to packed
-// 64-bit integers, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
-FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
-{
-    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
-    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
-    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
-    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
-    return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Conditionally multiply the packed double-precision (64-bit) floating-point
-// elements in a and b using the high 4 bits in imm8, sum the four products, and
-// conditionally store the sum in dst using the low 4 bits of imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
-FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
-{
-    // Generate mask value from constant immediate bit value
-    const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
-    const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
-#if !SSE2NEON_PRECISE_DP
-    const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
-    const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
-#endif
-    // Conditional multiplication
-#if !SSE2NEON_PRECISE_DP
-    __m128d mul = _mm_mul_pd(a, b);
-    const __m128d mulMask =
-        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
-    __m128d tmp = _mm_and_pd(mul, mulMask);
-#else
-#if defined(__aarch64__)
-    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
-                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
-                             : 0;
-    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
-                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
-                             : 0;
-#else
-    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
-    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
-#endif
-    __m128d tmp = _mm_set_pd(d1, d0);
-#endif
-    // Sum the products
-#if defined(__aarch64__)
-    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
-#else
-    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
-#endif
-    // Conditionally store the sum
-    const __m128d sumMask =
-        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
-    __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
-    return res;
-}
-
-// Conditionally multiply the packed single-precision (32-bit) floating-point
-// elements in a and b using the high 4 bits in imm8, sum the four products,
-// and conditionally store the sum in dst using the low 4 bits of imm.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
-FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
-{
-#if defined(__aarch64__)
-    /* shortcuts */
-    if (imm == 0xFF) {
-        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
-    }
-    if (imm == 0x7F) {
-        float32x4_t m = _mm_mul_ps(a, b);
-        m[3] = 0;
-        return _mm_set1_ps(vaddvq_f32(m));
-    }
-#endif
-
-    float s = 0, c = 0;
-    float32x4_t f32a = vreinterpretq_f32_m128(a);
-    float32x4_t f32b = vreinterpretq_f32_m128(b);
-
-    /* To improve the accuracy of floating-point summation, Kahan algorithm
-     * is used for each operation.
-     */
-    if (imm & (1 << 4))
-        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
-    if (imm & (1 << 5))
-        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
-    if (imm & (1 << 6))
-        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
-    if (imm & (1 << 7))
-        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
-    s += c;
-
-    float32x4_t res = {
-        (imm & 0x1) ? s : 0,
-        (imm & 0x2) ? s : 0,
-        (imm & 0x4) ? s : 0,
-        (imm & 0x8) ? s : 0,
-    };
-    return vreinterpretq_m128_f32(res);
-}
-
-// Extract a 32-bit integer from a, selected with imm8, and store the result in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
-// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
-#define _mm_extract_epi32(a, imm) \
-    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
-
-// Extract a 64-bit integer from a, selected with imm8, and store the result in
-// dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
-// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
-#define _mm_extract_epi64(a, imm) \
-    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
-
-// Extract an 8-bit integer from a, selected with imm8, and store the result in
-// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
-// __constrange(0,16) int imm)
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
-#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
-
-// Extracts the selected single-precision (32-bit) floating-point from a.
-// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
-#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
-
-// Round the packed double-precision (64-bit) floating-point elements in a down
-// to an integer value, and store the results as packed double-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
-FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
-#else
-    double *f = (double *) &a;
-    return _mm_set_pd(floor(f[1]), floor(f[0]));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a down
-// to an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
-FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
-#else
-    float *f = (float *) &a;
-    return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
-#endif
-}
-
-// Round the lower double-precision (64-bit) floating-point element in b down to
-// an integer value, store the result as a double-precision floating-point
-// element in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
-FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
-{
-    return _mm_move_sd(a, _mm_floor_pd(b));
-}
-
-// Round the lower single-precision (32-bit) floating-point element in b down to
-// an integer value, store the result as a single-precision floating-point
-// element in the lower element of dst, and copy the upper 3 packed elements
-// from a to the upper elements of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
-FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
-{
-    return _mm_move_ss(a, _mm_floor_ps(b));
-}
-
-// Copy a to dst, and insert the 32-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
-// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
-//                                       __constrange(0,4) int imm)
-#define _mm_insert_epi32(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s32(                                     \
-            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
-    })
-
-// Copy a to dst, and insert the 64-bit integer i into dst at the location
-// specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
-// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
-//                                       __constrange(0,2) int imm)
-#define _mm_insert_epi64(a, b, imm)                                  \
-    __extension__({                                                  \
-        vreinterpretq_m128i_s64(                                     \
-            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
-    })
-
-// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
-// location specified by imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
-// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
-//                                      __constrange(0,16) int imm)
-#define _mm_insert_epi8(a, b, imm)                                 \
-    __extension__({                                                \
-        vreinterpretq_m128i_s8(                                    \
-            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
-    })
-
-// Copy a to tmp, then insert a single-precision (32-bit) floating-point
-// element from b into tmp using the control in imm8. Store tmp to dst using
-// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
-#define _mm_insert_ps(a, b, imm8)                                              \
-    __extension__({                                                            \
-        float32x4_t tmp1 =                                                     \
-            vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3),               \
-                           vreinterpretq_f32_m128(a), 0);                      \
-        float32x4_t tmp2 =                                                     \
-            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
-                           ((imm8 >> 4) & 0x3));                               \
-        const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,        \
-                                  ((imm8) & (1 << 1)) ? UINT32_MAX : 0,        \
-                                  ((imm8) & (1 << 2)) ? UINT32_MAX : 0,        \
-                                  ((imm8) & (1 << 3)) ? UINT32_MAX : 0};       \
-        uint32x4_t mask = vld1q_u32(data);                                     \
-        float32x4_t all_zeros = vdupq_n_f32(0);                                \
-                                                                               \
-        vreinterpretq_m128_f32(                                                \
-            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2)));         \
-    })
-
-// Compare packed signed 32-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
-FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
-FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
-FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
-FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
-}
-
-// Compare packed signed 32-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
-FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
-FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s8(
-        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
-FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
-FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u32(
-        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
-}
-
-// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
-// in a, store the minimum and index in dst, and zero the remaining bits in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
-FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
-{
-    __m128i dst;
-    uint16_t min, idx = 0;
-#if defined(__aarch64__)
-    // Find the minimum value
-    min = vminvq_u16(vreinterpretq_u16_m128i(a));
-
-    // Get the index of the minimum value
-    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
-    uint16x8_t minv = vdupq_n_u16(min);
-    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
-    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
-#else
-    // Find the minimum value
-    __m64 tmp;
-    tmp = vreinterpret_m64_u16(
-        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
-                 vget_high_u16(vreinterpretq_u16_m128i(a))));
-    tmp = vreinterpret_m64_u16(
-        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
-    tmp = vreinterpret_m64_u16(
-        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
-    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
-    // Get the index of the minimum value
-    int i;
-    for (i = 0; i < 8; i++) {
-        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
-            idx = (uint16_t) i;
-            break;
-        }
-        a = _mm_srli_si128(a, 2);
-    }
-#endif
-    // Generate result
-    dst = _mm_setzero_si128();
-    dst = vreinterpretq_m128i_u16(
-        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
-    dst = vreinterpretq_m128i_u16(
-        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
-    return dst;
-}
-
-// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
-// 8-bit integers in a compared to those in b, and store the 16-bit results in
-// dst. Eight SADs are performed using one quadruplet from b and eight
-// quadruplets from a. One quadruplet is selected from b starting at on the
-// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
-// integers selected from a starting at the offset specified in imm8.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
-FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
-{
-    uint8x16_t _a, _b;
-
-    switch (imm & 0x4) {
-    case 0:
-        // do nothing
-        _a = vreinterpretq_u8_m128i(a);
-        break;
-    case 4:
-        _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
-                                            vreinterpretq_u32_m128i(a), 1));
-        break;
-    default:
-#if defined(__GNUC__) || defined(__clang__)
-        __builtin_unreachable();
-#endif
-        break;
-    }
-
-    switch (imm & 0x3) {
-    case 0:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
-        break;
-    case 1:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
-        break;
-    case 2:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
-        break;
-    case 3:
-        _b = vreinterpretq_u8_u32(
-            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
-        break;
-    default:
-#if defined(__GNUC__) || defined(__clang__)
-        __builtin_unreachable();
-#endif
-        break;
-    }
-
-    int16x8_t c04, c15, c26, c37;
-    uint8x8_t low_b = vget_low_u8(_b);
-    c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
-    uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
-    c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
-    uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
-    c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
-    uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
-    c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
-#if defined(__aarch64__)
-    // |0|4|2|6|
-    c04 = vpaddq_s16(c04, c26);
-    // |1|5|3|7|
-    c15 = vpaddq_s16(c15, c37);
-
-    int32x4_t trn1_c =
-        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
-    int32x4_t trn2_c =
-        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
-    return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
-                                              vreinterpretq_s16_s32(trn2_c)));
-#else
-    int16x4_t c01, c23, c45, c67;
-    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
-    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
-    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
-    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
-
-    return vreinterpretq_m128i_s16(
-        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
-#endif
-}
-
-// Multiply the low signed 32-bit integers from each packed 64-bit element in
-// a and b, and store the signed 64-bit results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
-FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
-{
-    // vmull_s32 upcasts instead of masking, so we downcast.
-    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
-    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
-    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
-}
-
-// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
-// integers, and store the low 32 bits of the intermediate integers in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
-FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_s32(
-        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-// using unsigned saturation, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
-FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u16(
-        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
-                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
-}
-
-// Round the packed double-precision (64-bit) floating-point elements in a using
-// the rounding parameter, and store the results as packed double-precision
-// floating-point elements in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
-FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
-{
-#if defined(__aarch64__)
-    switch (rounding) {
-    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
-    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
-        return _mm_floor_pd(a);
-    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
-        return _mm_ceil_pd(a);
-    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
-    default:  //_MM_FROUND_CUR_DIRECTION
-        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
-    }
-#else
-    double *v_double = (double *) &a;
-
-    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
-        (rounding == _MM_FROUND_CUR_DIRECTION &&
-         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
-        double res[2], tmp;
-        for (int i = 0; i < 2; i++) {
-            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
-            double roundDown = floor(tmp);  // Round down value
-            double roundUp = ceil(tmp);     // Round up value
-            double diffDown = tmp - roundDown;
-            double diffUp = roundUp - tmp;
-            if (diffDown < diffUp) {
-                /* If it's closer to the round down value, then use it */
-                res[i] = roundDown;
-            } else if (diffDown > diffUp) {
-                /* If it's closer to the round up value, then use it */
-                res[i] = roundUp;
-            } else {
-                /* If it's equidistant between round up and round down value,
-                 * pick the one which is an even number */
-                double half = roundDown / 2;
-                if (half != floor(half)) {
-                    /* If the round down value is odd, return the round up value
-                     */
-                    res[i] = roundUp;
-                } else {
-                    /* If the round up value is odd, return the round down value
-                     */
-                    res[i] = roundDown;
-                }
-            }
-            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
-        }
-        return _mm_set_pd(res[1], res[0]);
-    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
-        return _mm_floor_pd(a);
-    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
-        return _mm_ceil_pd(a);
-    }
-    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
-                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a using
-// the rounding parameter, and store the results as packed single-precision
-// floating-point elements in dst.
-// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
-FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
-{
-#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-    switch (rounding) {
-    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
-    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
-        return _mm_floor_ps(a);
-    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
-        return _mm_ceil_ps(a);
-    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
-        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
-    default:  //_MM_FROUND_CUR_DIRECTION
-        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
-    }
-#else
-    float *v_float = (float *) &a;
-
-    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
-        (rounding == _MM_FROUND_CUR_DIRECTION &&
-         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
-        uint32x4_t signmask = vdupq_n_u32(0x80000000);
-        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
-                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
-        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
-            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
-        int32x4_t r_trunc = vcvtq_s32_f32(
-            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
-        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
-            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
-        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
-                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
-        float32x4_t delta = vsubq_f32(
-            vreinterpretq_f32_m128(a),
-            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
-        uint32x4_t is_delta_half =
-            vceqq_f32(delta, half); /* delta == +/- 0.5 */
-        return vreinterpretq_m128_f32(
-            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
-    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
-        return _mm_floor_ps(a);
-    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
-               (rounding == _MM_FROUND_CUR_DIRECTION &&
-                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
-        return _mm_ceil_ps(a);
-    }
-    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
-                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
-                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
-                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
-#endif
-}
-
-// Round the lower double-precision (64-bit) floating-point element in b using
-// the rounding parameter, store the result as a double-precision floating-point
-// element in the lower element of dst, and copy the upper element from a to the
-// upper element of dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
-FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
-{
-    return _mm_move_sd(a, _mm_round_pd(b, rounding));
-}
-
-// Round the lower single-precision (32-bit) floating-point element in b using
-// the rounding parameter, store the result as a single-precision floating-point
-// element in the lower element of dst, and copy the upper 3 packed elements
-// from a to the upper elements of dst. Rounding is done according to the
-// rounding[3:0] parameter, which can be one of:
-//     (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
-//     suppress exceptions
-//     (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and
-//     suppress exceptions
-//     (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress
-//     exceptions
-//     (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress
-//     exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
-//     _MM_SET_ROUNDING_MODE
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
-FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
-{
-    return _mm_move_ss(a, _mm_round_ps(b, rounding));
-}
-
-// Load 128-bits of integer data from memory into dst using a non-temporal
-// memory hint. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
-FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-    return __builtin_nontemporal_load(p);
-#else
-    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
-#endif
-}
-
-// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
-// all 1's, and return 1 if the result is zero, otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
-FORCE_INLINE int _mm_test_all_ones(__m128i a)
-{
-    return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
-           ~(uint64_t) 0;
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and
-// mask, and return 1 if the result is zero, otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
-FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
-{
-    int64x2_t a_and_mask =
-        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
-    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and
-// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
-// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
-// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
-// otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
-FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
-{
-    uint64x2_t zf =
-        vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
-    uint64x2_t cf =
-        vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
-    uint64x2_t result = vandq_u64(zf, cf);
-    return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return the CF value.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
-FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
-{
-    int64x2_t s64 =
-        vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
-// otherwise return 0.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
-#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return the ZF value.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
-FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
-{
-    int64x2_t s64 =
-        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
-    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
-}
-
-/* SSE4.2 */
-
-const static uint16_t _sse2neon_cmpestr_mask16b[8] ALIGN_STRUCT(16) = {
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-};
-const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = {
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-};
-
-/* specify the source data format */
-#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
-#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
-#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
-#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
-
-/* specify the comparison operation */
-#define _SIDD_CMP_EQUAL_ANY 0x00     /* compare equal any: strchr */
-#define _SIDD_CMP_RANGES 0x04        /* compare ranges */
-#define _SIDD_CMP_EQUAL_EACH 0x08    /* compare equal each: strcmp */
-#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
-
-/* specify the polarity */
-#define _SIDD_POSITIVE_POLARITY 0x00
-#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
-#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
-#define _SIDD_MASKED_NEGATIVE_POLARITY \
-    0x30 /* negate results only before end of string */
-
-/* specify the output selection in _mm_cmpXstri */
-#define _SIDD_LEAST_SIGNIFICANT 0x00
-#define _SIDD_MOST_SIGNIFICANT 0x40
-
-/* specify the output selection in _mm_cmpXstrm */
-#define _SIDD_BIT_MASK 0x00
-#define _SIDD_UNIT_MASK 0x40
-
-/* Pattern Matching for C macros.
- * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
- */
-
-/* catenate */
-#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
-#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
-
-#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
-/* run the 2nd parameter */
-#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
-/* run the 1st parameter */
-#define SSE2NEON_IIF_1(t, ...) t
-
-#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
-#define SSE2NEON_COMPL_0 1
-#define SSE2NEON_COMPL_1 0
-
-#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
-#define SSE2NEON_DEC_1 0
-#define SSE2NEON_DEC_2 1
-#define SSE2NEON_DEC_3 2
-#define SSE2NEON_DEC_4 3
-#define SSE2NEON_DEC_5 4
-#define SSE2NEON_DEC_6 5
-#define SSE2NEON_DEC_7 6
-#define SSE2NEON_DEC_8 7
-#define SSE2NEON_DEC_9 8
-#define SSE2NEON_DEC_10 9
-#define SSE2NEON_DEC_11 10
-#define SSE2NEON_DEC_12 11
-#define SSE2NEON_DEC_13 12
-#define SSE2NEON_DEC_14 13
-#define SSE2NEON_DEC_15 14
-#define SSE2NEON_DEC_16 15
-
-/* detection */
-#define SSE2NEON_CHECK_N(x, n, ...) n
-#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
-#define SSE2NEON_PROBE(x) x, 1,
-
-#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
-#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
-
-#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
-#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
-
-#define SSE2NEON_EAT(...)
-#define SSE2NEON_EXPAND(...) __VA_ARGS__
-#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
-
-/* recursion */
-/* deferred expression */
-#define SSE2NEON_EMPTY()
-#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
-#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
-#define SSE2NEON_EXPAND(...) __VA_ARGS__
-
-#define SSE2NEON_EVAL(...) \
-    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
-#define SSE2NEON_EVAL1(...) \
-    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
-#define SSE2NEON_EVAL2(...) \
-    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
-#define SSE2NEON_EVAL3(...) __VA_ARGS__
-
-#define SSE2NEON_REPEAT(count, macro, ...)                         \
-    SSE2NEON_WHEN(count)                                           \
-    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()(                \
-        SSE2NEON_DEC(count), macro,                                \
-        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
-                                              __VA_ARGS__))
-#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
-
-#define SSE2NEON_SIZE_OF_byte 8
-#define SSE2NEON_NUMBER_OF_LANES_byte 16
-#define SSE2NEON_SIZE_OF_word 16
-#define SSE2NEON_NUMBER_OF_LANES_word 8
-
-#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type)                         \
-    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type(                          \
-        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
-        vreinterpretq_##type##_m128i(a)));
-
-#define SSE2NEON_FILL_LANE(i, type) \
-    vec_b[i] =                      \
-        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
-
-#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size,        \
-                       number_of_lanes, byte_or_word)                         \
-    do {                                                                      \
-        SSE2NEON_CAT(                                                         \
-            data_type_prefix,                                                 \
-            SSE2NEON_CAT(size,                                                \
-                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
-        vec_b[number_of_lanes];                                               \
-        __m128i mask = SSE2NEON_IIF(byte_or_word)(                            \
-            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)),                       \
-            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff)));                    \
-        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE,    \
-                                      SSE2NEON_CAT(type_prefix, size)))       \
-        for (int i = 0; i < number_of_lanes; i++) {                           \
-            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u,                      \
-                                  size)(SSE2NEON_CAT(vbslq_u, size)(          \
-                SSE2NEON_CAT(vreinterpretq_u,                                 \
-                             SSE2NEON_CAT(size, _m128i))(mask),               \
-                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))(        \
-                    vec_b[i],                                                 \
-                    SSE2NEON_CAT(                                             \
-                        vreinterpretq_,                                       \
-                        SSE2NEON_CAT(type_prefix,                             \
-                                     SSE2NEON_CAT(size, _m128i(a))))),        \
-                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))(        \
-                    vec_b[i],                                                 \
-                    SSE2NEON_CAT(                                             \
-                        vreinterpretq_,                                       \
-                        SSE2NEON_CAT(type_prefix,                             \
-                                     SSE2NEON_CAT(size, _m128i(a)))))));      \
-        }                                                                     \
-    } while (0)
-
-#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes)                         \
-    do {                                                                     \
-        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes,                       \
-                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
-                                      SSE2NEON_CAT(u, size)))                \
-    } while (0)
-
-#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type)                                     \
-    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
-                                                int lb)                       \
-    {                                                                         \
-        __m128i mtx[16];                                                      \
-        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),          \
-                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));            \
-        return SSE2NEON_CAT(                                                  \
-            _sse2neon_aggregate_equal_any_,                                   \
-            SSE2NEON_CAT(                                                     \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                        \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,       \
-                                             type))))(la, lb, mtx);           \
-    }
-
-#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word)            \
-    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
-                                                 int lb)                       \
-    {                                                                          \
-        __m128i mtx[16];                                                       \
-        PCMPSTR_RANGES(                                                        \
-            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),   \
-            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word);      \
-        return SSE2NEON_CAT(                                                   \
-            _sse2neon_aggregate_ranges_,                                       \
-            SSE2NEON_CAT(                                                      \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
-                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_,        \
-                                             type))))(la, lb, mtx);            \
-    }
-
-#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type)                                  \
-    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la,         \
-                                                    __m128i b, int lb)         \
-    {                                                                          \
-        __m128i mtx[16];                                                       \
-        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),           \
-                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type));             \
-        return SSE2NEON_CAT(                                                   \
-            _sse2neon_aggregate_equal_ordered_,                                \
-            SSE2NEON_CAT(                                                      \
-                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type),                         \
-                SSE2NEON_CAT(x,                                                \
-                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
-            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx);       \
-    }
-
-static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
-    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u8(
-            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u8(
-            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
-        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
-        res |= (tmp << j);
-    }
-    return res;
-}
-
-static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint16x8_t vec =
-        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u16(
-            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u16(
-            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
-        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
-        res |= (tmp << j);
-    }
-    return res;
-}
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
-    prefix##IMPL(byte) \
-    prefix##IMPL(word)
-/* clang-format on */
-
-SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
-
-static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint16x8_t vec =
-        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u16(
-            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u16(
-            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
-        __m128i tmp = vreinterpretq_m128i_u32(
-            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
-        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
-                                       vreinterpretq_u32_m128i(tmp));
-#if defined(__aarch64__)
-        int t = vaddvq_u32(vec_res) ? 1 : 0;
-#else
-        uint64x2_t sumh = vpaddlq_u32(vec_res);
-        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
-#endif
-        res |= (t << j);
-    }
-    return res;
-}
-
-static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
-{
-    int res = 0;
-    int m = (1 << la) - 1;
-    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
-    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
-    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
-    for (int j = 0; j < lb; j++) {
-        mtx[j] = vreinterpretq_m128i_u8(
-            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
-        mtx[j] = vreinterpretq_m128i_u8(
-            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
-        __m128i tmp = vreinterpretq_m128i_u16(
-            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
-        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
-                                       vreinterpretq_u16_m128i(tmp));
-        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
-        res |= (t << j);
-    }
-    return res;
-}
-
-#define SSE2NEON_CMP_RANGES_IS_BYTE 1
-#define SSE2NEON_CMP_RANGES_IS_WORD 0
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_CMP_RANGES(prefix)             \
-    prefix##IMPL(byte, uint, u, prefix##IS_BYTE)         \
-    prefix##IMPL(byte, int, s, prefix##IS_BYTE)          \
-    prefix##IMPL(word, uint, u, prefix##IS_WORD)         \
-    prefix##IMPL(word, int, s, prefix##IS_WORD)
-/* clang-format on */
-
-SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
-
-#undef SSE2NEON_CMP_RANGES_IS_BYTE
-#undef SSE2NEON_CMP_RANGES_IS_WORD
-
-static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
-{
-    uint8x16_t mtx =
-        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x10000 - (1 << la);
-    int tb = 0x10000 - (1 << lb);
-    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
-    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
-    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
-    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
-    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
-    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
-    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
-    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
-    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
-
-    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
-    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
-    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
-    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
-    res_lo = vand_u8(res_lo, vec_mask);
-    res_hi = vand_u8(res_hi, vec_mask);
-
-    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
-    return res;
-}
-
-static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
-{
-    uint16x8_t mtx =
-        vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
-    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
-    int m1 = 0x100 - (1 << la);
-    int tb = 0x100 - (1 << lb);
-    uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
-    uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
-    uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
-    uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
-    mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
-    mtx = vbslq_u16(vec1, tmp, mtx);
-    mtx = vandq_u16(mtx, vec_mask);
-    return _sse2neon_vaddvq_u16(mtx);
-}
-
-#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
-#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
-
-#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type)  \
-    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes(   \
-        int bound, int la, int lb, __m128i mtx[16])                            \
-    {                                                                          \
-        int res = 0;                                                           \
-        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la);          \
-        uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)(                   \
-            vld1_u##size(_sse2neon_cmpestr_mask##size##b),                     \
-            vld1q_u##size(_sse2neon_cmpestr_mask##size##b));                   \
-        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)(     \
-            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask),       \
-                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
-            vtstq_u##size(vdupq_n_u##size(m1), vec_mask));                     \
-        uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
-        uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0);      \
-        for (int j = 0; j < lb; j++) {                                         \
-            mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size(                \
-                vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j])));   \
-        }                                                                      \
-        for (int j = lb; j < bound; j++) {                                     \
-            mtx[j] = vreinterpretq_m128i_u##size(                              \
-                vbslq_u##size(vec1, vec_minusone, vec_zero));                  \
-        }                                                                      \
-        unsigned SSE2NEON_IIF(data_type)(char, short) *ptr =                   \
-            (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx;             \
-        for (int i = 0; i < bound; i++) {                                      \
-            int val = 1;                                                       \
-            for (int j = 0, k = i; j < bound - i && k < bound; j++, k++)       \
-                val &= ptr[k * bound + j];                                     \
-            res += val << i;                                                   \
-        }                                                                      \
-        return res;                                                            \
-    }
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
-    prefix##IMPL(8, 16, prefix##IS_UBYTE)               \
-    prefix##IMPL(16, 8, prefix##IS_UWORD)
-/* clang-format on */
-
-SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
-
-#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
-#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
-
-/* clang-format off */
-#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
-    prefix##IMPL(byte)                              \
-    prefix##IMPL(word)
-/* clang-format on */
-
-SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
-
-#define SSE2NEON_CMPESTR_LIST                          \
-    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
-    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any)         \
-    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any)         \
-    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any)         \
-    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges)              \
-    _(CMP_UWORD_RANGES, cmp_uword_ranges)              \
-    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges)              \
-    _(CMP_SWORD_RANGES, cmp_sword_ranges)              \
-    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
-    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each)       \
-    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each)       \
-    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each)       \
-    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
-    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
-    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
-    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
-
-enum {
-#define _(name, func_suffix) name,
-    SSE2NEON_CMPESTR_LIST
-#undef _
-};
-typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
-static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
-#define _(name, func_suffix) _sse2neon_##func_suffix,
-    SSE2NEON_CMPESTR_LIST
-#undef _
-};
-
-FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
-{
-    switch (imm8 & 0x30) {
-    case _SIDD_NEGATIVE_POLARITY:
-        res ^= 0xffffffff;
-        break;
-    case _SIDD_MASKED_NEGATIVE_POLARITY:
-        res ^= (1 << lb) - 1;
-        break;
-    default:
-        break;
-    }
-
-    return res & ((bound == 8) ? 0xFF : 0xFFFF);
-}
-
-FORCE_INLINE int _sse2neon_clz(unsigned int x)
-{
-#if _MSC_VER
-    DWORD cnt = 0;
-    if (_BitScanForward(&cnt, x))
-        return cnt;
-    return 32;
-#else
-    return x != 0 ? __builtin_clz(x) : 32;
-#endif
-}
-
-FORCE_INLINE int _sse2neon_ctz(unsigned int x)
-{
-#if _MSC_VER
-    DWORD cnt = 0;
-    if (_BitScanReverse(&cnt, x))
-        return 31 - cnt;
-    return 32;
-#else
-    return x != 0 ? __builtin_ctz(x) : 32;
-#endif
-}
-
-FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
-{
-#if _MSC_VER
-    unsigned long cnt;
-#ifdef defined(SSE2NEON_HAS_BITSCAN64)
-    (defined(_M_AMD64) || defined(__x86_64__))
-        if((_BitScanForward64(&cnt, x))
-            return (int)(cnt);
-#else
-    if (_BitScanForward(&cnt, (unsigned long) (x)))
-        return (int) cnt;
-    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
-        return (int) (cnt + 32);
-#endif
-    return 64;
-#else
-    return x != 0 ? __builtin_ctzll(x) : 64;
-#endif
-}
-
-#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
-
-#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
-    const int var = (imm & 0x01) ? 8 : 16
-
-#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
-    int tmp1 = la ^ (la >> 31);                  \
-    la = tmp1 - (la >> 31);                      \
-    int tmp2 = lb ^ (lb >> 31);                  \
-    lb = tmp2 - (lb >> 31);                      \
-    la = SSE2NEON_MIN(la, bound);                \
-    lb = SSE2NEON_MIN(lb, bound)
-
-// Compare all pairs of character in string a and b,
-// then aggregate the result.
-// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the
-// length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of
-// string a and b.
-#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE)                  \
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);                        \
-    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb);                        \
-    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
-    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
-
-#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
-    return (r2 == 0) ? bound                                     \
-                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
-                                      : _sse2neon_ctz(r2))
-
-#define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
-    __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
-    if (imm8 & 0x40) {                                                         \
-        if (bound == 8) {                                                      \
-            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
-                                       vld1q_u16(_sse2neon_cmpestr_mask16b));  \
-            dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
-                tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
-        } else {                                                               \
-            uint8x16_t vec_r2 =                                                \
-                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
-            uint8x16_t tmp =                                                   \
-                vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
-            dst = vreinterpretq_m128i_u8(                                      \
-                vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst)));   \
-        }                                                                      \
-    } else {                                                                   \
-        if (bound == 16) {                                                     \
-            dst = vreinterpretq_m128i_u16(                                     \
-                vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
-        } else {                                                               \
-            dst = vreinterpretq_m128i_u8(                                      \
-                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
-        }                                                                      \
-    }                                                                          \
-    return dst
-
-// Compare packed strings in a and b with lengths la and lb using the control
-// in imm8, and returns 1 if b did not contain a null character and the
-// resulting mask was zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
-FORCE_INLINE int _mm_cmpestra(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    int lb_cpy = lb;
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    return !r2 & (lb_cpy > bound);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
-FORCE_INLINE int _mm_cmpestrc(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    return r2 != 0;
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control
-// in imm8, and store the generated index in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
-FORCE_INLINE int _mm_cmpestri(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control
-// in imm8, and store the generated mask in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
-FORCE_INLINE __m128i
-_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns bit 0 of the resulting bit mask.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
-FORCE_INLINE int _mm_cmpestro(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
-    return r2 & 1;
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns 1 if any character in a was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
-FORCE_INLINE int _mm_cmpestrs(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    return la <= (bound - 1);
-}
-
-// Compare packed strings in a and b with lengths la and lb using the control in
-// imm8, and returns 1 if any character in b was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
-FORCE_INLINE int _mm_cmpestrz(__m128i a,
-                              int la,
-                              __m128i b,
-                              int lb,
-                              const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    return lb <= (bound - 1);
-}
-
-#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
-    do {                                                                 \
-        if (imm8 & 0x01) {                                               \
-            uint16x8_t equal_mask_##str =                                \
-                vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
-            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
-            uint64_t matches_##str =                                     \
-                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
-            len = _sse2neon_ctzll(matches_##str) >> 3;                   \
-        } else {                                                         \
-            uint16x8_t equal_mask_##str = vreinterpretq_u16_u8(          \
-                vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0)));   \
-            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
-            uint64_t matches_##str =                                     \
-                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
-            len = _sse2neon_ctzll(matches_##str) >> 2;                   \
-        }                                                                \
-    } while (0)
-
-#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
-    int la, lb;                                  \
-    do {                                         \
-        SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);   \
-        SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);   \
-    } while (0)
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if b did not contain a null character and the resulting
-// mask was zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
-FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    return !r2 & (lb >= bound);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
-FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    return r2 != 0;
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and store the generated index in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
-FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and store the generated mask in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
-FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns bit 0 of the resulting bit mask.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
-FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
-    return r2 & 1;
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if any character in a was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
-FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    int la;
-    SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
-    return la <= (bound - 1);
-}
-
-// Compare packed strings with implicit lengths in a and b using the control in
-// imm8, and returns 1 if any character in b was null, and 0 otherwise.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
-FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
-{
-    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
-    int lb;
-    SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
-    return lb <= (bound - 1);
-}
-
-// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
-// in b for greater than.
-FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128i_u64(
-        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
-    return vreinterpretq_m128i_s64(vshrq_n_s64(
-        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
-        63));
-#endif
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 16-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
-FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
-    crc = __crc32ch(crc, v);
-#else
-    crc = _mm_crc32_u8(crc, v & 0xff);
-    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
-#endif
-    return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 32-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
-FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
-    crc = __crc32cw(crc, v);
-#else
-    crc = _mm_crc32_u16(crc, v & 0xffff);
-    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
-#endif
-    return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 64-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
-FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#else
-    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
-    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
-#endif
-    return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 8-bit integer v, and stores the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
-FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
-                         : [c] "+r"(crc)
-                         : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
-    crc = __crc32cb(crc, v);
-#else
-    crc ^= v;
-    for (int bit = 0; bit < 8; bit++) {
-        if (crc & 1)
-            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
-        else
-            crc = (crc >> 1);
-    }
-#endif
-    return crc;
-}
-
-/* AES */
-
-#if !defined(__ARM_FEATURE_CRYPTO)
-/* clang-format off */
-#define SSE2NEON_AES_SBOX(w)                                           \
-    {                                                                  \
-        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
-        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
-        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
-        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
-        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
-        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
-        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
-        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
-        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
-        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
-        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
-        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
-        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
-        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
-        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
-        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
-        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
-        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
-        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
-        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
-        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
-        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
-        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
-        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
-        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
-        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
-        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
-        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
-        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
-        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
-        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
-        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
-        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
-        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
-        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
-        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
-        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
-    }
-#define SSE2NEON_AES_RSBOX(w)                                          \
-    {                                                                  \
-        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
-        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
-        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
-        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
-        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
-        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
-        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
-        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
-        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
-        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
-        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
-        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
-        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
-        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
-        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
-        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
-        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
-        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
-        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
-        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
-        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
-        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
-        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
-        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
-        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
-        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
-        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
-        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
-        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
-        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
-        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
-        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
-        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
-        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
-        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
-        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
-        w(0x55), w(0x21), w(0x0c), w(0x7d)                             \
-    }
-/* clang-format on */
-
-/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
-#define SSE2NEON_AES_H0(x) (x)
-static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
-static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
-#undef SSE2NEON_AES_H0
-
-/* x_time function and matrix multiply function */
-#if !defined(__aarch64__)
-#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
-#define SSE2NEON_MULTIPLY(x, y)                                  \
-    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
-     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
-     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
-     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
-#endif
-
-// In the absence of crypto extensions, implement aesenc using regular NEON
-// intrinsics instead. See:
-// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
-// for more information.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t shift_rows[] = {
-        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
-        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
-    };
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    /* shift rows */
-    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
-    /* sub bytes */
-    // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
-    // look up each of the table. After each lookup, we load the next table
-    // which locates at the next 64-bytes. In the meantime, the index in the
-    // table would be smaller than it was, so the index parameters of
-    // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-    // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
-
-    /* mix columns */
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-
-    /* add round key */
-    return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
-#else /* ARMv7-A implementation for a table-based AES */
-#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
-    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
-     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
-// muliplying 'x' by 2 in GF(2^8)
-#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
-// muliplying 'x' by 3 in GF(2^8)
-#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
-#define SSE2NEON_AES_U0(p) \
-    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
-#define SSE2NEON_AES_U1(p) \
-    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
-#define SSE2NEON_AES_U2(p) \
-    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
-#define SSE2NEON_AES_U3(p) \
-    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
-
-    // this generates a table containing every possible permutation of
-    // shift_rows() and sub_bytes() with mix_columns().
-    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
-        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
-    };
-#undef SSE2NEON_AES_B2W
-#undef SSE2NEON_AES_F2
-#undef SSE2NEON_AES_F3
-#undef SSE2NEON_AES_U0
-#undef SSE2NEON_AES_U1
-#undef SSE2NEON_AES_U2
-#undef SSE2NEON_AES_U3
-
-    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
-    uint32_t x1 =
-        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
-    uint32_t x2 =
-        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
-    uint32_t x3 =
-        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]
-
-    // finish the modulo addition step in mix_columns()
-    __m128i out = _mm_set_epi32(
-        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
-         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
-        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
-         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
-        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
-         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
-        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
-         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
-
-    return _mm_xor_si128(out, RoundKey);
-#endif
-}
-
-// Perform one round of an AES decryption flow on data (state) in a using the
-// round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
-FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t inv_shift_rows[] = {
-        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-    };
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    // inverse shift rows
-    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
-
-    // inverse sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-
-    // inverse mix columns
-    // muliplying 'v' by 4 in GF(2^8)
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
-    v ^= w;
-    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
-                                 0x1b);  // muliplying 'v' by 2 in GF(2^8)
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-
-    // add round key
-    return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
-#else /* ARMv7-A NEON implementation */
-    /* FIXME: optimized for NEON */
-    uint8_t i, e, f, g, h, v[4][4];
-    uint8_t *_a = (uint8_t *) &a;
-    for (i = 0; i < 16; ++i) {
-        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
-    }
-
-    // inverse mix columns
-    for (i = 0; i < 4; ++i) {
-        e = v[i][0];
-        f = v[i][1];
-        g = v[i][2];
-        h = v[i][3];
-
-        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
-    }
-
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
-#endif
-}
-
-// Perform the last round of an AES encryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
-FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t shift_rows[] = {
-        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
-        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    // shift rows
-    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
-    // sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
-
-    // add round key
-    return vreinterpretq_m128i_u8(v) ^ RoundKey;
-
-#else /* ARMv7-A implementation */
-    uint8_t v[16] = {
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
-        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
-    };
-
-    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
-#endif
-}
-
-// Perform the last round of an AES decryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-{
-#if defined(__aarch64__)
-    static const uint8_t inv_shift_rows[] = {
-        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-    };
-
-    uint8x16_t v;
-    uint8x16_t w = vreinterpretq_u8_m128i(a);
-
-    // inverse shift rows
-    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
-
-    // inverse sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-
-    // add round key
-    return vreinterpretq_m128i_u8(v) ^ RoundKey;
-
-#else /* ARMv7-A NEON implementation */
-    /* FIXME: optimized for NEON */
-    uint8_t v[4][4];
-    uint8_t *_a = (uint8_t *) &a;
-    for (int i = 0; i < 16; ++i) {
-        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
-    }
-
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
-#endif
-}
-
-// Perform the InvMixColumns transformation on a and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-{
-#if defined(__aarch64__)
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-    uint8x16_t v = vreinterpretq_u8_m128i(a);
-    uint8x16_t w;
-
-    // multiplying 'v' by 4 in GF(2^8)
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
-    v ^= w;
-    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-
-    // multiplying 'v' by 2 in GF(2^8)
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-    return vreinterpretq_m128i_u8(w);
-
-#else /* ARMv7-A NEON implementation */
-    uint8_t i, e, f, g, h, v[4][4];
-    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
-    for (i = 0; i < 4; ++i) {
-        e = v[i][0];
-        f = v[i][1];
-        g = v[i][2];
-        h = v[i][3];
-
-        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
-    }
-
-    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
-#endif
-}
-
-// Assist in expanding the AES cipher key by computing steps towards generating
-// a round key for encryption cipher using data from a and an 8-bit round
-// constant specified in imm8, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
-//
-// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
-// This instruction generates a round key for AES encryption. See
-// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
-// for details.
-FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
-{
-#if defined(__aarch64__)
-    uint8x16_t _a = vreinterpretq_u8_m128i(a);
-    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
-
-    uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
-    uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
-    uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
-
-    return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
-
-#else /* ARMv7-A NEON implementation */
-    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
-    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
-    for (int i = 0; i < 4; ++i) {
-        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
-        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
-    }
-    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
-                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
-#endif
-}
-#undef SSE2NEON_AES_SBOX
-#undef SSE2NEON_AES_RSBOX
-
-#if defined(__aarch64__)
-#undef SSE2NEON_XT
-#undef SSE2NEON_MULTIPLY
-#endif
-
-#else /* __ARM_FEATURE_CRYPTO */
-// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
-// AESMC and then manually applying the real key as an xor operation. This
-// unfortunately means an additional xor op; the compiler should be able to
-// optimize this away for repeated calls however. See
-// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
-// for more details.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
-{
-    return vreinterpretq_m128i_u8(
-        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
-        vreinterpretq_u8_m128i(b));
-}
-
-// Perform one round of an AES decryption flow on data (state) in a using the
-// round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
-FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
-{
-    return vreinterpretq_m128i_u8(veorq_u8(
-        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
-        vreinterpretq_u8_m128i(RoundKey)));
-}
-
-// Perform the last round of an AES encryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
-FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-{
-    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
-                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
-                         RoundKey);
-}
-
-// Perform the last round of an AES decryption flow on data (state) in a using
-// the round key in RoundKey, and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-{
-    return vreinterpretq_m128i_u8(
-        vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^
-        vreinterpretq_u8_m128i(RoundKey));
-}
-
-// Perform the InvMixColumns transformation on a and store the result in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-{
-    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
-}
-
-// Assist in expanding the AES cipher key by computing steps towards generating
-// a round key for encryption cipher using data from a and an 8-bit round
-// constant specified in imm8, and store the result in dst."
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
-FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
-{
-    // AESE does ShiftRows and SubBytes on A
-    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
-
-    uint8x16_t dest = {
-        // Undo ShiftRows step from AESE and extract X1 and X3
-        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
-        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
-        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
-        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
-    };
-    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
-    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
-}
-#endif
-
-/* Others */
-
-// Perform a carry-less multiplication of two 64-bit integers, selected from a
-// and b according to imm8, and store the results in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
-FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
-{
-    uint64x2_t a = vreinterpretq_u64_m128i(_a);
-    uint64x2_t b = vreinterpretq_u64_m128i(_b);
-    switch (imm & 0x11) {
-    case 0x00:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
-    case 0x01:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
-    case 0x10:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
-    case 0x11:
-        return vreinterpretq_m128i_u64(
-            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
-    default:
-        abort();
-    }
-}
-
-FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
-{
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
-}
-
-// Count the number of bits set to 1 in unsigned 32-bit integer a, and
-// return that count in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
-FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcount)
-    return __builtin_popcount(a);
-#else
-    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
-#endif
-#else
-    uint32_t count = 0;
-    uint8x8_t input_val, count8x8_val;
-    uint16x4_t count16x4_val;
-    uint32x2_t count32x2_val;
-
-    input_val = vld1_u8((uint8_t *) &a);
-    count8x8_val = vcnt_u8(input_val);
-    count16x4_val = vpaddl_u8(count8x8_val);
-    count32x2_val = vpaddl_u16(count16x4_val);
-
-    vst1_u32(&count, count32x2_val);
-    return count;
-#endif
-}
-
-// Count the number of bits set to 1 in unsigned 64-bit integer a, and
-// return that count in dst.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
-FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcountll)
-    return __builtin_popcountll(a);
-#else
-    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
-#endif
-#else
-    uint64_t count = 0;
-    uint8x8_t input_val, count8x8_val;
-    uint16x4_t count16x4_val;
-    uint32x2_t count32x2_val;
-    uint64x1_t count64x1_val;
-
-    input_val = vld1_u8((uint8_t *) &a);
-    count8x8_val = vcnt_u8(input_val);
-    count16x4_val = vpaddl_u8(count8x8_val);
-    count32x2_val = vpaddl_u16(count16x4_val);
-    count64x1_val = vpaddl_u32(count32x2_val);
-    vst1_u64(&count, count64x1_val);
-    return count;
-#endif
-}
-
-FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
-{
-    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
-    // regardless of the value of the FZ bit.
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
-    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
-#endif
-}
-
-// Return the current 64-bit value of the processor's time-stamp counter.
-// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
-FORCE_INLINE uint64_t _rdtsc(void)
-{
-#if defined(__aarch64__)
-    uint64_t val;
-
-    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
-     * system counter is at least 56 bits wide; from Armv8.6, the counter
-     * must be 64 bits wide.  So the system counter could be less than 64
-     * bits wide and it is attributed with the flag 'cap_user_time_short'
-     * is true.
-     */
-    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
-
-    return val;
-#else
-    uint32_t pmccntr, pmuseren, pmcntenset;
-    // Read the user mode Performance Monitoring Unit (PMU)
-    // User Enable Register (PMUSERENR) access permissions.
-    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
-    if (pmuseren & 1) {  // Allows reading PMUSERENR for user mode code.
-        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
-        if (pmcntenset & 0x80000000UL) {  // Is it counting?
-            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
-            // The counter is set up to count every 64th cycle
-            return (uint64_t) (pmccntr) << 6;
-        }
-    }
-
-    // Fallback to syscall as we can't enable PMUSERENR in user mode.
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
-    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
-#endif
-}
-
-#if defined(__GNUC__) || defined(__clang__)
-#pragma pop_macro("ALIGN_STRUCT")
-#pragma pop_macro("FORCE_INLINE")
-#endif
-
-#if defined(__GNUC__) && !defined(__clang__)
-#pragma GCC pop_options
-#endif
-
-#endif
\ No newline at end of file
diff --git a/share/cmake/modules/Findsse2neon.cmake b/share/cmake/modules/Findsse2neon.cmake
new file mode 100644
index 0000000000..7dea6740cb
--- /dev/null
+++ b/share/cmake/modules/Findsse2neon.cmake
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Locate sse2neon (header-only version)
+#
+# Variables defined by this module:
+#   sse2neon_FOUND          - Indicate whether the library was found or not
+#   sse2neon_INCLUDE_DIR    - Location of the header files
+#
+# Global targets defined by this module:
+#   sse2neon
+###############################################################################
+### Try to find package ###
+
+if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
+    # Find include directory
+    find_path(sse2neon_INCLUDE_DIR
+        NAMES
+            sse2neon.h
+        HINTS
+            ${sse2neon_ROOT}
+        PATH_SUFFIXES
+            include
+            sse2neon/include
+    )
+
+    # Override REQUIRED if package can be installed
+    if(OCIO_INSTALL_EXT_PACKAGES STREQUAL MISSING)
+        set(sse2neon_FIND_REQUIRED FALSE)
+    endif()
+
+    include(FindPackageHandleStandardArgs)
+    find_package_handle_standard_args(sse2neon
+        REQUIRED_VARS 
+            sse2neon_INCLUDE_DIR 
+    )
+    set(sse2neon_FOUND ${sse2neon_FOUND})
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(sse2neon_FOUND AND NOT TARGET sse2neon)
+    # INTERFACE type since we know that this is a header-only library.
+    add_library(sse2neon INTERFACE IMPORTED GLOBAL)
+    set(_sse2neon_TARGET_CREATE TRUE)
+endif()
+
+###############################################################################
+### Configure target ###
+
+if(_sse2neon_TARGET_CREATE)
+    set_target_properties(sse2neon PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES ${sse2neon_INCLUDE_DIR}
+    )
+
+    mark_as_advanced(sse2neon_INCLUDE_DIR)
+endif()
\ No newline at end of file

From a2537299fffc40209bed40829ccdcea499079d2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Wed, 15 Feb 2023 13:44:25 -0500
Subject: [PATCH 72/81] Removing specifics compiler options for NEON since they
 are not needed on Apple and might interfere with optimization if not careful.
 Updated CheckSupportSSE2.cmake and adding some checks in SSE.h to support
 universal build. Added comments and new define (USING_INTEL_SSE,
 USING_ARM_NEON and USING_CPP) in LogOpCPU_tests which should help with
 understand what is going on. Checking for SSE2 and ARM Neon only when
 OCIO_USE_SSE is ON.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 share/cmake/utils/CheckSupportSSE2.cmake |  2 ++
 share/cmake/utils/CompilerFlags.cmake    | 10 ++++++----
 tests/cpu/ops/log/LogOpCPU_tests.cpp     | 14 +++++++-------
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake
index 07fecbd7a5..de2e734868 100644
--- a/share/cmake/utils/CheckSupportSSE2.cmake
+++ b/share/cmake/utils/CheckSupportSSE2.cmake
@@ -7,6 +7,8 @@ set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}")
 set(_cmake_required_includes_orig "${CMAKE_REQUIRED_INCLUDES}")
 set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
 
+set(_cmake_osx_architectures_old "${CMAKE_OSX_ARCHITECTURES}")
+
 if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8)
     # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger
     # unrelated warnings causing a detection failure. So, the code disables all warnings to focus
diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index 3346c22a01..87292785fa 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -25,14 +25,16 @@ endif()
 
 ###############################################################################
 # Define if SSE2 can be used.
-# Check for SSE2 first since some compile flags need to be set on Apple ARM.
 
-include(CheckSupportSSE2)
+# Check for SSE2 only if the user asked for it.
+if(OCIO_USE_SSE)
+    include(CheckSupportSSE2)
+endif()
 
-if(NOT HAVE_SSE2)
+if(NOT HAVE_SSE2 AND NOT HAVE_SSE2_WITH_SSE2NEON)
     message(STATUS "Disabling SSE optimizations, as the target doesn't support them")
     set(OCIO_USE_SSE OFF)
-endif(NOT HAVE_SSE2)
+endif()
 
 ###############################################################################
 # Compile flags
diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 06e98fe0da..313cf4ee23 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -35,7 +35,7 @@ void TestLog(float logBase)
 
     // LogOpCPU implementation uses optimized logarithm approximation
     // cannot use strict comparison.
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error = 5e-5f;
 #else
     const float error = 1e-5f;
@@ -477,7 +477,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, numPixels);
 
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
@@ -490,7 +490,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error);
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error);
@@ -525,7 +525,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error);
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error);
@@ -552,7 +552,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRendererNoBreak = OCIO::GetLogRenderer(lognobreak, true);
     pRendererNoBreak->apply(rgbaImage, rgba_nobreak, numPixels);
 
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error2 = 1e-5f;
 #else
     const float error2 = 1e-7f;
@@ -570,7 +570,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2);
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2);
 #else
     OCIO_CHECK_EQUAL(rgba_nobreak[9], inf);
@@ -596,7 +596,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, 3);
 
-#ifdef USE_SSE
+#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;

From 7afcd7ae61d2be51da597e8e734e7dede865a68b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 16 Feb 2023 09:18:49 -0500
Subject: [PATCH 73/81] Tweaking comments, some cmake variables naming and
 other minors changes.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 share/cmake/utils/CompilerFlags.cmake | 1 -
 1 file changed, 1 deletion(-)

diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index 87292785fa..a445466bdf 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -26,7 +26,6 @@ endif()
 ###############################################################################
 # Define if SSE2 can be used.
 
-# Check for SSE2 only if the user asked for it.
 if(OCIO_USE_SSE)
     include(CheckSupportSSE2)
 endif()

From 3b139b6e3915be43465ec3f4b78329fa41f68c08 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Thu, 16 Feb 2023 11:57:32 -0500
Subject: [PATCH 74/81] Adding a new option OCIO_USE_SIMD which does the same
 thing as OCIO_USE_SSE (they are sync up). Will replace OCIO_USE_SSE
 eventually.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 share/cmake/utils/CompilerFlags.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index a445466bdf..75d29b245b 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -26,13 +26,13 @@ endif()
 ###############################################################################
 # Define if SSE2 can be used.
 
-if(OCIO_USE_SSE)
+if(OCIO_USE_SIMD)
     include(CheckSupportSSE2)
 endif()
 
 if(NOT HAVE_SSE2 AND NOT HAVE_SSE2_WITH_SSE2NEON)
     message(STATUS "Disabling SSE optimizations, as the target doesn't support them")
-    set(OCIO_USE_SSE OFF)
+    set(OCIO_USE_SIMD OFF)
 endif()
 
 ###############################################################################

From fbe3c15558005cf3025dc19665716051a0b8d178 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 17 Feb 2023 08:57:13 -0500
Subject: [PATCH 75/81] Fix last issue discovered by unit test in sseExp2
 Reverted defined changes in LogOpCPU_tests Updated ocio.bat with
 OCIO_USE_SIMD Minors comments changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 313cf4ee23..89cad03ba8 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -35,7 +35,7 @@ void TestLog(float logBase)
 
     // LogOpCPU implementation uses optimized logarithm approximation
     // cannot use strict comparison.
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error = 5e-5f;
 #else
     const float error = 1e-5f;
@@ -477,7 +477,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, numPixels);
 
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
@@ -490,7 +490,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error);
@@ -525,7 +525,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error);
@@ -552,7 +552,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRendererNoBreak = OCIO::GetLogRenderer(lognobreak, true);
     pRendererNoBreak->apply(rgbaImage, rgba_nobreak, numPixels);
 
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error2 = 1e-5f;
 #else
     const float error2 = 1e-7f;
@@ -570,7 +570,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2);
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2);
 #else
     OCIO_CHECK_EQUAL(rgba_nobreak[9], inf);
@@ -596,7 +596,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, 3);
 
-#if USING_INTEL_SSE2 || USING_INTEL_SSE2_WITH_SSE2NEON
+#if USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;

From 9ebbcd5082c71f5ad4c8010b5af51e2f1f2f530e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 17 Feb 2023 09:24:39 -0500
Subject: [PATCH 76/81] Fixing typo, using #ifdef since this is what we were
 using before.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 tests/cpu/ops/log/LogOpCPU_tests.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp
index 89cad03ba8..06e98fe0da 100644
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp
+++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp
@@ -35,7 +35,7 @@ void TestLog(float logBase)
 
     // LogOpCPU implementation uses optimized logarithm approximation
     // cannot use strict comparison.
-#if USE_SSE
+#ifdef USE_SSE
     const float error = 5e-5f;
 #else
     const float error = 1e-5f;
@@ -477,7 +477,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, numPixels);
 
-#if USE_SSE
+#ifdef USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;
@@ -490,7 +490,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error);
@@ -525,7 +525,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     
     // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }.
     OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error);
 #else
     OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error);
@@ -552,7 +552,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
     OCIO::ConstOpCPURcPtr pRendererNoBreak = OCIO::GetLogRenderer(lognobreak, true);
     pRendererNoBreak->apply(rgbaImage, rgba_nobreak, numPixels);
 
-#if USE_SSE
+#ifdef USE_SSE
     const float error2 = 1e-5f;
 #else
     const float error2 = 1e-7f;
@@ -570,7 +570,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test)
 
     // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }.
     OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2);
-#if USE_SSE
+#ifdef USE_SSE
     OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2);
 #else
     OCIO_CHECK_EQUAL(rgba_nobreak[9], inf);
@@ -596,7 +596,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test)
     OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true);
     pRenderer->apply(rgbaImage, rgba, 3);
 
-#if USE_SSE
+#ifdef USE_SSE
     const float error = 1e-6f;
 #else
     const float error = 1e-7f;

From ad4b9488f6e4ad3030e2f67225605411d87b6ef7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 3 Mar 2023 11:41:37 -0500
Subject: [PATCH 77/81] Documentations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 docs/quick_start/installation.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst
index fc58c5ec6f..511f67fbf0 100644
--- a/docs/quick_start/installation.rst
+++ b/docs/quick_start/installation.rst
@@ -296,6 +296,9 @@ any of your installed libraries are not universal. The easiest way to address th
 OCIO_INSTALL_EXT_PACKAGES=ALL in order to let OCIO build everything. Alternatively, you may set 
 CMAKE_OSX_ARCHITECTURES to just the platform you are targeting.
 
+Please note that OCIO dependencies must be built as universal libraries as well. If you can't build all the 
+dependencies as universal libraries, you can set ``OCIO_INSTAL_EXT_PACKAGES=ALL`` and OCIO will handle it.
+
 Several command-line tools (such as ``ocioconvert``) require reading or writing image files.
 If ``OCIO_USE_OIIO_FOR_APPS=OFF``, these will be built using OpenEXR rather than OpenImageIO
 and therefore you will be limited to using OpenEXR files with these tools rather than the

From 58b37f6f5fa5d600da430b5b0a8bb8c80a212b5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Mon, 13 Mar 2023 10:49:01 -0400
Subject: [PATCH 78/81] Fixing issue for the static build test in CI workflow
 Removing Findsse2neon as it is not needed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 share/cmake/modules/Findsse2neon.cmake | 58 --------------------------
 1 file changed, 58 deletions(-)
 delete mode 100644 share/cmake/modules/Findsse2neon.cmake

diff --git a/share/cmake/modules/Findsse2neon.cmake b/share/cmake/modules/Findsse2neon.cmake
deleted file mode 100644
index 7dea6740cb..0000000000
--- a/share/cmake/modules/Findsse2neon.cmake
+++ /dev/null
@@ -1,58 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright Contributors to the OpenColorIO Project.
-#
-# Locate sse2neon (header-only version)
-#
-# Variables defined by this module:
-#   sse2neon_FOUND          - Indicate whether the library was found or not
-#   sse2neon_INCLUDE_DIR    - Location of the header files
-#
-# Global targets defined by this module:
-#   sse2neon
-###############################################################################
-### Try to find package ###
-
-if(NOT OCIO_INSTALL_EXT_PACKAGES STREQUAL ALL)
-    # Find include directory
-    find_path(sse2neon_INCLUDE_DIR
-        NAMES
-            sse2neon.h
-        HINTS
-            ${sse2neon_ROOT}
-        PATH_SUFFIXES
-            include
-            sse2neon/include
-    )
-
-    # Override REQUIRED if package can be installed
-    if(OCIO_INSTALL_EXT_PACKAGES STREQUAL MISSING)
-        set(sse2neon_FIND_REQUIRED FALSE)
-    endif()
-
-    include(FindPackageHandleStandardArgs)
-    find_package_handle_standard_args(sse2neon
-        REQUIRED_VARS 
-            sse2neon_INCLUDE_DIR 
-    )
-    set(sse2neon_FOUND ${sse2neon_FOUND})
-endif()
-
-###############################################################################
-### Configure target ###
-
-if(sse2neon_FOUND AND NOT TARGET sse2neon)
-    # INTERFACE type since we know that this is a header-only library.
-    add_library(sse2neon INTERFACE IMPORTED GLOBAL)
-    set(_sse2neon_TARGET_CREATE TRUE)
-endif()
-
-###############################################################################
-### Configure target ###
-
-if(_sse2neon_TARGET_CREATE)
-    set_target_properties(sse2neon PROPERTIES
-        INTERFACE_INCLUDE_DIRECTORIES ${sse2neon_INCLUDE_DIR}
-    )
-
-    mark_as_advanced(sse2neon_INCLUDE_DIR)
-endif()
\ No newline at end of file

From 1ac7000994334d5f2453490382e07d4499b2ad99 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Fri, 24 Mar 2023 10:15:24 -0400
Subject: [PATCH 79/81] Fixing an issue during rebase. Removing
 ocio_check_dependency_version.cmake as it is not needed anymore. Changed how
 we detect if the installed OpenEXR and OpenImageIO are compatible with OCIO.
 It should be more robust now.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt                                |  1 -
 .../macros/CheckForOpenEXRCompatibility.cmake | 66 +++++++++++++++++++
 .../ocio_check_dependency_version.cmake       | 38 -----------
 share/cmake/modules/FindExtPackages.cmake     | 13 ++--
 4 files changed, 70 insertions(+), 48 deletions(-)
 create mode 100644 share/cmake/macros/CheckForOpenEXRCompatibility.cmake
 delete mode 100644 share/cmake/macros/ocio_check_dependency_version.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e972eb8023..2752af64fb 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -186,7 +186,6 @@ option(OCIO_USE_SSE "Specify whether to enable SSE (supplanted by OCIO_USE_SIMD)
 # TODO Remove mark_as_advanced once OCIO_USE_SSE is fully deprecated.
 mark_as_advanced(OCIO_USE_SSE)
 option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ${OCIO_USE_SSE})
-option(OCIO_USE_OIIO_CMAKE_CONFIG "Specify whether to look for OIIO using the generated CMake Config script instead of the custom FindOpenImageIO.cmake script" OFF)
 option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." OFF)
 
 ###############################################################################
diff --git a/share/cmake/macros/CheckForOpenEXRCompatibility.cmake b/share/cmake/macros/CheckForOpenEXRCompatibility.cmake
new file mode 100644
index 0000000000..bd79eb96e1
--- /dev/null
+++ b/share/cmake/macros/CheckForOpenEXRCompatibility.cmake
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+#
+# Check for compatibility between OpenEXR and OpenImageIO since OCIO requires OpenEXR 3+.
+#
+
+message(STATUS "Checking if the OpenImageIO found is built with OpenEXR 3+...")
+
+find_path (OpenImageIO_INCLUDE_DIR
+    NAMES
+        OpenImageIO/imageio.h
+    HINTS
+        ${OpenImageIO_ROOT}
+        # Assuming that OpenImageIO was installed normally, go back a few folders down
+        # to get the equivalent of OpenImageIO_ROOT.
+        ${OpenImageIO_DIR}/../../..
+    PATH_SUFFIXES
+        OpenImageIO/include
+        include
+)
+
+if (NOT OpenImageIO_INCLUDE_DIR)
+    message(STATUS "${ColorWarning}Could not find OpenImageIO header to evaluate the OpenEXR version.")
+    message(STATUS "Please provide the OpenImageIO_DIR variable.")
+    message(STATUS "If your OpenImageIO's files are located in different root directory, \
+please provide the OpenImageIO_ROOT where the include files are located.${ColorReset}")
+endif()
+
+# Try to figure out version number
+set (OIIO_VERSION_HEADER "${OpenImageIO_INCLUDE_DIR}/OpenImageIO/oiioversion.h")
+if (EXISTS "${OIIO_VERSION_HEADER}")
+    file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_MAJOR .*$")
+    string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_MAJOR ${TMP})
+    file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_MINOR .*$")
+    string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_MINOR ${TMP})
+    file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_PATCH .*$")
+    string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_PATCH ${TMP})
+    file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_TWEAK .*$")
+    if (TMP)
+        string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_TWEAK ${TMP})
+    else ()
+        set (OpenImageIO_VERSION_TWEAK 0)
+    endif ()
+    set (OpenImageIO_VERSION "${OpenImageIO_VERSION_MAJOR}.${OpenImageIO_VERSION_MINOR}.${OpenImageIO_VERSION_PATCH}.${OpenImageIO_VERSION_TWEAK}")
+endif ()
+
+set (OIIO_IMATH_HEADER "${OpenImageIO_INCLUDE_DIR}/OpenImageIO/Imath.h")
+if (EXISTS "${OIIO_IMATH_HEADER}")
+    file(STRINGS "${OIIO_IMATH_HEADER}" TMP REGEX "^#define OIIO_USING_IMATH .*$")
+    string(REGEX MATCHALL "[0-9]" OIIO_IMATH_VERSION ${TMP})
+    if (OIIO_IMATH_VERSION LESS 3)
+        message(STATUS "Skipping OpenImageIO built against OpenEXR 2, please use version 3 or greater.")
+    else()
+        set(is_OpenEXR_VERSION_valid TRUE)
+    endif()
+endif()
+
+# clean up variables
+unset(OpenImageIO_INCLUDE_DIR)
+unset(OIIO_VERSION_HEADER)
+unset(OIIO_VERSION_MAJOR)
+unset(OIIO_VERSION_MINOR)
+unset(OIIO_VERSION_PATCH)
+unset(OIIO_VERSION_TWEAK)
+unset(OIIO_IMATH_HEADER)
+unset(OIIO_IMATH_VERSION)
\ No newline at end of file
diff --git a/share/cmake/macros/ocio_check_dependency_version.cmake b/share/cmake/macros/ocio_check_dependency_version.cmake
deleted file mode 100644
index 2c2b741192..0000000000
--- a/share/cmake/macros/ocio_check_dependency_version.cmake
+++ /dev/null
@@ -1,38 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright Contributors to the OpenColorIO Project.
-
-###################################################################################################
-# ocio_check_dependency_version try to find the specified dependency and validate the version.
-#
-# Note that a function is used here to scoped-in any variables set by find_package. We do not want
-# those variables to be propagated to the caller of the function.
-#
-# Argument:
-#   dep_name is the name of the dependency (package). Please note that dep_name is case sensitive.
-#
-###################################################################################################
-
-function (ocio_check_dependency_version dep_name output)
-    cmake_parse_arguments(
-        # prefix - Must be different than the one used in ocio_handle_dependency.cmake.
-        ocio_cdv
-        # options
-        ""
-        # one value keywords
-        "MIN_VERSION"
-        # multi value keywords
-        ""
-        # args
-        ${ARGN})
-        
-    if (dep_name)
-        find_package(${dep_name} ${ocio_cdv_UNPARSED_ARGUMENTS})
-        if (ocio_cdv_MIN_VERSION AND ${dep_name}_VERSION)
-            if (${${dep_name}_VERSION} VERSION_GREATER_EQUAL ocio_cdv_MIN_VERSION)
-                set(${output} TRUE)
-            else()
-                set(${output} FALSE)
-            endif()
-        endif()
-    endif()
-endfunction()
\ No newline at end of file
diff --git a/share/cmake/modules/FindExtPackages.cmake b/share/cmake/modules/FindExtPackages.cmake
index 870b039bfc..48bc82de07 100644
--- a/share/cmake/modules/FindExtPackages.cmake
+++ b/share/cmake/modules/FindExtPackages.cmake
@@ -197,13 +197,10 @@ if((OCIO_BUILD_APPS AND OCIO_USE_OIIO_FOR_APPS) OR OCIO_BUILD_TESTS)
     # Supported from OIIO 2.4+. Setting this for lower versions doesn't affect anything.
     set(OPENIMAGEIO_CONFIG_DO_NOT_FIND_IMATH 1)
 
-    include(ocio_check_dependency_version)
-    # Since OpenImageIO will try to find OpenEXR through its OpenImageIOConfig.cmake file,
-    # let's try to find OpenEXR first and if the version is too old, OCIO will not try to find
-    # OpenImageIO.
-    ocio_check_dependency_version(  OpenEXR "is_OpenEXR_VERSION_valid" 
-                                    MIN_VERSION ${OpenEXR_MININUM_VERSION} 
-                                    CONFIG)
+    set(is_OpenEXR_VERSION_valid FALSE)
+    # Check for compatibility between OpenEXR and OpenImageIO.
+    # Will set is_OpenEXR_VERSION_valid to TRUE if valid.
+    include(CheckForOpenEXRCompatibility)
 
     # Do not try to find OpenImageIO if the version of OpenEXR is too old.                                    
     if (is_OpenEXR_VERSION_valid)
@@ -227,8 +224,6 @@ if((OCIO_BUILD_APPS AND OCIO_USE_OIIO_FOR_APPS) OR OCIO_BUILD_TESTS)
                                  MIN_VERSION ${OIIO_VERSION}
                                  RECOMMENDED_VERSION ${OIIO_RECOMMENDED_VERSION}
                                  PROMOTE_TARGET OpenImageIO::OpenImageIO)
-    else()
-        message(WARNING "Skipping OpenImageIO because the OpenEXR found by OpenImageIO is too old (under ${OpenEXR_MININUM_VERSION})")
     endif()
 endif()
 

From a03fcc2f519763f3f276a53e2cdb4bd65abfac66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Mon, 27 Mar 2023 11:31:32 -0400
Subject: [PATCH 80/81] minor typo and removing compiler options that should
 not be there (merge issue)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 share/cmake/utils/CompilerFlags.cmake | 3 ---
 src/OpenColorIO/SSE.h                 | 6 +++---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake
index 41c0f62ada..5eefc087cd 100644
--- a/share/cmake/utils/CompilerFlags.cmake
+++ b/share/cmake/utils/CompilerFlags.cmake
@@ -55,9 +55,6 @@ elseif(USE_CLANG)
 
     # Use of 'register' specifier must be removed for C++17 support.
     set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-Wno-deprecated-register")
-    if (HAVE_SSE2 AND HAVE_NEON)
-        set(PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS};-march=armv8-a+fp+simd+crypto+crc")
-    endif()
 elseif(USE_GCC)
 
     set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-DUSE_GCC")
diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h
index 1aa853997e..05eae7136f 100644
--- a/src/OpenColorIO/SSE.h
+++ b/src/OpenColorIO/SSE.h
@@ -164,7 +164,7 @@ static const __m128 PNEXP2 = _mm_set1_ps((float)2.414427569091865207710e-1);
 static const __m128 PNEXP1 = _mm_set1_ps((float)6.930038344665415134202e-1);
 static const __m128 PNEXP0 = _mm_set1_ps((float)1.000002593370603213644);
 
-// Note: The above polynomials have been chosen to acheive a precision of
+// Note: The above polynomials have been chosen to achieve a precision of
 // approximately 15 bits of mantissa.
 
 
@@ -175,8 +175,8 @@ static const __m128 PNEXP0 = _mm_set1_ps((float)1.000002593370603213644);
 // over a restricted range.
 inline __m128 sseLog2(__m128 x)
 {
-    // y = log2( x ) = log2( 2^exponant * mantissa ) 
-    //               = exponant + log2( mantissa )
+    // y = log2( x ) = log2( 2^exponent * mantissa ) 
+    //               = exponent + log2( mantissa )
 
     __m128 mantissa
         = _mm_or_ps(                                    // OR with EONE

From c3ed4581bc5a93cd0582cbbc6f47c3aba4c03b1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= <cedrik.fuoco@autodesk.com>
Date: Mon, 3 Apr 2023 09:28:45 -0400
Subject: [PATCH 81/81] added comment for sse2neon compiler flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Cédrik Fuoco <cedrik.fuoco@autodesk.com>
---
 CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2752af64fb..d6353e8b03 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -223,6 +223,11 @@ if(HAVE_NEON AND OCIO_USE_SIMD)
             sse2neon/include
     )
 
+    # As per instructions on sse2neon's GitHub page, the following compiler flags should be used:
+    # "-march=armv8-a+fp+simd+crypto+crc". These flags are required for some ARM platforms that do
+    # not enable floating point calculations or SIMD instructions by default. However, for ARM64
+    # (Apple ARM platform) and x86_64 platforms, these features are already enabled by default.
+    # Therefore, no additional compiler flags are needed.
     if (NOT sse2neon_INCLUDE_DIR)
         include(Installsse2neon)
     else()