diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md
index 16f6c58cbb..0916befe04 100644
--- a/crates/core_arch/missing-x86.md
+++ b/crates/core_arch/missing-x86.md
@@ -51,102 +51,6 @@
 </p></details>
 
 
-<details><summary>["AVX512BW"]</summary><p>
-
-  * [ ] [`_cvtmask32_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtmask32_u32)
-  * [ ] [`_cvtmask64_u64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtmask64_u64)
-  * [ ] [`_cvtu32_mask32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtu32_mask32)
-  * [ ] [`_cvtu64_mask64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtu64_mask64)
-  * [ ] [`_kortest_mask32_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortest_mask32_u8)
-  * [ ] [`_kortest_mask64_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortest_mask64_u8)
-  * [ ] [`_kortestc_mask32_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortestc_mask32_u8)
-  * [ ] [`_kortestc_mask64_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortestc_mask64_u8)
-  * [ ] [`_kortestz_mask32_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortestz_mask32_u8)
-  * [ ] [`_kortestz_mask64_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortestz_mask64_u8)
-  * [ ] [`_kshiftli_mask32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kshiftli_mask32)
-  * [ ] [`_kshiftli_mask64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kshiftli_mask64)
-  * [ ] [`_kshiftri_mask32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kshiftri_mask32)
-  * [ ] [`_kshiftri_mask64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kshiftri_mask64)
-  * [ ] [`_ktest_mask32_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_ktest_mask32_u8)
-  * [ ] [`_ktest_mask64_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_ktest_mask64_u8)
-  * [ ] [`_ktestc_mask32_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_ktestc_mask32_u8)
-  * [ ] [`_ktestc_mask64_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_ktestc_mask64_u8)
-  * [ ] [`_ktestz_mask32_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_ktestz_mask32_u8)
-  * [ ] [`_ktestz_mask64_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_ktestz_mask64_u8)
-  * [ ] [`_mm512_kunpackd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kunpackd)
-  * [ ] [`_mm512_kunpackw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kunpackw)
-</p></details>
-
-
-<details><summary>["AVX512BW", "AVX512VL"]</summary><p>
-
-  * [ ] [`_mm256_mask_reduce_add_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_add_epi16)
-  * [ ] [`_mm256_mask_reduce_add_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_add_epi8)
-  * [ ] [`_mm256_mask_reduce_and_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_and_epi16)
-  * [ ] [`_mm256_mask_reduce_and_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_and_epi8)
-  * [ ] [`_mm256_mask_reduce_max_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_max_epi16)
-  * [ ] [`_mm256_mask_reduce_max_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_max_epi8)
-  * [ ] [`_mm256_mask_reduce_max_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_max_epu16)
-  * [ ] [`_mm256_mask_reduce_max_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_max_epu8)
-  * [ ] [`_mm256_mask_reduce_min_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_min_epi16)
-  * [ ] [`_mm256_mask_reduce_min_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_min_epi8)
-  * [ ] [`_mm256_mask_reduce_min_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_min_epu16)
-  * [ ] [`_mm256_mask_reduce_min_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_min_epu8)
-  * [ ] [`_mm256_mask_reduce_mul_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_mul_epi16)
-  * [ ] [`_mm256_mask_reduce_mul_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_mul_epi8)
-  * [ ] [`_mm256_mask_reduce_or_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_or_epi16)
-  * [ ] [`_mm256_mask_reduce_or_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_or_epi8)
-  * [ ] [`_mm256_reduce_add_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_epi16)
-  * [ ] [`_mm256_reduce_add_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_epi8)
-  * [ ] [`_mm256_reduce_and_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_and_epi16)
-  * [ ] [`_mm256_reduce_and_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_and_epi8)
-  * [ ] [`_mm256_reduce_max_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_epi16)
-  * [ ] [`_mm256_reduce_max_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_epi8)
-  * [ ] [`_mm256_reduce_max_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_epu16)
-  * [ ] [`_mm256_reduce_max_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_epu8)
-  * [ ] [`_mm256_reduce_min_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_epi16)
-  * [ ] [`_mm256_reduce_min_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_epi8)
-  * [ ] [`_mm256_reduce_min_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_epu16)
-  * [ ] [`_mm256_reduce_min_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_epu8)
-  * [ ] [`_mm256_reduce_mul_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_epi16)
-  * [ ] [`_mm256_reduce_mul_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_epi8)
-  * [ ] [`_mm256_reduce_or_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_or_epi16)
-  * [ ] [`_mm256_reduce_or_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_or_epi8)
-  * [ ] [`_mm_mask_reduce_add_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_add_epi16)
-  * [ ] [`_mm_mask_reduce_add_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_add_epi8)
-  * [ ] [`_mm_mask_reduce_and_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_and_epi16)
-  * [ ] [`_mm_mask_reduce_and_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_and_epi8)
-  * [ ] [`_mm_mask_reduce_max_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_max_epi16)
-  * [ ] [`_mm_mask_reduce_max_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_max_epi8)
-  * [ ] [`_mm_mask_reduce_max_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_max_epu16)
-  * [ ] [`_mm_mask_reduce_max_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_max_epu8)
-  * [ ] [`_mm_mask_reduce_min_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_min_epi16)
-  * [ ] [`_mm_mask_reduce_min_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_min_epi8)
-  * [ ] [`_mm_mask_reduce_min_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_min_epu16)
-  * [ ] [`_mm_mask_reduce_min_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_min_epu8)
-  * [ ] [`_mm_mask_reduce_mul_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_mul_epi16)
-  * [ ] [`_mm_mask_reduce_mul_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_mul_epi8)
-  * [ ] [`_mm_mask_reduce_or_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_or_epi16)
-  * [ ] [`_mm_mask_reduce_or_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_or_epi8)
-  * [ ] [`_mm_reduce_add_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_epi16)
-  * [ ] [`_mm_reduce_add_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_epi8)
-  * [ ] [`_mm_reduce_and_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_and_epi16)
-  * [ ] [`_mm_reduce_and_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_and_epi8)
-  * [ ] [`_mm_reduce_max_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_epi16)
-  * [ ] [`_mm_reduce_max_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_epi8)
-  * [ ] [`_mm_reduce_max_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_epu16)
-  * [ ] [`_mm_reduce_max_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_epu8)
-  * [ ] [`_mm_reduce_min_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_epi16)
-  * [ ] [`_mm_reduce_min_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_epi8)
-  * [ ] [`_mm_reduce_min_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_epu16)
-  * [ ] [`_mm_reduce_min_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_epu8)
-  * [ ] [`_mm_reduce_mul_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_epi16)
-  * [ ] [`_mm_reduce_mul_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_epi8)
-  * [ ] [`_mm_reduce_or_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_or_epi16)
-  * [ ] [`_mm_reduce_or_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_or_epi8)
-</p></details>
-
-
 <details><summary>["AVX512_FP16"]</summary><p>
 
   * [ ] [`_mm256_castpd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph)
diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs
index 91fef37895..4c637f49f3 100644
--- a/crates/core_arch/src/simd.rs
+++ b/crates/core_arch/src/simd.rs
@@ -743,3 +743,142 @@ simd_ty!(
     x6,
     x7
 );
+
+// 1024-bit wide types:
+simd_ty!(
+    u16x64[u16]: // 64 lanes of u16 (64 * 16 = 1024 bits)
+    x0,
+    x1,
+    x2,
+    x3,
+    x4,
+    x5,
+    x6,
+    x7,
+    x8,
+    x9,
+    x10,
+    x11,
+    x12,
+    x13,
+    x14,
+    x15,
+    x16,
+    x17,
+    x18,
+    x19,
+    x20,
+    x21,
+    x22,
+    x23,
+    x24,
+    x25,
+    x26,
+    x27,
+    x28,
+    x29,
+    x30,
+    x31,
+    x32,
+    x33,
+    x34,
+    x35,
+    x36,
+    x37,
+    x38,
+    x39,
+    x40,
+    x41,
+    x42,
+    x43,
+    x44,
+    x45,
+    x46,
+    x47,
+    x48,
+    x49,
+    x50,
+    x51,
+    x52,
+    x53,
+    x54,
+    x55,
+    x56,
+    x57,
+    x58,
+    x59,
+    x60,
+    x61,
+    x62,
+    x63
+);
+simd_ty!(
+    i32x32[i32]: // 32 lanes of i32 (32 * 32 = 1024 bits)
+    x0,
+    x1,
+    x2,
+    x3,
+    x4,
+    x5,
+    x6,
+    x7,
+    x8,
+    x9,
+    x10,
+    x11,
+    x12,
+    x13,
+    x14,
+    x15,
+    x16,
+    x17,
+    x18,
+    x19,
+    x20,
+    x21,
+    x22,
+    x23,
+    x24,
+    x25,
+    x26,
+    x27,
+    x28,
+    x29,
+    x30,
+    x31
+);
+simd_ty!(
+    u32x32[u32]: // 32 lanes of u32 (32 * 32 = 1024 bits)
+    x0,
+    x1,
+    x2,
+    x3,
+    x4,
+    x5,
+    x6,
+    x7,
+    x8,
+    x9,
+    x10,
+    x11,
+    x12,
+    x13,
+    x14,
+    x15,
+    x16,
+    x17,
+    x18,
+    x19,
+    x20,
+    x21,
+    x22,
+    x23,
+    x24,
+    x25,
+    x26,
+    x27,
+    x28,
+    x29,
+    x30,
+    x31
+);
diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs
index dd74d11786..66f6ee1259 100644
--- a/crates/core_arch/src/x86/avx512bw.rs
+++ b/crates/core_arch/src/x86/avx512bw.rs
@@ -2,7 +2,7 @@ use crate::{
     arch::asm,
     core_arch::{simd::*, x86::*},
     intrinsics::simd::*,
-    mem, ptr,
+    ptr,
 };
 
 #[cfg(test)]
@@ -17,11 +17,8 @@ use stdarch_test::assert_instr;
 #[cfg_attr(test, assert_instr(vpabsw))]
 pub unsafe fn _mm512_abs_epi16(a: __m512i) -> __m512i {
     let a = a.as_i16x32();
-    // all-0 is a properly initialized i16x32
-    let zero: i16x32 = mem::zeroed();
-    let sub = simd_sub(zero, a);
-    let cmp: i16x32 = simd_gt(a, zero);
-    transmute(simd_select(cmp, a, sub))
+    let cmp: i16x32 = simd_gt(a, i16x32::splat(0)); // per-lane mask: a > 0
+    transmute(simd_select(cmp, a, simd_neg(a))) // a where a > 0, otherwise -a
 }
 
 /// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -108,11 +105,8 @@ pub unsafe fn _mm_maskz_abs_epi16(k: __mmask8, a: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpabsb))]
 pub unsafe fn _mm512_abs_epi8(a: __m512i) -> __m512i {
     let a = a.as_i8x64();
-    // all-0 is a properly initialized i8x64
-    let zero: i8x64 = mem::zeroed();
-    let sub = simd_sub(zero, a);
-    let cmp: i8x64 = simd_gt(a, zero);
-    transmute(simd_select(cmp, a, sub))
+    let cmp: i8x64 = simd_gt(a, i8x64::splat(0)); // per-lane mask: a > 0
+    transmute(simd_select(cmp, a, simd_neg(a))) // a where a > 0, otherwise -a
 }
 
 /// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -370,12 +364,7 @@ pub unsafe fn _mm_maskz_add_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddusw))]
 pub unsafe fn _mm512_adds_epu16(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpaddusw(
-        a.as_u16x32(),
-        b.as_u16x32(),
-        _mm512_setzero_si512().as_u16x32(),
-        0b11111111_11111111_11111111_11111111,
-    ))
+    transmute(simd_saturating_add(a.as_u16x32(), b.as_u16x32())) // per-lane saturating add
 }
 
 /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -391,7 +380,8 @@ pub unsafe fn _mm512_mask_adds_epu16(
     a: __m512i,
     b: __m512i,
 ) -> __m512i {
-    transmute(vpaddusw(a.as_u16x32(), b.as_u16x32(), src.as_u16x32(), k))
+    let add = _mm512_adds_epu16(a, b).as_u16x32();
+    transmute(simd_select_bitmask(k, add, src.as_u16x32()))
 }
 
 /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -402,12 +392,8 @@ pub unsafe fn _mm512_mask_adds_epu16(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddusw))]
 pub unsafe fn _mm512_maskz_adds_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpaddusw(
-        a.as_u16x32(),
-        b.as_u16x32(),
-        _mm512_setzero_si512().as_u16x32(),
-        k,
-    ))
+    let add = _mm512_adds_epu16(a, b).as_u16x32(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, u16x32::splat(0))) // k bit clear => lane is 0
 }
 
 /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -423,12 +409,8 @@ pub unsafe fn _mm256_mask_adds_epu16(
     a: __m256i,
     b: __m256i,
 ) -> __m256i {
-    transmute(vpaddusw256(
-        a.as_u16x16(),
-        b.as_u16x16(),
-        src.as_u16x16(),
-        k,
-    ))
+    let add = _mm256_adds_epu16(a, b).as_u16x16();
+    transmute(simd_select_bitmask(k, add, src.as_u16x16()))
 }
 
 /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -439,12 +421,8 @@ pub unsafe fn _mm256_mask_adds_epu16(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddusw))]
 pub unsafe fn _mm256_maskz_adds_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpaddusw256(
-        a.as_u16x16(),
-        b.as_u16x16(),
-        _mm256_setzero_si256().as_u16x16(),
-        k,
-    ))
+    let add = _mm256_adds_epu16(a, b).as_u16x16(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, u16x16::splat(0))) // k bit clear => lane is 0
 }
 
 /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -455,7 +433,8 @@ pub unsafe fn _mm256_maskz_adds_epu16(k: __mmask16, a: __m256i, b: __m256i) -> _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddusw))]
 pub unsafe fn _mm_mask_adds_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpaddusw128(a.as_u16x8(), b.as_u16x8(), src.as_u16x8(), k))
+    let add = _mm_adds_epu16(a, b).as_u16x8(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, src.as_u16x8())) // k bit clear => lane from src
 }
 
 /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -466,12 +445,8 @@ pub unsafe fn _mm_mask_adds_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m1
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddusw))]
 pub unsafe fn _mm_maskz_adds_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpaddusw128(
-        a.as_u16x8(),
-        b.as_u16x8(),
-        _mm_setzero_si128().as_u16x8(),
-        k,
-    ))
+    let add = _mm_adds_epu16(a, b).as_u16x8(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, u16x8::splat(0))) // k bit clear => lane is 0
 }
 
 /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst.
@@ -482,12 +457,7 @@ pub unsafe fn _mm_maskz_adds_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m12
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddusb))]
 pub unsafe fn _mm512_adds_epu8(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpaddusb(
-        a.as_u8x64(),
-        b.as_u8x64(),
-        _mm512_setzero_si512().as_u8x64(),
-        0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
-    ))
+    transmute(simd_saturating_add(a.as_u8x64(), b.as_u8x64())) // per-lane saturating add
 }
 
 /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -498,7 +468,8 @@ pub unsafe fn _mm512_adds_epu8(a: __m512i, b: __m512i) -> __m512i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddusb))]
 pub unsafe fn _mm512_mask_adds_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpaddusb(a.as_u8x64(), b.as_u8x64(), src.as_u8x64(), k))
+    let add = _mm512_adds_epu8(a, b).as_u8x64(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, src.as_u8x64())) // k bit clear => lane from src
 }
 
 /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -509,12 +480,8 @@ pub unsafe fn _mm512_mask_adds_epu8(src: __m512i, k: __mmask64, a: __m512i, b: _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddusb))]
 pub unsafe fn _mm512_maskz_adds_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpaddusb(
-        a.as_u8x64(),
-        b.as_u8x64(),
-        _mm512_setzero_si512().as_u8x64(),
-        k,
-    ))
+    let add = _mm512_adds_epu8(a, b).as_u8x64(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, u8x64::splat(0))) // k bit clear => lane is 0
 }
 
 /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -525,7 +492,8 @@ pub unsafe fn _mm512_maskz_adds_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddusb))]
 pub unsafe fn _mm256_mask_adds_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpaddusb256(a.as_u8x32(), b.as_u8x32(), src.as_u8x32(), k))
+    let add = _mm256_adds_epu8(a, b).as_u8x32(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, src.as_u8x32())) // k bit clear => lane from src
 }
 
 /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -536,12 +504,8 @@ pub unsafe fn _mm256_mask_adds_epu8(src: __m256i, k: __mmask32, a: __m256i, b: _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddusb))]
 pub unsafe fn _mm256_maskz_adds_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpaddusb256(
-        a.as_u8x32(),
-        b.as_u8x32(),
-        _mm256_setzero_si256().as_u8x32(),
-        k,
-    ))
+    let add = _mm256_adds_epu8(a, b).as_u8x32(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, u8x32::splat(0))) // k bit clear => lane is 0
 }
 
 /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -552,7 +516,8 @@ pub unsafe fn _mm256_maskz_adds_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddusb))]
 pub unsafe fn _mm_mask_adds_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpaddusb128(a.as_u8x16(), b.as_u8x16(), src.as_u8x16(), k))
+    let add = _mm_adds_epu8(a, b).as_u8x16(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, src.as_u8x16())) // k bit clear => lane from src
 }
 
 /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -563,12 +528,8 @@ pub unsafe fn _mm_mask_adds_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m1
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddusb))]
 pub unsafe fn _mm_maskz_adds_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpaddusb128(
-        a.as_u8x16(),
-        b.as_u8x16(),
-        _mm_setzero_si128().as_u8x16(),
-        k,
-    ))
+    let add = _mm_adds_epu8(a, b).as_u8x16(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, u8x16::splat(0))) // k bit clear => lane is 0
 }
 
 /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst.
@@ -579,12 +540,7 @@ pub unsafe fn _mm_maskz_adds_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m12
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddsw))]
 pub unsafe fn _mm512_adds_epi16(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpaddsw(
-        a.as_i16x32(),
-        b.as_i16x32(),
-        _mm512_setzero_si512().as_i16x32(),
-        0b11111111_11111111_11111111_11111111,
-    ))
+    transmute(simd_saturating_add(a.as_i16x32(), b.as_i16x32())) // per-lane saturating add
 }
 
 /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -600,7 +556,8 @@ pub unsafe fn _mm512_mask_adds_epi16(
     a: __m512i,
     b: __m512i,
 ) -> __m512i {
-    transmute(vpaddsw(a.as_i16x32(), b.as_i16x32(), src.as_i16x32(), k))
+    let add = _mm512_adds_epi16(a, b).as_i16x32();
+    transmute(simd_select_bitmask(k, add, src.as_i16x32()))
 }
 
 /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -611,12 +568,8 @@ pub unsafe fn _mm512_mask_adds_epi16(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddsw))]
 pub unsafe fn _mm512_maskz_adds_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpaddsw(
-        a.as_i16x32(),
-        b.as_i16x32(),
-        _mm512_setzero_si512().as_i16x32(),
-        k,
-    ))
+    let add = _mm512_adds_epi16(a, b).as_i16x32(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, i16x32::splat(0))) // k bit clear => lane is 0
 }
 
 /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -632,7 +585,8 @@ pub unsafe fn _mm256_mask_adds_epi16(
     a: __m256i,
     b: __m256i,
 ) -> __m256i {
-    transmute(vpaddsw256(a.as_i16x16(), b.as_i16x16(), src.as_i16x16(), k))
+    let add = _mm256_adds_epi16(a, b).as_i16x16();
+    transmute(simd_select_bitmask(k, add, src.as_i16x16()))
 }
 
 /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -643,12 +597,8 @@ pub unsafe fn _mm256_mask_adds_epi16(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddsw))]
 pub unsafe fn _mm256_maskz_adds_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpaddsw256(
-        a.as_i16x16(),
-        b.as_i16x16(),
-        _mm256_setzero_si256().as_i16x16(),
-        k,
-    ))
+    let add = _mm256_adds_epi16(a, b).as_i16x16(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, i16x16::splat(0))) // k bit clear => lane is 0
 }
 
 /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -659,7 +609,8 @@ pub unsafe fn _mm256_maskz_adds_epi16(k: __mmask16, a: __m256i, b: __m256i) -> _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddsw))]
 pub unsafe fn _mm_mask_adds_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpaddsw128(a.as_i16x8(), b.as_i16x8(), src.as_i16x8(), k))
+    let add = _mm_adds_epi16(a, b).as_i16x8(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, src.as_i16x8())) // k bit clear => lane from src
 }
 
 /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -670,12 +621,8 @@ pub unsafe fn _mm_mask_adds_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m1
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddsw))]
 pub unsafe fn _mm_maskz_adds_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpaddsw128(
-        a.as_i16x8(),
-        b.as_i16x8(),
-        _mm_setzero_si128().as_i16x8(),
-        k,
-    ))
+    let add = _mm_adds_epi16(a, b).as_i16x8(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, i16x8::splat(0))) // k bit clear => lane is 0
 }
 
 /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst.
@@ -686,12 +633,7 @@ pub unsafe fn _mm_maskz_adds_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m12
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddsb))]
 pub unsafe fn _mm512_adds_epi8(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpaddsb(
-        a.as_i8x64(),
-        b.as_i8x64(),
-        _mm512_setzero_si512().as_i8x64(),
-        0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
-    ))
+    transmute(simd_saturating_add(a.as_i8x64(), b.as_i8x64())) // per-lane saturating add
 }
 
 /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -702,7 +644,8 @@ pub unsafe fn _mm512_adds_epi8(a: __m512i, b: __m512i) -> __m512i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddsb))]
 pub unsafe fn _mm512_mask_adds_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpaddsb(a.as_i8x64(), b.as_i8x64(), src.as_i8x64(), k))
+    let add = _mm512_adds_epi8(a, b).as_i8x64(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, src.as_i8x64())) // k bit clear => lane from src
 }
 
 /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -713,12 +656,8 @@ pub unsafe fn _mm512_mask_adds_epi8(src: __m512i, k: __mmask64, a: __m512i, b: _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddsb))]
 pub unsafe fn _mm512_maskz_adds_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpaddsb(
-        a.as_i8x64(),
-        b.as_i8x64(),
-        _mm512_setzero_si512().as_i8x64(),
-        k,
-    ))
+    let add = _mm512_adds_epi8(a, b).as_i8x64(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, i8x64::splat(0))) // k bit clear => lane is 0
 }
 
 /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -729,7 +668,8 @@ pub unsafe fn _mm512_maskz_adds_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddsb))]
 pub unsafe fn _mm256_mask_adds_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpaddsb256(a.as_i8x32(), b.as_i8x32(), src.as_i8x32(), k))
+    let add = _mm256_adds_epi8(a, b).as_i8x32(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, src.as_i8x32())) // k bit clear => lane from src
 }
 
 /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -740,12 +680,8 @@ pub unsafe fn _mm256_mask_adds_epi8(src: __m256i, k: __mmask32, a: __m256i, b: _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddsb))]
 pub unsafe fn _mm256_maskz_adds_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpaddsb256(
-        a.as_i8x32(),
-        b.as_i8x32(),
-        _mm256_setzero_si256().as_i8x32(),
-        k,
-    ))
+    let add = _mm256_adds_epi8(a, b).as_i8x32(); // unmasked result, masked below
+    transmute(simd_select_bitmask(k, add, i8x32::splat(0))) // k bit clear => lane is 0
 }
 
 /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -756,7 +692,8 @@ pub unsafe fn _mm256_maskz_adds_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddsb))]
 pub unsafe fn _mm_mask_adds_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpaddsb128(a.as_i8x16(), b.as_i8x16(), src.as_i8x16(), k))
+    let add = _mm_adds_epi8(a, b).as_i8x16();
+    transmute(simd_select_bitmask(k, add, src.as_i8x16()))
 }
 
 /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -767,12 +704,8 @@ pub unsafe fn _mm_mask_adds_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m1
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpaddsb))]
 pub unsafe fn _mm_maskz_adds_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpaddsb128(
-        a.as_i8x16(),
-        b.as_i8x16(),
-        _mm_setzero_si128().as_i8x16(),
-        k,
-    ))
+    let add = _mm_adds_epi8(a, b).as_i8x16();
+    transmute(simd_select_bitmask(k, add, i8x16::splat(0)))
 }
 
 /// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst.
@@ -955,12 +888,7 @@ pub unsafe fn _mm_maskz_sub_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubusw))]
 pub unsafe fn _mm512_subs_epu16(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpsubusw(
-        a.as_u16x32(),
-        b.as_u16x32(),
-        _mm512_setzero_si512().as_u16x32(),
-        0b11111111_11111111_11111111_11111111,
-    ))
+    transmute(simd_saturating_sub(a.as_u16x32(), b.as_u16x32()))
 }
 
 /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -976,7 +904,8 @@ pub unsafe fn _mm512_mask_subs_epu16(
     a: __m512i,
     b: __m512i,
 ) -> __m512i {
-    transmute(vpsubusw(a.as_u16x32(), b.as_u16x32(), src.as_u16x32(), k))
+    let sub = _mm512_subs_epu16(a, b).as_u16x32();
+    transmute(simd_select_bitmask(k, sub, src.as_u16x32()))
 }
 
 /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -987,12 +916,8 @@ pub unsafe fn _mm512_mask_subs_epu16(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubusw))]
 pub unsafe fn _mm512_maskz_subs_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpsubusw(
-        a.as_u16x32(),
-        b.as_u16x32(),
-        _mm512_setzero_si512().as_u16x32(),
-        k,
-    ))
+    let sub = _mm512_subs_epu16(a, b).as_u16x32();
+    transmute(simd_select_bitmask(k, sub, u16x32::splat(0)))
 }
 
 /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1008,12 +933,8 @@ pub unsafe fn _mm256_mask_subs_epu16(
     a: __m256i,
     b: __m256i,
 ) -> __m256i {
-    transmute(vpsubusw256(
-        a.as_u16x16(),
-        b.as_u16x16(),
-        src.as_u16x16(),
-        k,
-    ))
+    let sub = _mm256_subs_epu16(a, b).as_u16x16();
+    transmute(simd_select_bitmask(k, sub, src.as_u16x16()))
 }
 
 /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1024,12 +945,8 @@ pub unsafe fn _mm256_mask_subs_epu16(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubusw))]
 pub unsafe fn _mm256_maskz_subs_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpsubusw256(
-        a.as_u16x16(),
-        b.as_u16x16(),
-        _mm256_setzero_si256().as_u16x16(),
-        k,
-    ))
+    let sub = _mm256_subs_epu16(a, b).as_u16x16();
+    transmute(simd_select_bitmask(k, sub, u16x16::splat(0)))
 }
 
 /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1040,7 +957,8 @@ pub unsafe fn _mm256_maskz_subs_epu16(k: __mmask16, a: __m256i, b: __m256i) -> _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubusw))]
 pub unsafe fn _mm_mask_subs_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpsubusw128(a.as_u16x8(), b.as_u16x8(), src.as_u16x8(), k))
+    let sub = _mm_subs_epu16(a, b).as_u16x8();
+    transmute(simd_select_bitmask(k, sub, src.as_u16x8()))
 }
 
 /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1051,12 +969,8 @@ pub unsafe fn _mm_mask_subs_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m1
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubusw))]
 pub unsafe fn _mm_maskz_subs_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpsubusw128(
-        a.as_u16x8(),
-        b.as_u16x8(),
-        _mm_setzero_si128().as_u16x8(),
-        k,
-    ))
+    let sub = _mm_subs_epu16(a, b).as_u16x8();
+    transmute(simd_select_bitmask(k, sub, u16x8::splat(0)))
 }
 
 /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst.
@@ -1067,12 +981,7 @@ pub unsafe fn _mm_maskz_subs_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m12
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubusb))]
 pub unsafe fn _mm512_subs_epu8(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpsubusb(
-        a.as_u8x64(),
-        b.as_u8x64(),
-        _mm512_setzero_si512().as_u8x64(),
-        0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
-    ))
+    transmute(simd_saturating_sub(a.as_u8x64(), b.as_u8x64()))
 }
 
 /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1083,7 +992,8 @@ pub unsafe fn _mm512_subs_epu8(a: __m512i, b: __m512i) -> __m512i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubusb))]
 pub unsafe fn _mm512_mask_subs_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpsubusb(a.as_u8x64(), b.as_u8x64(), src.as_u8x64(), k))
+    let sub = _mm512_subs_epu8(a, b).as_u8x64();
+    transmute(simd_select_bitmask(k, sub, src.as_u8x64()))
 }
 
 /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1094,12 +1004,8 @@ pub unsafe fn _mm512_mask_subs_epu8(src: __m512i, k: __mmask64, a: __m512i, b: _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubusb))]
 pub unsafe fn _mm512_maskz_subs_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpsubusb(
-        a.as_u8x64(),
-        b.as_u8x64(),
-        _mm512_setzero_si512().as_u8x64(),
-        k,
-    ))
+    let sub = _mm512_subs_epu8(a, b).as_u8x64();
+    transmute(simd_select_bitmask(k, sub, u8x64::splat(0)))
 }
 
 /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1110,7 +1016,8 @@ pub unsafe fn _mm512_maskz_subs_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubusb))]
 pub unsafe fn _mm256_mask_subs_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpsubusb256(a.as_u8x32(), b.as_u8x32(), src.as_u8x32(), k))
+    let sub = _mm256_subs_epu8(a, b).as_u8x32();
+    transmute(simd_select_bitmask(k, sub, src.as_u8x32()))
 }
 
 /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1121,12 +1028,8 @@ pub unsafe fn _mm256_mask_subs_epu8(src: __m256i, k: __mmask32, a: __m256i, b: _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubusb))]
 pub unsafe fn _mm256_maskz_subs_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpsubusb256(
-        a.as_u8x32(),
-        b.as_u8x32(),
-        _mm256_setzero_si256().as_u8x32(),
-        k,
-    ))
+    let sub = _mm256_subs_epu8(a, b).as_u8x32();
+    transmute(simd_select_bitmask(k, sub, u8x32::splat(0)))
 }
 
 /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1137,7 +1040,8 @@ pub unsafe fn _mm256_maskz_subs_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubusb))]
 pub unsafe fn _mm_mask_subs_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpsubusb128(a.as_u8x16(), b.as_u8x16(), src.as_u8x16(), k))
+    let sub = _mm_subs_epu8(a, b).as_u8x16();
+    transmute(simd_select_bitmask(k, sub, src.as_u8x16()))
 }
 
 /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1148,12 +1052,8 @@ pub unsafe fn _mm_mask_subs_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m1
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubusb))]
 pub unsafe fn _mm_maskz_subs_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpsubusb128(
-        a.as_u8x16(),
-        b.as_u8x16(),
-        _mm_setzero_si128().as_u8x16(),
-        k,
-    ))
+    let sub = _mm_subs_epu8(a, b).as_u8x16();
+    transmute(simd_select_bitmask(k, sub, u8x16::splat(0)))
 }
 
 /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst.
@@ -1164,12 +1064,7 @@ pub unsafe fn _mm_maskz_subs_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m12
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubsw))]
 pub unsafe fn _mm512_subs_epi16(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpsubsw(
-        a.as_i16x32(),
-        b.as_i16x32(),
-        _mm512_setzero_si512().as_i16x32(),
-        0b11111111_11111111_11111111_11111111,
-    ))
+    transmute(simd_saturating_sub(a.as_i16x32(), b.as_i16x32()))
 }
 
 /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1185,7 +1080,8 @@ pub unsafe fn _mm512_mask_subs_epi16(
     a: __m512i,
     b: __m512i,
 ) -> __m512i {
-    transmute(vpsubsw(a.as_i16x32(), b.as_i16x32(), src.as_i16x32(), k))
+    let sub = _mm512_subs_epi16(a, b).as_i16x32();
+    transmute(simd_select_bitmask(k, sub, src.as_i16x32()))
 }
 
 /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1196,12 +1092,8 @@ pub unsafe fn _mm512_mask_subs_epi16(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubsw))]
 pub unsafe fn _mm512_maskz_subs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpsubsw(
-        a.as_i16x32(),
-        b.as_i16x32(),
-        _mm512_setzero_si512().as_i16x32(),
-        k,
-    ))
+    let sub = _mm512_subs_epi16(a, b).as_i16x32();
+    transmute(simd_select_bitmask(k, sub, i16x32::splat(0)))
 }
 
 /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1217,7 +1109,8 @@ pub unsafe fn _mm256_mask_subs_epi16(
     a: __m256i,
     b: __m256i,
 ) -> __m256i {
-    transmute(vpsubsw256(a.as_i16x16(), b.as_i16x16(), src.as_i16x16(), k))
+    let sub = _mm256_subs_epi16(a, b).as_i16x16();
+    transmute(simd_select_bitmask(k, sub, src.as_i16x16()))
 }
 
 /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1228,12 +1121,8 @@ pub unsafe fn _mm256_mask_subs_epi16(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubsw))]
 pub unsafe fn _mm256_maskz_subs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpsubsw256(
-        a.as_i16x16(),
-        b.as_i16x16(),
-        _mm256_setzero_si256().as_i16x16(),
-        k,
-    ))
+    let sub = _mm256_subs_epi16(a, b).as_i16x16();
+    transmute(simd_select_bitmask(k, sub, i16x16::splat(0)))
 }
 
 /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1244,7 +1133,8 @@ pub unsafe fn _mm256_maskz_subs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubsw))]
 pub unsafe fn _mm_mask_subs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpsubsw128(a.as_i16x8(), b.as_i16x8(), src.as_i16x8(), k))
+    let sub = _mm_subs_epi16(a, b).as_i16x8();
+    transmute(simd_select_bitmask(k, sub, src.as_i16x8()))
 }
 
 /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1255,12 +1145,8 @@ pub unsafe fn _mm_mask_subs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m1
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubsw))]
 pub unsafe fn _mm_maskz_subs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpsubsw128(
-        a.as_i16x8(),
-        b.as_i16x8(),
-        _mm_setzero_si128().as_i16x8(),
-        k,
-    ))
+    let sub = _mm_subs_epi16(a, b).as_i16x8();
+    transmute(simd_select_bitmask(k, sub, i16x8::splat(0)))
 }
 
 /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst.
@@ -1271,12 +1157,7 @@ pub unsafe fn _mm_maskz_subs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m12
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubsb))]
 pub unsafe fn _mm512_subs_epi8(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpsubsb(
-        a.as_i8x64(),
-        b.as_i8x64(),
-        _mm512_setzero_si512().as_i8x64(),
-        0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
-    ))
+    transmute(simd_saturating_sub(a.as_i8x64(), b.as_i8x64()))
 }
 
 /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1287,7 +1168,8 @@ pub unsafe fn _mm512_subs_epi8(a: __m512i, b: __m512i) -> __m512i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubsb))]
 pub unsafe fn _mm512_mask_subs_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpsubsb(a.as_i8x64(), b.as_i8x64(), src.as_i8x64(), k))
+    let sub = _mm512_subs_epi8(a, b).as_i8x64();
+    transmute(simd_select_bitmask(k, sub, src.as_i8x64()))
 }
 
 /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1298,12 +1180,8 @@ pub unsafe fn _mm512_mask_subs_epi8(src: __m512i, k: __mmask64, a: __m512i, b: _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubsb))]
 pub unsafe fn _mm512_maskz_subs_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpsubsb(
-        a.as_i8x64(),
-        b.as_i8x64(),
-        _mm512_setzero_si512().as_i8x64(),
-        k,
-    ))
+    let sub = _mm512_subs_epi8(a, b).as_i8x64();
+    transmute(simd_select_bitmask(k, sub, i8x64::splat(0)))
 }
 
 /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1314,7 +1192,8 @@ pub unsafe fn _mm512_maskz_subs_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubsb))]
 pub unsafe fn _mm256_mask_subs_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpsubsb256(a.as_i8x32(), b.as_i8x32(), src.as_i8x32(), k))
+    let sub = _mm256_subs_epi8(a, b).as_i8x32();
+    transmute(simd_select_bitmask(k, sub, src.as_i8x32()))
 }
 
 /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1325,12 +1204,8 @@ pub unsafe fn _mm256_mask_subs_epi8(src: __m256i, k: __mmask32, a: __m256i, b: _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubsb))]
 pub unsafe fn _mm256_maskz_subs_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
-    transmute(vpsubsb256(
-        a.as_i8x32(),
-        b.as_i8x32(),
-        _mm256_setzero_si256().as_i8x32(),
-        k,
-    ))
+    let sub = _mm256_subs_epi8(a, b).as_i8x32();
+    transmute(simd_select_bitmask(k, sub, i8x32::splat(0)))
 }
 
 /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1341,7 +1216,8 @@ pub unsafe fn _mm256_maskz_subs_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubsb))]
 pub unsafe fn _mm_mask_subs_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpsubsb128(a.as_i8x16(), b.as_i8x16(), src.as_i8x16(), k))
+    let sub = _mm_subs_epi8(a, b).as_i8x16();
+    transmute(simd_select_bitmask(k, sub, src.as_i8x16()))
 }
 
 /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1352,12 +1228,8 @@ pub unsafe fn _mm_mask_subs_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m1
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpsubsb))]
 pub unsafe fn _mm_maskz_subs_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
-    transmute(vpsubsb128(
-        a.as_i8x16(),
-        b.as_i8x16(),
-        _mm_setzero_si128().as_i8x16(),
-        k,
-    ))
+    let sub = _mm_subs_epi8(a, b).as_i8x16();
+    transmute(simd_select_bitmask(k, sub, i8x16::splat(0)))
 }
 
 /// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
@@ -1368,7 +1240,10 @@ pub unsafe fn _mm_maskz_subs_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m12
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmulhuw))]
 pub unsafe fn _mm512_mulhi_epu16(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpmulhuw(a.as_u16x32(), b.as_u16x32()))
+    let a = simd_cast::<_, u32x32>(a.as_u16x32());
+    let b = simd_cast::<_, u32x32>(b.as_u16x32());
+    let r = simd_shr(simd_mul(a, b), u32x32::splat(16));
+    transmute(simd_cast::<u32x32, u16x32>(r))
 }
 
 /// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1464,7 +1339,10 @@ pub unsafe fn _mm_maskz_mulhi_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m1
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmulhw))]
 pub unsafe fn _mm512_mulhi_epi16(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpmulhw(a.as_i16x32(), b.as_i16x32()))
+    let a = simd_cast::<_, i32x32>(a.as_i16x32());
+    let b = simd_cast::<_, i32x32>(b.as_i16x32());
+    let r = simd_shr(simd_mul(a, b), i32x32::splat(16));
+    transmute(simd_cast::<i32x32, i16x32>(r))
 }
 
 /// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1752,7 +1630,9 @@ pub unsafe fn _mm_maskz_mullo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m1
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxuw))]
 pub unsafe fn _mm512_max_epu16(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpmaxuw(a.as_u16x32(), b.as_u16x32()))
+    let a = a.as_u16x32();
+    let b = b.as_u16x32();
+    transmute(simd_select::<i16x32, _>(simd_gt(a, b), a, b))
 }
 
 /// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1838,7 +1718,9 @@ pub unsafe fn _mm_maskz_max_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxub))]
 pub unsafe fn _mm512_max_epu8(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpmaxub(a.as_u8x64(), b.as_u8x64()))
+    let a = a.as_u8x64();
+    let b = b.as_u8x64();
+    transmute(simd_select::<i8x64, _>(simd_gt(a, b), a, b))
 }
 
 /// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1924,7 +1806,9 @@ pub unsafe fn _mm_maskz_max_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxsw))]
 pub unsafe fn _mm512_max_epi16(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpmaxsw(a.as_i16x32(), b.as_i16x32()))
+    let a = a.as_i16x32();
+    let b = b.as_i16x32();
+    transmute(simd_select::<i16x32, _>(simd_gt(a, b), a, b))
 }
 
 /// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2010,7 +1894,9 @@ pub unsafe fn _mm_maskz_max_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxsb))]
 pub unsafe fn _mm512_max_epi8(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpmaxsb(a.as_i8x64(), b.as_i8x64()))
+    let a = a.as_i8x64();
+    let b = b.as_i8x64();
+    transmute(simd_select::<i8x64, _>(simd_gt(a, b), a, b))
 }
 
 /// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2096,7 +1982,9 @@ pub unsafe fn _mm_maskz_max_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminuw))]
 pub unsafe fn _mm512_min_epu16(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpminuw(a.as_u16x32(), b.as_u16x32()))
+    let a = a.as_u16x32();
+    let b = b.as_u16x32();
+    transmute(simd_select::<i16x32, _>(simd_lt(a, b), a, b))
 }
 
 /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2182,7 +2070,9 @@ pub unsafe fn _mm_maskz_min_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminub))]
 pub unsafe fn _mm512_min_epu8(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpminub(a.as_u8x64(), b.as_u8x64()))
+    let a = a.as_u8x64();
+    let b = b.as_u8x64();
+    transmute(simd_select::<i8x64, _>(simd_lt(a, b), a, b))
 }
 
 /// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2268,7 +2158,9 @@ pub unsafe fn _mm_maskz_min_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsw))]
 pub unsafe fn _mm512_min_epi16(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpminsw(a.as_i16x32(), b.as_i16x32()))
+    let a = a.as_i16x32();
+    let b = b.as_i16x32();
+    transmute(simd_select::<i16x32, _>(simd_lt(a, b), a, b))
 }
 
 /// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2354,7 +2246,9 @@ pub unsafe fn _mm_maskz_min_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsb))]
 pub unsafe fn _mm512_min_epi8(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpminsb(a.as_i8x64(), b.as_i8x64()))
+    let a = a.as_i8x64();
+    let b = b.as_i8x64();
+    transmute(simd_select::<i8x64, _>(simd_lt(a, b), a, b))
 }
 
 /// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2451,7 +2345,7 @@ pub unsafe fn _mm512_cmplt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmplt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
-    _mm512_cmplt_epu16_mask(a, b) & k1
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.
@@ -2473,7 +2367,7 @@ pub unsafe fn _mm256_cmplt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmplt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
-    _mm256_cmplt_epu16_mask(a, b) & k1
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.
@@ -2495,7 +2389,7 @@ pub unsafe fn _mm_cmplt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmplt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    _mm_cmplt_epu16_mask(a, b) & k1
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
@@ -2517,7 +2411,7 @@ pub unsafe fn _mm512_cmplt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmplt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
-    _mm512_cmplt_epu8_mask(a, b) & k1
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
@@ -2539,7 +2433,7 @@ pub unsafe fn _mm256_cmplt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmplt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
-    _mm256_cmplt_epu8_mask(a, b) & k1
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
@@ -2561,7 +2455,7 @@ pub unsafe fn _mm_cmplt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmplt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    _mm_cmplt_epu8_mask(a, b) & k1
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k.
@@ -2583,7 +2477,7 @@ pub unsafe fn _mm512_cmplt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmplt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
-    _mm512_cmplt_epi16_mask(a, b) & k1
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k.
@@ -2605,7 +2499,7 @@ pub unsafe fn _mm256_cmplt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmplt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
-    _mm256_cmplt_epi16_mask(a, b) & k1
+    _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k.
@@ -2627,7 +2521,7 @@ pub unsafe fn _mm_cmplt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmplt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    _mm_cmplt_epi16_mask(a, b) & k1
+    _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k.
@@ -2649,7 +2543,7 @@ pub unsafe fn _mm512_cmplt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmplt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
-    _mm512_cmplt_epi8_mask(a, b) & k1
+    _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k.
@@ -2671,7 +2565,7 @@ pub unsafe fn _mm256_cmplt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmplt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
-    _mm256_cmplt_epi8_mask(a, b) & k1
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k.
@@ -2693,7 +2587,7 @@ pub unsafe fn _mm_cmplt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmplt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    _mm_cmplt_epi8_mask(a, b) & k1
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
@@ -2715,7 +2609,7 @@ pub unsafe fn _mm512_cmpgt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpgt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
-    _mm512_cmpgt_epu16_mask(a, b) & k1
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
@@ -2737,7 +2631,7 @@ pub unsafe fn _mm256_cmpgt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpgt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
-    _mm256_cmpgt_epu16_mask(a, b) & k1
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
@@ -2759,7 +2653,7 @@ pub unsafe fn _mm_cmpgt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpgt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    _mm_cmpgt_epu16_mask(a, b) & k1
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.
@@ -2781,7 +2675,7 @@ pub unsafe fn _mm512_cmpgt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpgt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
-    _mm512_cmpgt_epu8_mask(a, b) & k1
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.
@@ -2803,7 +2697,7 @@ pub unsafe fn _mm256_cmpgt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpgt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
-    _mm256_cmpgt_epu8_mask(a, b) & k1
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.
@@ -2825,7 +2719,7 @@ pub unsafe fn _mm_cmpgt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpgt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    _mm_cmpgt_epu8_mask(a, b) & k1
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k.
@@ -2847,7 +2741,7 @@ pub unsafe fn _mm512_cmpgt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpgt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
-    _mm512_cmpgt_epi16_mask(a, b) & k1
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k.
@@ -2869,7 +2763,7 @@ pub unsafe fn _mm256_cmpgt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpgt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
-    _mm256_cmpgt_epi16_mask(a, b) & k1
+    _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k.
@@ -2891,7 +2785,7 @@ pub unsafe fn _mm_cmpgt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpgt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    _mm_cmpgt_epi16_mask(a, b) & k1
+    _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k.
@@ -2913,7 +2807,7 @@ pub unsafe fn _mm512_cmpgt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpgt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
-    _mm512_cmpgt_epi8_mask(a, b) & k1
+    _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k.
@@ -2935,7 +2829,7 @@ pub unsafe fn _mm256_cmpgt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpgt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
-    _mm256_cmpgt_epi8_mask(a, b) & k1
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k.
@@ -2957,7 +2851,7 @@ pub unsafe fn _mm_cmpgt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpgt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    _mm_cmpgt_epi8_mask(a, b) & k1
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
@@ -2979,7 +2873,7 @@ pub unsafe fn _mm512_cmple_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmple_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
-    _mm512_cmple_epu16_mask(a, b) & k1
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
@@ -3001,7 +2895,7 @@ pub unsafe fn _mm256_cmple_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmple_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
-    _mm256_cmple_epu16_mask(a, b) & k1
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
@@ -3023,7 +2917,7 @@ pub unsafe fn _mm_cmple_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmple_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    _mm_cmple_epu16_mask(a, b) & k1
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.   
@@ -3045,7 +2939,7 @@ pub unsafe fn _mm512_cmple_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmple_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
-    _mm512_cmple_epu8_mask(a, b) & k1
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.   
@@ -3067,7 +2961,7 @@ pub unsafe fn _mm256_cmple_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmple_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
-    _mm256_cmple_epu8_mask(a, b) & k1
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.   
@@ -3089,7 +2983,7 @@ pub unsafe fn _mm_cmple_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmple_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    _mm_cmple_epu8_mask(a, b) & k1
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
@@ -3111,7 +3005,7 @@ pub unsafe fn _mm512_cmple_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmple_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
-    _mm512_cmple_epi16_mask(a, b) & k1
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
@@ -3133,7 +3027,7 @@ pub unsafe fn _mm256_cmple_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmple_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
-    _mm256_cmple_epi16_mask(a, b) & k1
+    _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
@@ -3155,7 +3049,7 @@ pub unsafe fn _mm_cmple_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmple_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    _mm_cmple_epi16_mask(a, b) & k1
+    _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
@@ -3177,7 +3071,7 @@ pub unsafe fn _mm512_cmple_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmple_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
-    _mm512_cmple_epi8_mask(a, b) & k1
+    _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
@@ -3199,7 +3093,7 @@ pub unsafe fn _mm256_cmple_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmple_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
-    _mm256_cmple_epi8_mask(a, b) & k1
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
@@ -3221,7 +3115,7 @@ pub unsafe fn _mm_cmple_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmple_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    _mm_cmple_epi8_mask(a, b) & k1
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
@@ -3243,7 +3137,7 @@ pub unsafe fn _mm512_cmpge_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpge_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
-    _mm512_cmpge_epu16_mask(a, b) & k1
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
@@ -3265,7 +3159,7 @@ pub unsafe fn _mm256_cmpge_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpge_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
-    _mm256_cmpge_epu16_mask(a, b) & k1
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
@@ -3287,7 +3181,7 @@ pub unsafe fn _mm_cmpge_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpge_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    _mm_cmpge_epu16_mask(a, b) & k1
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
@@ -3309,7 +3203,7 @@ pub unsafe fn _mm512_cmpge_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpge_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
-    _mm512_cmpge_epu8_mask(a, b) & k1
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
@@ -3331,7 +3225,7 @@ pub unsafe fn _mm256_cmpge_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpge_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
-    _mm256_cmpge_epu8_mask(a, b) & k1
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
@@ -3353,7 +3247,7 @@ pub unsafe fn _mm_cmpge_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpge_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    _mm_cmpge_epu8_mask(a, b) & k1
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
@@ -3375,7 +3269,7 @@ pub unsafe fn _mm512_cmpge_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpge_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
-    _mm512_cmpge_epi16_mask(a, b) & k1
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
@@ -3397,7 +3291,7 @@ pub unsafe fn _mm256_cmpge_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpge_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
-    _mm256_cmpge_epi16_mask(a, b) & k1
+    _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
@@ -3419,7 +3313,7 @@ pub unsafe fn _mm_cmpge_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpge_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    _mm_cmpge_epi16_mask(a, b) & k1
+    _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
@@ -3441,7 +3335,7 @@ pub unsafe fn _mm512_cmpge_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpge_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
-    _mm512_cmpge_epi8_mask(a, b) & k1
+    _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
@@ -3463,7 +3357,7 @@ pub unsafe fn _mm256_cmpge_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpge_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
-    _mm256_cmpge_epi8_mask(a, b) & k1
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
@@ -3485,7 +3379,7 @@ pub unsafe fn _mm_cmpge_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpge_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    _mm_cmpge_epi8_mask(a, b) & k1
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.
@@ -3507,7 +3401,7 @@ pub unsafe fn _mm512_cmpeq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpeq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
-    _mm512_cmpeq_epu16_mask(a, b) & k1
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.
@@ -3529,7 +3423,7 @@ pub unsafe fn _mm256_cmpeq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpeq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
-    _mm256_cmpeq_epu16_mask(a, b) & k1
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.
@@ -3551,7 +3445,7 @@ pub unsafe fn _mm_cmpeq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpeq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    _mm_cmpeq_epu16_mask(a, b) & k1
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.
@@ -3573,7 +3467,7 @@ pub unsafe fn _mm512_cmpeq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpeq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
-    _mm512_cmpeq_epu8_mask(a, b) & k1
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.
@@ -3595,7 +3489,7 @@ pub unsafe fn _mm256_cmpeq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpeq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
-    _mm256_cmpeq_epu8_mask(a, b) & k1
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.
@@ -3617,7 +3511,7 @@ pub unsafe fn _mm_cmpeq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpeq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    _mm_cmpeq_epu8_mask(a, b) & k1
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k.
@@ -3639,7 +3533,7 @@ pub unsafe fn _mm512_cmpeq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpeq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
-    _mm512_cmpeq_epi16_mask(a, b) & k1
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k.
@@ -3661,7 +3555,7 @@ pub unsafe fn _mm256_cmpeq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpeq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
-    _mm256_cmpeq_epi16_mask(a, b) & k1
+    _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k.
@@ -3683,7 +3577,7 @@ pub unsafe fn _mm_cmpeq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpeq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    _mm_cmpeq_epi16_mask(a, b) & k1
+    _mm_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k.
@@ -3705,7 +3599,7 @@ pub unsafe fn _mm512_cmpeq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpeq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
-    _mm512_cmpeq_epi8_mask(a, b) & k1
+    _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k.
@@ -3727,7 +3621,7 @@ pub unsafe fn _mm256_cmpeq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpeq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
-    _mm256_cmpeq_epi8_mask(a, b) & k1
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k.
@@ -3749,7 +3643,7 @@ pub unsafe fn _mm_cmpeq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpeq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    _mm_cmpeq_epi8_mask(a, b) & k1
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.
@@ -3771,7 +3665,7 @@ pub unsafe fn _mm512_cmpneq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpneq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
-    _mm512_cmpneq_epu16_mask(a, b) & k1
+    _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.
@@ -3793,7 +3687,7 @@ pub unsafe fn _mm256_cmpneq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpneq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
-    _mm256_cmpneq_epu16_mask(a, b) & k1
+    _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.
@@ -3815,7 +3709,7 @@ pub unsafe fn _mm_cmpneq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpneq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    _mm_cmpneq_epu16_mask(a, b) & k1
+    _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.
@@ -3837,7 +3731,7 @@ pub unsafe fn _mm512_cmpneq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpneq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
-    _mm512_cmpneq_epu8_mask(a, b) & k1
+    _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.
@@ -3859,7 +3753,7 @@ pub unsafe fn _mm256_cmpneq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpneq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
-    _mm256_cmpneq_epu8_mask(a, b) & k1
+    _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.
@@ -3881,7 +3775,7 @@ pub unsafe fn _mm_cmpneq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpneq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    _mm_cmpneq_epu8_mask(a, b) & k1
+    _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k.
@@ -3903,7 +3797,7 @@ pub unsafe fn _mm512_cmpneq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpneq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
-    _mm512_cmpneq_epi16_mask(a, b) & k1
+    _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k.
@@ -3925,7 +3819,7 @@ pub unsafe fn _mm256_cmpneq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpneq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
-    _mm256_cmpneq_epi16_mask(a, b) & k1
+    _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b)
 }
 
 /// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k.
@@ -3947,7 +3841,7 @@ pub unsafe fn _mm_cmpneq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpneq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
-    _mm_cmpneq_epi16_mask(a, b) & k1
+    _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k.
@@ -3969,7 +3863,7 @@ pub unsafe fn _mm512_cmpneq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm512_mask_cmpneq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 {
-    _mm512_cmpneq_epi8_mask(a, b) & k1
+    _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b)
 }
 
 /// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k.
@@ -3991,7 +3885,7 @@ pub unsafe fn _mm256_cmpneq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm256_mask_cmpneq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
-    _mm256_cmpneq_epi8_mask(a, b) & k1
+    _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b) // generic compare applies zeromask k1
 }
 
 /// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k.
@@ -4013,7 +3907,7 @@ pub unsafe fn _mm_cmpneq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
 pub unsafe fn _mm_mask_cmpneq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
-    _mm_cmpneq_epi8_mask(a, b) & k1
+    _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b) // generic compare applies zeromask k1
 }
 
 /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by `IMM8`, and store the results in mask vector k.
@@ -4028,7 +3922,17 @@ pub unsafe fn _mm512_cmp_epu16_mask<const IMM8: i32>(a: __m512i, b: __m512i) ->
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_u16x32();
     let b = b.as_u16x32();
-    vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
+    let r = match IMM8 {
+        0 => simd_eq(a, b),
+        1 => simd_lt(a, b),
+        2 => simd_le(a, b),
+        3 => i16x32::splat(0), // no lane set, so the bitmask is 0
+        4 => simd_ne(a, b),
+        5 => simd_ge(a, b),
+        6 => simd_gt(a, b),
+        _ => i16x32::splat(-1), // only 7 remains after the 3-bit assert: every lane set
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4047,7 +3951,18 @@ pub unsafe fn _mm512_mask_cmp_epu16_mask<const IMM8: i32>(
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_u16x32();
     let b = b.as_u16x32();
-    vpcmpuw(a, b, IMM8, k1)
+    let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::splat(0));
+    let r = match IMM8 {
+        0 => simd_and(k1, simd_eq(a, b)),
+        1 => simd_and(k1, simd_lt(a, b)),
+        2 => simd_and(k1, simd_le(a, b)),
+        3 => i16x32::splat(0), // always-false predicate: no lane set regardless of k1
+        4 => simd_and(k1, simd_ne(a, b)),
+        5 => simd_and(k1, simd_ge(a, b)),
+        6 => simd_and(k1, simd_gt(a, b)),
+        _ => k1, // always-true predicate: every lane passes, so only the zeromask remains
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4062,7 +3977,17 @@ pub unsafe fn _mm256_cmp_epu16_mask<const IMM8: i32>(a: __m256i, b: __m256i) ->
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_u16x16();
     let b = b.as_u16x16();
-    vpcmpuw256(a, b, IMM8, 0b11111111_11111111)
+    let r = match IMM8 {
+        0 => simd_eq(a, b),
+        1 => simd_lt(a, b),
+        2 => simd_le(a, b),
+        3 => i16x16::splat(0), // no lane set, so the bitmask is 0
+        4 => simd_ne(a, b),
+        5 => simd_ge(a, b),
+        6 => simd_gt(a, b),
+        _ => i16x16::splat(-1), // only 7 remains after the 3-bit assert: every lane set
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4081,7 +4006,18 @@ pub unsafe fn _mm256_mask_cmp_epu16_mask<const IMM8: i32>(
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_u16x16();
     let b = b.as_u16x16();
-    vpcmpuw256(a, b, IMM8, k1)
+    let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::splat(0));
+    let r = match IMM8 {
+        0 => simd_and(k1, simd_eq(a, b)),
+        1 => simd_and(k1, simd_lt(a, b)),
+        2 => simd_and(k1, simd_le(a, b)),
+        3 => i16x16::splat(0), // always-false predicate: no lane set regardless of k1
+        4 => simd_and(k1, simd_ne(a, b)),
+        5 => simd_and(k1, simd_ge(a, b)),
+        6 => simd_and(k1, simd_gt(a, b)),
+        _ => k1, // always-true predicate: every lane passes, so only the zeromask remains
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4096,7 +4032,17 @@ pub unsafe fn _mm_cmp_epu16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __m
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_u16x8();
     let b = b.as_u16x8();
-    vpcmpuw128(a, b, IMM8, 0b11111111)
+    let r = match IMM8 {
+        0 => simd_eq(a, b),
+        1 => simd_lt(a, b),
+        2 => simd_le(a, b),
+        3 => i16x8::splat(0), // no lane set, so the bitmask is 0
+        4 => simd_ne(a, b),
+        5 => simd_ge(a, b),
+        6 => simd_gt(a, b),
+        _ => i16x8::splat(-1), // only 7 remains after the 3-bit assert: every lane set
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4115,7 +4061,18 @@ pub unsafe fn _mm_mask_cmp_epu16_mask<const IMM8: i32>(
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_u16x8();
     let b = b.as_u16x8();
-    vpcmpuw128(a, b, IMM8, k1)
+    let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::splat(0));
+    let r = match IMM8 {
+        0 => simd_and(k1, simd_eq(a, b)),
+        1 => simd_and(k1, simd_lt(a, b)),
+        2 => simd_and(k1, simd_le(a, b)),
+        3 => i16x8::splat(0), // always-false predicate: no lane set regardless of k1
+        4 => simd_and(k1, simd_ne(a, b)),
+        5 => simd_and(k1, simd_ge(a, b)),
+        6 => simd_and(k1, simd_gt(a, b)),
+        _ => k1, // always-true predicate: every lane passes, so only the zeromask remains
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4130,12 +4087,17 @@ pub unsafe fn _mm512_cmp_epu8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> _
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_u8x64();
     let b = b.as_u8x64();
-    vpcmpub(
-        a,
-        b,
-        IMM8,
-        0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
-    )
+    let r = match IMM8 {
+        0 => simd_eq(a, b),
+        1 => simd_lt(a, b),
+        2 => simd_le(a, b),
+        3 => i8x64::splat(0), // no lane set, so the bitmask is 0
+        4 => simd_ne(a, b),
+        5 => simd_ge(a, b),
+        6 => simd_gt(a, b),
+        _ => i8x64::splat(-1), // only 7 remains after the 3-bit assert: every lane set
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4154,7 +4116,18 @@ pub unsafe fn _mm512_mask_cmp_epu8_mask<const IMM8: i32>(
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_u8x64();
     let b = b.as_u8x64();
-    vpcmpub(a, b, IMM8, k1)
+    let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::splat(0));
+    let r = match IMM8 {
+        0 => simd_and(k1, simd_eq(a, b)),
+        1 => simd_and(k1, simd_lt(a, b)),
+        2 => simd_and(k1, simd_le(a, b)),
+        3 => i8x64::splat(0), // always-false predicate: no lane set regardless of k1
+        4 => simd_and(k1, simd_ne(a, b)),
+        5 => simd_and(k1, simd_ge(a, b)),
+        6 => simd_and(k1, simd_gt(a, b)),
+        _ => k1, // always-true predicate: every lane passes, so only the zeromask remains
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4169,7 +4142,17 @@ pub unsafe fn _mm256_cmp_epu8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> _
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_u8x32();
     let b = b.as_u8x32();
-    vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
+    let r = match IMM8 {
+        0 => simd_eq(a, b),
+        1 => simd_lt(a, b),
+        2 => simd_le(a, b),
+        3 => i8x32::splat(0), // no lane set, so the bitmask is 0
+        4 => simd_ne(a, b),
+        5 => simd_ge(a, b),
+        6 => simd_gt(a, b),
+        _ => i8x32::splat(-1), // only 7 remains after the 3-bit assert: every lane set
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4188,7 +4171,18 @@ pub unsafe fn _mm256_mask_cmp_epu8_mask<const IMM8: i32>(
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_u8x32();
     let b = b.as_u8x32();
-    vpcmpub256(a, b, IMM8, k1)
+    let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::splat(0));
+    let r = match IMM8 {
+        0 => simd_and(k1, simd_eq(a, b)),
+        1 => simd_and(k1, simd_lt(a, b)),
+        2 => simd_and(k1, simd_le(a, b)),
+        3 => i8x32::splat(0), // always-false predicate: no lane set regardless of k1
+        4 => simd_and(k1, simd_ne(a, b)),
+        5 => simd_and(k1, simd_ge(a, b)),
+        6 => simd_and(k1, simd_gt(a, b)),
+        _ => k1, // always-true predicate: every lane passes, so only the zeromask remains
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4203,7 +4197,17 @@ pub unsafe fn _mm_cmp_epu8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mm
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_u8x16();
     let b = b.as_u8x16();
-    vpcmpub128(a, b, IMM8, 0b11111111_11111111)
+    let r = match IMM8 {
+        0 => simd_eq(a, b),
+        1 => simd_lt(a, b),
+        2 => simd_le(a, b),
+        3 => i8x16::splat(0), // no lane set, so the bitmask is 0
+        4 => simd_ne(a, b),
+        5 => simd_ge(a, b),
+        6 => simd_gt(a, b),
+        _ => i8x16::splat(-1), // only 7 remains after the 3-bit assert: every lane set
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4222,7 +4226,18 @@ pub unsafe fn _mm_mask_cmp_epu8_mask<const IMM8: i32>(
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_u8x16();
     let b = b.as_u8x16();
-    vpcmpub128(a, b, IMM8, k1)
+    let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::splat(0));
+    let r = match IMM8 {
+        0 => simd_and(k1, simd_eq(a, b)),
+        1 => simd_and(k1, simd_lt(a, b)),
+        2 => simd_and(k1, simd_le(a, b)),
+        3 => i8x16::splat(0), // always-false predicate: no lane set regardless of k1
+        4 => simd_and(k1, simd_ne(a, b)),
+        5 => simd_and(k1, simd_ge(a, b)),
+        6 => simd_and(k1, simd_gt(a, b)),
+        _ => k1, // always-true predicate: every lane passes, so only the zeromask remains
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4237,7 +4252,17 @@ pub unsafe fn _mm512_cmp_epi16_mask<const IMM8: i32>(a: __m512i, b: __m512i) ->
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_i16x32();
     let b = b.as_i16x32();
-    vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
+    let r = match IMM8 {
+        0 => simd_eq(a, b),
+        1 => simd_lt(a, b),
+        2 => simd_le(a, b),
+        3 => i16x32::splat(0), // no lane set, so the bitmask is 0
+        4 => simd_ne(a, b),
+        5 => simd_ge(a, b),
+        6 => simd_gt(a, b),
+        _ => i16x32::splat(-1), // only 7 remains after the 3-bit assert: every lane set
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4256,7 +4281,18 @@ pub unsafe fn _mm512_mask_cmp_epi16_mask<const IMM8: i32>(
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_i16x32();
     let b = b.as_i16x32();
-    vpcmpw(a, b, IMM8, k1)
+    let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::splat(0));
+    let r = match IMM8 {
+        0 => simd_and(k1, simd_eq(a, b)),
+        1 => simd_and(k1, simd_lt(a, b)),
+        2 => simd_and(k1, simd_le(a, b)),
+        3 => i16x32::splat(0), // always-false predicate: no lane set regardless of k1
+        4 => simd_and(k1, simd_ne(a, b)),
+        5 => simd_and(k1, simd_ge(a, b)),
+        6 => simd_and(k1, simd_gt(a, b)),
+        _ => k1, // always-true predicate: every lane passes, so only the zeromask remains
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4271,7 +4307,17 @@ pub unsafe fn _mm256_cmp_epi16_mask<const IMM8: i32>(a: __m256i, b: __m256i) ->
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_i16x16();
     let b = b.as_i16x16();
-    vpcmpw256(a, b, IMM8, 0b11111111_11111111)
+    let r = match IMM8 {
+        0 => simd_eq(a, b),
+        1 => simd_lt(a, b),
+        2 => simd_le(a, b),
+        3 => i16x16::splat(0), // no lane set, so the bitmask is 0
+        4 => simd_ne(a, b),
+        5 => simd_ge(a, b),
+        6 => simd_gt(a, b),
+        _ => i16x16::splat(-1), // only 7 remains after the 3-bit assert: every lane set
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4290,7 +4336,18 @@ pub unsafe fn _mm256_mask_cmp_epi16_mask<const IMM8: i32>(
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_i16x16();
     let b = b.as_i16x16();
-    vpcmpw256(a, b, IMM8, k1)
+    let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::splat(0));
+    let r = match IMM8 {
+        0 => simd_and(k1, simd_eq(a, b)),
+        1 => simd_and(k1, simd_lt(a, b)),
+        2 => simd_and(k1, simd_le(a, b)),
+        3 => i16x16::splat(0), // always-false predicate: no lane set regardless of k1
+        4 => simd_and(k1, simd_ne(a, b)),
+        5 => simd_and(k1, simd_ge(a, b)),
+        6 => simd_and(k1, simd_gt(a, b)),
+        _ => k1, // always-true predicate: every lane passes, so only the zeromask remains
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4305,7 +4362,17 @@ pub unsafe fn _mm_cmp_epi16_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __m
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_i16x8();
     let b = b.as_i16x8();
-    vpcmpw128(a, b, IMM8, 0b11111111)
+    let r = match IMM8 {
+        0 => simd_eq(a, b),
+        1 => simd_lt(a, b),
+        2 => simd_le(a, b),
+        3 => i16x8::splat(0), // no lane set, so the bitmask is 0
+        4 => simd_ne(a, b),
+        5 => simd_ge(a, b),
+        6 => simd_gt(a, b),
+        _ => i16x8::splat(-1), // only 7 remains after the 3-bit assert: every lane set
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4324,7 +4391,18 @@ pub unsafe fn _mm_mask_cmp_epi16_mask<const IMM8: i32>(
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_i16x8();
     let b = b.as_i16x8();
-    vpcmpw128(a, b, IMM8, k1)
+    let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::splat(0));
+    let r = match IMM8 {
+        0 => simd_and(k1, simd_eq(a, b)),
+        1 => simd_and(k1, simd_lt(a, b)),
+        2 => simd_and(k1, simd_le(a, b)),
+        3 => i16x8::splat(0), // always-false predicate: no lane set regardless of k1
+        4 => simd_and(k1, simd_ne(a, b)),
+        5 => simd_and(k1, simd_ge(a, b)),
+        6 => simd_and(k1, simd_gt(a, b)),
+        _ => k1, // always-true predicate: every lane passes, so only the zeromask remains
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4339,12 +4417,17 @@ pub unsafe fn _mm512_cmp_epi8_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> _
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_i8x64();
     let b = b.as_i8x64();
-    vpcmpb(
-        a,
-        b,
-        IMM8,
-        0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111,
-    )
+    let r = match IMM8 {
+        0 => simd_eq(a, b),
+        1 => simd_lt(a, b),
+        2 => simd_le(a, b),
+        3 => i8x64::splat(0), // no lane set, so the bitmask is 0
+        4 => simd_ne(a, b),
+        5 => simd_ge(a, b),
+        6 => simd_gt(a, b),
+        _ => i8x64::splat(-1), // only 7 remains after the 3-bit assert: every lane set
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4363,7 +4446,18 @@ pub unsafe fn _mm512_mask_cmp_epi8_mask<const IMM8: i32>(
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_i8x64();
     let b = b.as_i8x64();
-    vpcmpb(a, b, IMM8, k1)
+    let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::splat(0));
+    let r = match IMM8 {
+        0 => simd_and(k1, simd_eq(a, b)),
+        1 => simd_and(k1, simd_lt(a, b)),
+        2 => simd_and(k1, simd_le(a, b)),
+        3 => i8x64::splat(0), // always-false predicate: no lane set regardless of k1
+        4 => simd_and(k1, simd_ne(a, b)),
+        5 => simd_and(k1, simd_ge(a, b)),
+        6 => simd_and(k1, simd_gt(a, b)),
+        _ => k1, // always-true predicate: every lane passes, so only the zeromask remains
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4378,7 +4472,17 @@ pub unsafe fn _mm256_cmp_epi8_mask<const IMM8: i32>(a: __m256i, b: __m256i) -> _
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_i8x32();
     let b = b.as_i8x32();
-    vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111)
+    let r = match IMM8 {
+        0 => simd_eq(a, b),
+        1 => simd_lt(a, b),
+        2 => simd_le(a, b),
+        3 => i8x32::splat(0), // no lane set, so the bitmask is 0
+        4 => simd_ne(a, b),
+        5 => simd_ge(a, b),
+        6 => simd_gt(a, b),
+        _ => i8x32::splat(-1), // only 7 remains after the 3-bit assert: every lane set
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4397,7 +4501,18 @@ pub unsafe fn _mm256_mask_cmp_epi8_mask<const IMM8: i32>(
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_i8x32();
     let b = b.as_i8x32();
-    vpcmpb256(a, b, IMM8, k1)
+    let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::splat(0));
+    let r = match IMM8 {
+        0 => simd_and(k1, simd_eq(a, b)),
+        1 => simd_and(k1, simd_lt(a, b)),
+        2 => simd_and(k1, simd_le(a, b)),
+        3 => i8x32::splat(0), // always-false predicate: no lane set regardless of k1
+        4 => simd_and(k1, simd_ne(a, b)),
+        5 => simd_and(k1, simd_ge(a, b)),
+        6 => simd_and(k1, simd_gt(a, b)),
+        _ => k1, // always-true predicate: every lane passes, so only the zeromask remains
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -4412,7 +4527,17 @@ pub unsafe fn _mm_cmp_epi8_mask<const IMM8: i32>(a: __m128i, b: __m128i) -> __mm
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_i8x16();
     let b = b.as_i8x16();
-    vpcmpb128(a, b, IMM8, 0b11111111_11111111)
+    let r = match IMM8 {
+        0 => simd_eq(a, b),
+        1 => simd_lt(a, b),
+        2 => simd_le(a, b),
+        3 => i8x16::splat(0), // no lane set, so the bitmask is 0
+        4 => simd_ne(a, b),
+        5 => simd_ge(a, b),
+        6 => simd_gt(a, b),
+        _ => i8x16::splat(-1), // only 7 remains after the 3-bit assert: every lane set
+    };
+    simd_bitmask(r)
 }
 
 /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -4431,139 +4556,838 @@ pub unsafe fn _mm_mask_cmp_epi8_mask<const IMM8: i32>(
     static_assert_uimm_bits!(IMM8, 3);
     let a = a.as_i8x16();
     let b = b.as_i8x16();
-    vpcmpb128(a, b, IMM8, k1)
+    let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::splat(0));
+    let r = match IMM8 {
+        0 => simd_and(k1, simd_eq(a, b)),
+        1 => simd_and(k1, simd_lt(a, b)),
+        2 => simd_and(k1, simd_le(a, b)),
+        3 => i8x16::splat(0), // always-false predicate: no lane set regardless of k1
+        4 => simd_and(k1, simd_ne(a, b)),
+        5 => simd_and(k1, simd_ge(a, b)),
+        6 => simd_and(k1, simd_gt(a, b)),
+        _ => k1, // always-true predicate: every lane passes, so only the zeromask remains
+    };
+    simd_bitmask(r)
 }
 
-/// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+/// Reduce the packed 16-bit integers in a by addition. Returns the sum of all elements in a.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi16&expand=3368)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_add_epi16)
 #[inline]
-#[target_feature(enable = "avx512bw")]
+#[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16
-pub unsafe fn _mm512_loadu_epi16(mem_addr: *const i16) -> __m512i {
-    ptr::read_unaligned(mem_addr as *const __m512i)
+pub unsafe fn _mm256_reduce_add_epi16(a: __m256i) -> i16 {
+    simd_reduce_add_unordered(a.as_i16x16())
 }
 
-/// Load 256-bits (composed of 16 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+/// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi16&expand=3365)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_add_epi16)
 #[inline]
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16
-pub unsafe fn _mm256_loadu_epi16(mem_addr: *const i16) -> __m256i {
-    ptr::read_unaligned(mem_addr as *const __m256i)
+pub unsafe fn _mm256_mask_reduce_add_epi16(k: __mmask16, a: __m256i) -> i16 {
+    simd_reduce_add_unordered(simd_select_bitmask(
+        k,
+        a.as_i16x16(),
+        _mm256_setzero_si256().as_i16x16(),
+    ))
 }
 
-/// Load 128-bits (composed of 8 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+/// Reduce the packed 16-bit integers in a by addition. Returns the sum of all elements in a.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi16&expand=3362)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_add_epi16)
 #[inline]
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16
-pub unsafe fn _mm_loadu_epi16(mem_addr: *const i16) -> __m128i {
-    ptr::read_unaligned(mem_addr as *const __m128i)
+pub unsafe fn _mm_reduce_add_epi16(a: __m128i) -> i16 {
+    simd_reduce_add_unordered(a.as_i16x8())
 }
 
-/// Load 512-bits (composed of 64 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+/// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi8&expand=3395)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_add_epi16)
 #[inline]
-#[target_feature(enable = "avx512bw")]
+#[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8
-pub unsafe fn _mm512_loadu_epi8(mem_addr: *const i8) -> __m512i {
-    ptr::read_unaligned(mem_addr as *const __m512i)
+pub unsafe fn _mm_mask_reduce_add_epi16(k: __mmask8, a: __m128i) -> i16 {
+    simd_reduce_add_unordered(simd_select_bitmask(
+        k,
+        a.as_i16x8(),
+        _mm_setzero_si128().as_i16x8(),
+    ))
 }
 
-/// Load 256-bits (composed of 32 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+/// Reduce the packed 8-bit integers in a by addition. Returns the sum of all elements in a.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi8&expand=3392)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_add_epi8)
 #[inline]
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8
-pub unsafe fn _mm256_loadu_epi8(mem_addr: *const i8) -> __m256i {
-    ptr::read_unaligned(mem_addr as *const __m256i)
+pub unsafe fn _mm256_reduce_add_epi8(a: __m256i) -> i8 {
+    simd_reduce_add_unordered(a.as_i8x32())
 }
 
-/// Load 128-bits (composed of 16 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+/// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi8&expand=3389)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_add_epi8)
 #[inline]
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8
-pub unsafe fn _mm_loadu_epi8(mem_addr: *const i8) -> __m128i {
-    ptr::read_unaligned(mem_addr as *const __m128i)
+pub unsafe fn _mm256_mask_reduce_add_epi8(k: __mmask32, a: __m256i) -> i8 {
+    simd_reduce_add_unordered(simd_select_bitmask(
+        k,
+        a.as_i8x32(),
+        _mm256_setzero_si256().as_i8x32(),
+    ))
 }
 
-/// Store 512-bits (composed of 32 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+/// Reduce the packed 8-bit integers in a by addition. Returns the sum of all elements in a.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi16&expand=5622)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_add_epi8)
 #[inline]
-#[target_feature(enable = "avx512bw")]
+#[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16
-pub unsafe fn _mm512_storeu_epi16(mem_addr: *mut i16, a: __m512i) {
-    ptr::write_unaligned(mem_addr as *mut __m512i, a);
+pub unsafe fn _mm_reduce_add_epi8(a: __m128i) -> i8 {
+    simd_reduce_add_unordered(a.as_i8x16())
 }
 
-/// Store 256-bits (composed of 16 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+/// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi16&expand=5620)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_add_epi8)
 #[inline]
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16
-pub unsafe fn _mm256_storeu_epi16(mem_addr: *mut i16, a: __m256i) {
-    ptr::write_unaligned(mem_addr as *mut __m256i, a);
+pub unsafe fn _mm_mask_reduce_add_epi8(k: __mmask16, a: __m128i) -> i8 {
+    simd_reduce_add_unordered(simd_select_bitmask(
+        k,
+        a.as_i8x16(),
+        _mm_setzero_si128().as_i8x16(),
+    ))
 }
 
-/// Store 128-bits (composed of 8 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+/// Reduce the packed 16-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi16&expand=5618)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_and_epi16)
 #[inline]
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16
-pub unsafe fn _mm_storeu_epi16(mem_addr: *mut i16, a: __m128i) {
-    ptr::write_unaligned(mem_addr as *mut __m128i, a);
+pub unsafe fn _mm256_reduce_and_epi16(a: __m256i) -> i16 {
+    simd_reduce_and(a.as_i16x16())
 }
 
-/// Store 512-bits (composed of 64 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+/// Reduce the packed 16-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi8&expand=5640)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_and_epi16)
 #[inline]
-#[target_feature(enable = "avx512bw")]
+#[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8
-pub unsafe fn _mm512_storeu_epi8(mem_addr: *mut i8, a: __m512i) {
-    ptr::write_unaligned(mem_addr as *mut __m512i, a);
+pub unsafe fn _mm256_mask_reduce_and_epi16(k: __mmask16, a: __m256i) -> i16 {
+    simd_reduce_and(simd_select_bitmask(
+        k,
+        a.as_i16x16(),
+        _mm256_set1_epi64x(-1).as_i16x16(),
+    ))
 }
 
-/// Store 256-bits (composed of 32 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+/// Reduce the packed 16-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi8&expand=5638)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_and_epi16)
 #[inline]
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8
-pub unsafe fn _mm256_storeu_epi8(mem_addr: *mut i8, a: __m256i) {
-    ptr::write_unaligned(mem_addr as *mut __m256i, a);
+pub unsafe fn _mm_reduce_and_epi16(a: __m128i) -> i16 {
+    simd_reduce_and(a.as_i16x8())
 }
 
-/// Store 128-bits (composed of 16 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+/// Reduce the packed 16-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi8&expand=5636)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_and_epi16)
 #[inline]
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8
-pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) {
-    ptr::write_unaligned(mem_addr as *mut __m128i, a);
+pub unsafe fn _mm_mask_reduce_and_epi16(k: __mmask8, a: __m128i) -> i16 {
+    simd_reduce_and(simd_select_bitmask(
+        k,
+        a.as_i16x8(),
+        _mm_set1_epi64x(-1).as_i16x8(),
+    ))
+}
+
+/// Reduce the packed 8-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_and_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_and_epi8(a: __m256i) -> i8 {
+    simd_reduce_and(a.as_i8x32())
+}
+
+/// Reduce the packed 8-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_and_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_and_epi8(k: __mmask32, a: __m256i) -> i8 {
+    simd_reduce_and(simd_select_bitmask(
+        k,
+        a.as_i8x32(),
+        _mm256_set1_epi64x(-1).as_i8x32(),
+    ))
+}
+
+/// Reduce the packed 8-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_and_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_and_epi8(a: __m128i) -> i8 {
+    simd_reduce_and(a.as_i8x16())
+}
+
+/// Reduce the packed 8-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_and_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_and_epi8(k: __mmask16, a: __m128i) -> i8 {
+    simd_reduce_and(simd_select_bitmask(
+        k,
+        a.as_i8x16(),
+        _mm_set1_epi64x(-1).as_i8x16(),
+    ))
+}
+
+/// Reduce the packed 16-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_max_epi16(a: __m256i) -> i16 {
+    simd_reduce_max(a.as_i16x16())
+}
+
+/// Reduce the packed 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_max_epi16(k: __mmask16, a: __m256i) -> i16 {
+    simd_reduce_max(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(-32768)))
+}
+
+/// Reduce the packed 16-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_max_epi16(a: __m128i) -> i16 {
+    simd_reduce_max(a.as_i16x8())
+}
+
+/// Reduce the packed 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_max_epi16(k: __mmask8, a: __m128i) -> i16 {
+    simd_reduce_max(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(-32768)))
+}
+
+/// Reduce the packed 8-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_max_epi8(a: __m256i) -> i8 {
+    simd_reduce_max(a.as_i8x32())
+}
+
+/// Reduce the packed 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_max_epi8(k: __mmask32, a: __m256i) -> i8 {
+    simd_reduce_max(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(-128)))
+}
+
+/// Reduce the packed 8-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_max_epi8(a: __m128i) -> i8 {
+    simd_reduce_max(a.as_i8x16())
+}
+
+/// Reduce the packed 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_max_epi8(k: __mmask16, a: __m128i) -> i8 {
+    simd_reduce_max(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(-128)))
+}
+
+/// Reduce the packed unsigned 16-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epu16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_max_epu16(a: __m256i) -> u16 {
+    simd_reduce_max(a.as_u16x16())
+}
+
+/// Reduce the packed unsigned 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epu16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_max_epu16(k: __mmask16, a: __m256i) -> u16 {
+    simd_reduce_max(simd_select_bitmask(k, a.as_u16x16(), u16x16::splat(0)))
+}
+
+/// Reduce the packed unsigned 16-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epu16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_max_epu16(a: __m128i) -> u16 {
+    simd_reduce_max(a.as_u16x8())
+}
+
+/// Reduce the packed unsigned 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epu16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_max_epu16(k: __mmask8, a: __m128i) -> u16 {
+    simd_reduce_max(simd_select_bitmask(k, a.as_u16x8(), u16x8::splat(0)))
+}
+
+/// Reduce the packed unsigned 8-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epu8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_max_epu8(a: __m256i) -> u8 {
+    simd_reduce_max(a.as_u8x32())
+}
+
+/// Reduce the packed unsigned 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epu8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_max_epu8(k: __mmask32, a: __m256i) -> u8 {
+    simd_reduce_max(simd_select_bitmask(k, a.as_u8x32(), u8x32::splat(0)))
+}
+
+/// Reduce the packed unsigned 8-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epu8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_max_epu8(a: __m128i) -> u8 {
+    simd_reduce_max(a.as_u8x16())
+}
+
+/// Reduce the packed unsigned 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epu8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_max_epu8(k: __mmask16, a: __m128i) -> u8 {
+    simd_reduce_max(simd_select_bitmask(k, a.as_u8x16(), u8x16::splat(0)))
+}
+
+/// Reduce the packed 16-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_min_epi16(a: __m256i) -> i16 {
+    simd_reduce_min(a.as_i16x16())
+}
+
+/// Reduce the packed 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_min_epi16(k: __mmask16, a: __m256i) -> i16 {
+    simd_reduce_min(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(0x7fff)))
+}
+
+/// Reduce the packed 16-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_min_epi16(a: __m128i) -> i16 {
+    simd_reduce_min(a.as_i16x8())
+}
+
+/// Reduce the packed 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_min_epi16(k: __mmask8, a: __m128i) -> i16 {
+    simd_reduce_min(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(0x7fff)))
+}
+
+/// Reduce the packed 8-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_min_epi8(a: __m256i) -> i8 {
+    simd_reduce_min(a.as_i8x32())
+}
+
+/// Reduce the packed 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_min_epi8(k: __mmask32, a: __m256i) -> i8 {
+    simd_reduce_min(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(0x7f)))
+}
+
+/// Reduce the packed 8-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_min_epi8(a: __m128i) -> i8 {
+    simd_reduce_min(a.as_i8x16())
+}
+
+/// Reduce the packed 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_min_epi8(k: __mmask16, a: __m128i) -> i8 {
+    simd_reduce_min(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(0x7f)))
+}
+
+/// Reduce the packed unsigned 16-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epu16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_min_epu16(a: __m256i) -> u16 {
+    simd_reduce_min(a.as_u16x16())
+}
+
+/// Reduce the packed unsigned 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epu16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_min_epu16(k: __mmask16, a: __m256i) -> u16 {
+    simd_reduce_min(simd_select_bitmask(k, a.as_u16x16(), u16x16::splat(0xffff)))
+}
+
+/// Reduce the packed unsigned 16-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epu16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_min_epu16(a: __m128i) -> u16 {
+    simd_reduce_min(a.as_u16x8())
+}
+
+/// Reduce the packed unsigned 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epu16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_min_epu16(k: __mmask8, a: __m128i) -> u16 {
+    simd_reduce_min(simd_select_bitmask(k, a.as_u16x8(), u16x8::splat(0xffff)))
+}
+
+/// Reduce the packed unsigned 8-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epu8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_min_epu8(a: __m256i) -> u8 {
+    simd_reduce_min(a.as_u8x32())
+}
+
+/// Reduce the packed unsigned 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epu8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_min_epu8(k: __mmask32, a: __m256i) -> u8 {
+    simd_reduce_min(simd_select_bitmask(k, a.as_u8x32(), u8x32::splat(0xff)))
+}
+
+/// Reduce the packed unsigned 8-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epu8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_min_epu8(a: __m128i) -> u8 {
+    simd_reduce_min(a.as_u8x16())
+}
+
+/// Reduce the packed unsigned 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epu8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_min_epu8(k: __mmask16, a: __m128i) -> u8 {
+    simd_reduce_min(simd_select_bitmask(k, a.as_u8x16(), u8x16::splat(0xff)))
+}
+
+/// Reduce the packed 16-bit integers in a by multiplication. Returns the product of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_mul_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_mul_epi16(a: __m256i) -> i16 {
+    simd_reduce_mul_unordered(a.as_i16x16())
+}
+
+/// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_mul_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_mul_epi16(k: __mmask16, a: __m256i) -> i16 {
+    simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(1)))
+}
+
+/// Reduce the packed 16-bit integers in a by multiplication. Returns the product of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_mul_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_mul_epi16(a: __m128i) -> i16 {
+    simd_reduce_mul_unordered(a.as_i16x8())
+}
+
+/// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_mul_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_mul_epi16(k: __mmask8, a: __m128i) -> i16 {
+    simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(1)))
+}
+
+/// Reduce the packed 8-bit integers in a by multiplication. Returns the product of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_mul_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_mul_epi8(a: __m256i) -> i8 {
+    simd_reduce_mul_unordered(a.as_i8x32())
+}
+
+/// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_mul_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_mul_epi8(k: __mmask32, a: __m256i) -> i8 {
+    simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(1)))
+}
+
+/// Reduce the packed 8-bit integers in a by multiplication. Returns the product of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_mul_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_mul_epi8(a: __m128i) -> i8 {
+    simd_reduce_mul_unordered(a.as_i8x16())
+}
+
+/// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_mul_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_mul_epi8(k: __mmask16, a: __m128i) -> i8 {
+    simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(1)))
+}
+
+/// Reduce the packed 16-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_or_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_or_epi16(a: __m256i) -> i16 {
+    simd_reduce_or(a.as_i16x16())
+}
+
+/// Reduce the packed 16-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_or_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_or_epi16(k: __mmask16, a: __m256i) -> i16 {
+    simd_reduce_or(simd_select_bitmask(
+        k,
+        a.as_i16x16(),
+        _mm256_setzero_si256().as_i16x16(),
+    ))
+}
+
+/// Reduce the packed 16-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_or_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_or_epi16(a: __m128i) -> i16 {
+    simd_reduce_or(a.as_i16x8())
+}
+
+/// Reduce the packed 16-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_or_epi16)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_or_epi16(k: __mmask8, a: __m128i) -> i16 {
+    simd_reduce_or(simd_select_bitmask(
+        k,
+        a.as_i16x8(),
+        _mm_setzero_si128().as_i16x8(),
+    ))
+}
+
+/// Reduce the packed 8-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_or_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_reduce_or_epi8(a: __m256i) -> i8 {
+    simd_reduce_or(a.as_i8x32())
+}
+
+/// Reduce the packed 8-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_or_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_mask_reduce_or_epi8(k: __mmask32, a: __m256i) -> i8 {
+    simd_reduce_or(simd_select_bitmask(
+        k,
+        a.as_i8x32(),
+        _mm256_setzero_si256().as_i8x32(),
+    ))
+}
+
+/// Reduce the packed 8-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_or_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_reduce_or_epi8(a: __m128i) -> i8 {
+    simd_reduce_or(a.as_i8x16())
+}
+
+/// Reduce the packed 8-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_or_epi8)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_mask_reduce_or_epi8(k: __mmask16, a: __m128i) -> i8 {
+    simd_reduce_or(simd_select_bitmask(
+        k,
+        a.as_i8x16(),
+        _mm_setzero_si128().as_i8x16(),
+    ))
+}
+
+/// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi16&expand=3368)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16
+pub unsafe fn _mm512_loadu_epi16(mem_addr: *const i16) -> __m512i {
+    ptr::read_unaligned(mem_addr as *const __m512i)
+}
+
+/// Load 256-bits (composed of 16 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi16&expand=3365)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16
+pub unsafe fn _mm256_loadu_epi16(mem_addr: *const i16) -> __m256i {
+    ptr::read_unaligned(mem_addr as *const __m256i)
+}
+
+/// Load 128-bits (composed of 8 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi16&expand=3362)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16
+pub unsafe fn _mm_loadu_epi16(mem_addr: *const i16) -> __m128i {
+    ptr::read_unaligned(mem_addr as *const __m128i)
+}
+
+/// Load 512-bits (composed of 64 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi8&expand=3395)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8
+pub unsafe fn _mm512_loadu_epi8(mem_addr: *const i8) -> __m512i {
+    ptr::read_unaligned(mem_addr as *const __m512i)
+}
+
+/// Load 256-bits (composed of 32 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi8&expand=3392)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8
+pub unsafe fn _mm256_loadu_epi8(mem_addr: *const i8) -> __m256i {
+    ptr::read_unaligned(mem_addr as *const __m256i)
+}
+
+/// Load 128-bits (composed of 16 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi8&expand=3389)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8
+pub unsafe fn _mm_loadu_epi8(mem_addr: *const i8) -> __m128i {
+    ptr::read_unaligned(mem_addr as *const __m128i)
+}
+
+/// Store 512-bits (composed of 32 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi16&expand=5622)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16
+pub unsafe fn _mm512_storeu_epi16(mem_addr: *mut i16, a: __m512i) {
+    ptr::write_unaligned(mem_addr as *mut __m512i, a);
+}
+
+/// Store 256-bits (composed of 16 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi16&expand=5620)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16
+pub unsafe fn _mm256_storeu_epi16(mem_addr: *mut i16, a: __m256i) {
+    ptr::write_unaligned(mem_addr as *mut __m256i, a);
+}
+
+/// Store 128-bits (composed of 8 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi16&expand=5618)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16
+pub unsafe fn _mm_storeu_epi16(mem_addr: *mut i16, a: __m128i) {
+    ptr::write_unaligned(mem_addr as *mut __m128i, a);
+}
+
+/// Store 512-bits (composed of 64 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi8&expand=5640)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8
+pub unsafe fn _mm512_storeu_epi8(mem_addr: *mut i8, a: __m512i) {
+    ptr::write_unaligned(mem_addr as *mut __m512i, a);
+}
+
+/// Store 256-bits (composed of 32 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi8&expand=5638)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8
+pub unsafe fn _mm256_storeu_epi8(mem_addr: *mut i8, a: __m256i) {
+    ptr::write_unaligned(mem_addr as *mut __m256i, a);
+}
+
+/// Store 128-bits (composed of 16 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi8&expand=5636)
+#[inline]
+#[target_feature(enable = "avx512bw,avx512vl")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8
+pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) {
+    ptr::write_unaligned(mem_addr as *mut __m128i, a);
 }
 
 /// Load packed 16-bit integers from memory into dst using writemask k
@@ -5505,7 +6329,10 @@ pub unsafe fn _mm_maskz_packus_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpavgw))]
 pub unsafe fn _mm512_avg_epu16(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpavgw(a.as_u16x32(), b.as_u16x32()))
+    let a = simd_cast::<_, u32x32>(a.as_u16x32());
+    let b = simd_cast::<_, u32x32>(b.as_u16x32());
+    let r = simd_shr(simd_add(simd_add(a, b), u32x32::splat(1)), u32x32::splat(1));
+    transmute(simd_cast::<_, u16x32>(r))
 }
 
 /// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -5591,7 +6418,10 @@ pub unsafe fn _mm_maskz_avg_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpavgb))]
 pub unsafe fn _mm512_avg_epu8(a: __m512i, b: __m512i) -> __m512i {
-    transmute(vpavgb(a.as_u8x64(), b.as_u8x64()))
+    let a = simd_cast::<_, u16x64>(a.as_u8x64());
+    let b = simd_cast::<_, u16x64>(b.as_u8x64());
+    let r = simd_shr(simd_add(simd_add(a, b), u16x64::splat(1)), u16x64::splat(1));
+    transmute(simd_cast::<_, u8x64>(r))
 }
 
 /// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -9221,6 +10051,26 @@ pub unsafe fn _mm_movm_epi8(k: __mmask16) -> __m128i {
     transmute(simd_select_bitmask(k, one, zero))
 }
 
+/// Convert 32-bit mask a into an integer value, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtmask32_u32)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _cvtmask32_u32(a: __mmask32) -> u32 {
+    a
+}
+
+/// Convert integer value a into a 32-bit mask, and store the result in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu32_mask32)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _cvtu32_mask32(a: u32) -> __mmask32 {
+    a
+}
+
 /// Add 32-bit masks in a and b, and store the result in k.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask32&expand=3207)
@@ -9257,108 +10107,314 @@ pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
 #[inline]
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
-    a & b
+pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
+    a & b
+}
+
+/// Compute the bitwise NOT of 32-bit mask a, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask32&expand=3234)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 {
+    !a
+}
+
+/// Compute the bitwise NOT of 64-bit mask a, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask64&expand=3235)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 {
+    !a
+}
+
+/// Compute the bitwise NOT of 32-bit masks a and then AND with b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask32&expand=3219)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
+    _knot_mask32(a) & b
+}
+
+/// Compute the bitwise NOT of 64-bit masks a and then AND with b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask64&expand=3220)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
+    _knot_mask64(a) & b
+}
+
+/// Compute the bitwise OR of 32-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask32&expand=3240)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
+    a | b
+}
+
+/// Compute the bitwise OR of 64-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask64&expand=3241)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
+    a | b
+}
+
+/// Compute the bitwise XOR of 32-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask32&expand=3292)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
+    a ^ b
+}
+
+/// Compute the bitwise XOR of 64-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask64&expand=3293)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
+    a ^ b
+}
+
+/// Compute the bitwise XNOR of 32-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask32&expand=3286)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
+    _knot_mask32(a ^ b)
+}
+
+/// Compute the bitwise XNOR of 64-bit masks a and b, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask64&expand=3287)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
+    _knot_mask64(a ^ b)
+}
+
+/// Compute the bitwise OR of 32-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask32_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kortest_mask32_u8(a: __mmask32, b: __mmask32, all_ones: *mut u8) -> u8 {
+    let tmp = _kor_mask32(a, b);
+    *all_ones = (tmp == 0xffffffff) as u8;
+    (tmp == 0) as u8
+}
+
+/// Compute the bitwise OR of 64-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask64_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kortest_mask64_u8(a: __mmask64, b: __mmask64, all_ones: *mut u8) -> u8 {
+    let tmp = _kor_mask64(a, b);
+    *all_ones = (tmp == 0xffffffff_ffffffff) as u8;
+    (tmp == 0) as u8
+}
+
+/// Compute the bitwise OR of 32-bit masks a and b. If the result is all ones, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask32_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kortestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 {
+    (_kor_mask32(a, b) == 0xffffffff) as u8
+}
+
+/// Compute the bitwise OR of 64-bit masks a and b. If the result is all ones, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask64_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kortestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
+    (_kor_mask64(a, b) == 0xffffffff_ffffffff) as u8
+}
+
+/// Compute the bitwise OR of 32-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask32_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kortestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 {
+    (_kor_mask32(a, b) == 0) as u8
+}
+
+/// Compute the bitwise OR of 64-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask64_u8)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kortestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
+    (_kor_mask64(a, b) == 0) as u8
+}
+
+/// Shift the bits of 32-bit mask a left by count while shifting in zeros, and store the least significant 32 bits of the result in k. If count is 32 or more, the result is zero.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask32)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kshiftli_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
+    a.checked_shl(COUNT).unwrap_or(0) // `<<` would overflow for COUNT >= 32
+}
+
+/// Shift the bits of 64-bit mask a left by count while shifting in zeros, and store the least significant 64 bits of the result in k. If count is 64 or more, the result is zero.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask64)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kshiftli_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
+    a.checked_shl(COUNT).unwrap_or(0) // `<<` would overflow for COUNT >= 64
 }
 
-/// Compute the bitwise NOT of 32-bit mask a, and store the result in k.
+/// Shift the bits of 32-bit mask a right by count while shifting in zeros, and store the least significant 32 bits of the result in k. If count is 32 or more, the result is zero.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask32&expand=3234)
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask32)
 #[inline]
 #[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 {
-    a ^ 0b11111111_11111111_11111111_11111111
+pub unsafe fn _kshiftri_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
+    a.checked_shr(COUNT).unwrap_or(0) // `>>` would overflow for COUNT >= 32
 }
 
-/// Compute the bitwise NOT of 64-bit mask a, and store the result in k.
+/// Shift the bits of 64-bit mask a right by count while shifting in zeros, and store the least significant 64 bits of the result in k. If count is 64 or more, the result is zero.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask64&expand=3235)
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask64)
 #[inline]
 #[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 {
-    a ^ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+pub unsafe fn _kshiftri_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
+    a.checked_shr(COUNT).unwrap_or(0) // `>>` would overflow for COUNT >= 64
 }
 
-/// Compute the bitwise NOT of 32-bit masks a and then AND with b, and store the result in k.
+/// Compute the bitwise AND of 32-bit masks a and b, and if the result is all zeros, store 1 in dst,
+/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
+/// zeros, store 1 in and_not, otherwise store 0 in and_not.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask32&expand=3219)
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask32_u8)
 #[inline]
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
-    _knot_mask32(a) & b
+pub unsafe fn _ktest_mask32_u8(a: __mmask32, b: __mmask32, and_not: *mut u8) -> u8 {
+    *and_not = (_kandn_mask32(a, b) == 0) as u8;
+    (_kand_mask32(a, b) == 0) as u8
 }
 
-/// Compute the bitwise NOT of 64-bit masks a and then AND with b, and store the result in k.
+/// Compute the bitwise AND of 64-bit masks a and b, and if the result is all zeros, store 1 in dst,
+/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
+/// zeros, store 1 in and_not, otherwise store 0 in and_not.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask64&expand=3220)
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask64_u8)
 #[inline]
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
-    _knot_mask64(a) & b
+pub unsafe fn _ktest_mask64_u8(a: __mmask64, b: __mmask64, and_not: *mut u8) -> u8 {
+    *and_not = (_kandn_mask64(a, b) == 0) as u8;
+    (_kand_mask64(a, b) == 0) as u8
 }
 
-/// Compute the bitwise OR of 32-bit masks a and b, and store the result in k.
+/// Compute the bitwise NOT of 32-bit mask a and then AND with 32-bit mask b, if the result is all
+/// zeros, store 1 in dst, otherwise store 0 in dst.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask32&expand=3240)
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask32_u8)
 #[inline]
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
-    a | b
+pub unsafe fn _ktestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 {
+    (_kandn_mask32(a, b) == 0) as u8
 }
 
-/// Compute the bitwise OR of 64-bit masks a and b, and store the result in k.
+/// Compute the bitwise NOT of 64-bit mask a and then AND with 64-bit mask b, if the result is all
+/// zeros, store 1 in dst, otherwise store 0 in dst.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask64&expand=3241)
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask64_u8)
 #[inline]
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
-    a | b
+pub unsafe fn _ktestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
+    (_kandn_mask64(a, b) == 0) as u8
 }
 
-/// Compute the bitwise XOR of 32-bit masks a and b, and store the result in k.
+/// Compute the bitwise AND of 32-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask32&expand=3292)
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask32_u8)
 #[inline]
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
-    a ^ b
+pub unsafe fn _ktestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 {
+    (_kand_mask32(a, b) == 0) as u8
 }
 
-/// Compute the bitwise XOR of 64-bit masks a and b, and store the result in k.
+/// Compute the bitwise AND of 64-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise
+/// store 0 in dst.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask64&expand=3293)
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask64_u8)
 #[inline]
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
-    a ^ b
+pub unsafe fn _ktestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 {
+    (_kand_mask64(a, b) == 0) as u8
 }
 
-/// Compute the bitwise XNOR of 32-bit masks a and b, and store the result in k.
+/// Unpack and interleave 16 bits from masks a and b, and store the 32-bit result in k.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask32&expand=3286)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=512_kunpackw)
 #[inline]
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
-    _knot_mask32(a ^ b)
+#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckwd
+pub unsafe fn _mm512_kunpackw(a: __mmask32, b: __mmask32) -> __mmask32 {
+    ((a & 0xffff) << 16) | (b & 0xffff)
 }
 
-/// Compute the bitwise XNOR of 64-bit masks a and b, and store the result in k.
+/// Unpack and interleave 32 bits from masks a and b, and store the 64-bit result in k.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask64&expand=3287)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=512_kunpackd)
 #[inline]
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
-    _knot_mask64(a ^ b)
+#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckdq
+pub unsafe fn _mm512_kunpackd(a: __mmask64, b: __mmask64) -> __mmask64 {
+    ((a & 0xffffffff) << 32) | (b & 0xffffffff)
 }
 
 /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
@@ -10589,115 +11645,9 @@ pub unsafe fn _mm_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a:
 
 #[allow(improper_ctypes)]
 extern "C" {
-    #[link_name = "llvm.x86.avx512.mask.paddus.w.512"]
-    fn vpaddusw(a: u16x32, b: u16x32, src: u16x32, mask: u32) -> u16x32;
-    #[link_name = "llvm.x86.avx512.mask.paddus.w.256"]
-    fn vpaddusw256(a: u16x16, b: u16x16, src: u16x16, mask: u16) -> u16x16;
-    #[link_name = "llvm.x86.avx512.mask.paddus.w.128"]
-    fn vpaddusw128(a: u16x8, b: u16x8, src: u16x8, mask: u8) -> u16x8;
-
-    #[link_name = "llvm.x86.avx512.mask.paddus.b.512"]
-    fn vpaddusb(a: u8x64, b: u8x64, src: u8x64, mask: u64) -> u8x64;
-    #[link_name = "llvm.x86.avx512.mask.paddus.b.256"]
-    fn vpaddusb256(a: u8x32, b: u8x32, src: u8x32, mask: u32) -> u8x32;
-    #[link_name = "llvm.x86.avx512.mask.paddus.b.128"]
-    fn vpaddusb128(a: u8x16, b: u8x16, src: u8x16, mask: u16) -> u8x16;
-
-    #[link_name = "llvm.x86.avx512.mask.padds.w.512"]
-    fn vpaddsw(a: i16x32, b: i16x32, src: i16x32, mask: u32) -> i16x32;
-    #[link_name = "llvm.x86.avx512.mask.padds.w.256"]
-    fn vpaddsw256(a: i16x16, b: i16x16, src: i16x16, mask: u16) -> i16x16;
-    #[link_name = "llvm.x86.avx512.mask.padds.w.128"]
-    fn vpaddsw128(a: i16x8, b: i16x8, src: i16x8, mask: u8) -> i16x8;
-
-    #[link_name = "llvm.x86.avx512.mask.padds.b.512"]
-    fn vpaddsb(a: i8x64, b: i8x64, src: i8x64, mask: u64) -> i8x64;
-    #[link_name = "llvm.x86.avx512.mask.padds.b.256"]
-    fn vpaddsb256(a: i8x32, b: i8x32, src: i8x32, mask: u32) -> i8x32;
-    #[link_name = "llvm.x86.avx512.mask.padds.b.128"]
-    fn vpaddsb128(a: i8x16, b: i8x16, src: i8x16, mask: u16) -> i8x16;
-
-    #[link_name = "llvm.x86.avx512.mask.psubus.w.512"]
-    fn vpsubusw(a: u16x32, b: u16x32, src: u16x32, mask: u32) -> u16x32;
-    #[link_name = "llvm.x86.avx512.mask.psubus.w.256"]
-    fn vpsubusw256(a: u16x16, b: u16x16, src: u16x16, mask: u16) -> u16x16;
-    #[link_name = "llvm.x86.avx512.mask.psubus.w.128"]
-    fn vpsubusw128(a: u16x8, b: u16x8, src: u16x8, mask: u8) -> u16x8;
-
-    #[link_name = "llvm.x86.avx512.mask.psubus.b.512"]
-    fn vpsubusb(a: u8x64, b: u8x64, src: u8x64, mask: u64) -> u8x64;
-    #[link_name = "llvm.x86.avx512.mask.psubus.b.256"]
-    fn vpsubusb256(a: u8x32, b: u8x32, src: u8x32, mask: u32) -> u8x32;
-    #[link_name = "llvm.x86.avx512.mask.psubus.b.128"]
-    fn vpsubusb128(a: u8x16, b: u8x16, src: u8x16, mask: u16) -> u8x16;
-
-    #[link_name = "llvm.x86.avx512.mask.psubs.w.512"]
-    fn vpsubsw(a: i16x32, b: i16x32, src: i16x32, mask: u32) -> i16x32;
-    #[link_name = "llvm.x86.avx512.mask.psubs.w.256"]
-    fn vpsubsw256(a: i16x16, b: i16x16, src: i16x16, mask: u16) -> i16x16;
-    #[link_name = "llvm.x86.avx512.mask.psubs.w.128"]
-    fn vpsubsw128(a: i16x8, b: i16x8, src: i16x8, mask: u8) -> i16x8;
-
-    #[link_name = "llvm.x86.avx512.mask.psubs.b.512"]
-    fn vpsubsb(a: i8x64, b: i8x64, src: i8x64, mask: u64) -> i8x64;
-    #[link_name = "llvm.x86.avx512.mask.psubs.b.256"]
-    fn vpsubsb256(a: i8x32, b: i8x32, src: i8x32, mask: u32) -> i8x32;
-    #[link_name = "llvm.x86.avx512.mask.psubs.b.128"]
-    fn vpsubsb128(a: i8x16, b: i8x16, src: i8x16, mask: u16) -> i8x16;
-
-    #[link_name = "llvm.x86.avx512.pmulhu.w.512"]
-    fn vpmulhuw(a: u16x32, b: u16x32) -> u16x32;
-    #[link_name = "llvm.x86.avx512.pmulh.w.512"]
-    fn vpmulhw(a: i16x32, b: i16x32) -> i16x32;
     #[link_name = "llvm.x86.avx512.pmul.hr.sw.512"]
     fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;
 
-    #[link_name = "llvm.x86.avx512.mask.ucmp.w.512"]
-    fn vpcmpuw(a: u16x32, b: u16x32, op: i32, mask: u32) -> u32;
-    #[link_name = "llvm.x86.avx512.mask.ucmp.w.256"]
-    fn vpcmpuw256(a: u16x16, b: u16x16, op: i32, mask: u16) -> u16;
-    #[link_name = "llvm.x86.avx512.mask.ucmp.w.128"]
-    fn vpcmpuw128(a: u16x8, b: u16x8, op: i32, mask: u8) -> u8;
-
-    #[link_name = "llvm.x86.avx512.mask.ucmp.b.512"]
-    fn vpcmpub(a: u8x64, b: u8x64, op: i32, mask: u64) -> u64;
-    #[link_name = "llvm.x86.avx512.mask.ucmp.b.256"]
-    fn vpcmpub256(a: u8x32, b: u8x32, op: i32, mask: u32) -> u32;
-    #[link_name = "llvm.x86.avx512.mask.ucmp.b.128"]
-    fn vpcmpub128(a: u8x16, b: u8x16, op: i32, mask: u16) -> u16;
-
-    #[link_name = "llvm.x86.avx512.mask.cmp.w.512"]
-    fn vpcmpw(a: i16x32, b: i16x32, op: i32, mask: u32) -> u32;
-    #[link_name = "llvm.x86.avx512.mask.cmp.w.256"]
-    fn vpcmpw256(a: i16x16, b: i16x16, op: i32, mask: u16) -> u16;
-    #[link_name = "llvm.x86.avx512.mask.cmp.w.128"]
-    fn vpcmpw128(a: i16x8, b: i16x8, op: i32, mask: u8) -> u8;
-
-    #[link_name = "llvm.x86.avx512.mask.cmp.b.512"]
-    fn vpcmpb(a: i8x64, b: i8x64, op: i32, mask: u64) -> u64;
-    #[link_name = "llvm.x86.avx512.mask.cmp.b.256"]
-    fn vpcmpb256(a: i8x32, b: i8x32, op: i32, mask: u32) -> u32;
-    #[link_name = "llvm.x86.avx512.mask.cmp.b.128"]
-    fn vpcmpb128(a: i8x16, b: i8x16, op: i32, mask: u16) -> u16;
-
-    #[link_name = "llvm.x86.avx512.mask.pmaxu.w.512"]
-    fn vpmaxuw(a: u16x32, b: u16x32) -> u16x32;
-    #[link_name = "llvm.x86.avx512.mask.pmaxu.b.512"]
-    fn vpmaxub(a: u8x64, b: u8x64) -> u8x64;
-    #[link_name = "llvm.x86.avx512.mask.pmaxs.w.512"]
-    fn vpmaxsw(a: i16x32, b: i16x32) -> i16x32;
-    #[link_name = "llvm.x86.avx512.mask.pmaxs.b.512"]
-    fn vpmaxsb(a: i8x64, b: i8x64) -> i8x64;
-
-    #[link_name = "llvm.x86.avx512.mask.pminu.w.512"]
-    fn vpminuw(a: u16x32, b: u16x32) -> u16x32;
-    #[link_name = "llvm.x86.avx512.mask.pminu.b.512"]
-    fn vpminub(a: u8x64, b: u8x64) -> u8x64;
-    #[link_name = "llvm.x86.avx512.mask.pmins.w.512"]
-    fn vpminsw(a: i16x32, b: i16x32) -> i16x32;
-    #[link_name = "llvm.x86.avx512.mask.pmins.b.512"]
-    fn vpminsb(a: i8x64, b: i8x64) -> i8x64;
-
     #[link_name = "llvm.x86.avx512.pmaddw.d.512"]
     fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
     #[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
@@ -10712,11 +11662,6 @@ extern "C" {
     #[link_name = "llvm.x86.avx512.packuswb.512"]
     fn vpackuswb(a: i16x32, b: i16x32) -> u8x64;
 
-    #[link_name = "llvm.x86.avx512.pavg.w.512"]
-    fn vpavgw(a: u16x32, b: u16x32) -> u16x32;
-    #[link_name = "llvm.x86.avx512.pavg.b.512"]
-    fn vpavgb(a: u8x64, b: u8x64) -> u8x64;
-
     #[link_name = "llvm.x86.avx512.psll.w.512"]
     fn vpsllw(a: i16x32, count: i16x8) -> i16x32;
 
@@ -13754,11 +14699,227 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmple_epu8_mask() {
-        let a = _mm512_set1_epi8(-1);
-        let b = _mm512_set1_epi8(-1);
+    unsafe fn test_mm512_mask_cmple_epu8_mask() {
+        let a = _mm512_set1_epi8(-1);
+        let b = _mm512_set1_epi8(-1);
+        let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+        let r = _mm512_mask_cmple_epu8_mask(mask, a, b);
+        assert_eq!(
+            r,
+            0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+        );
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_cmple_epu8_mask() {
+        let a = _mm256_set1_epi8(-1);
+        let b = _mm256_set1_epi8(-1);
+        let m = _mm256_cmple_epu8_mask(a, b);
+        assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_cmple_epu8_mask() {
+        let a = _mm256_set1_epi8(-1);
+        let b = _mm256_set1_epi8(-1);
+        let mask = 0b01010101_01010101_01010101_01010101;
+        let r = _mm256_mask_cmple_epu8_mask(mask, a, b);
+        assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_cmple_epu8_mask() {
+        let a = _mm_set1_epi8(-1);
+        let b = _mm_set1_epi8(-1);
+        let m = _mm_cmple_epu8_mask(a, b);
+        assert_eq!(m, 0b11111111_11111111);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_cmple_epu8_mask() {
+        let a = _mm_set1_epi8(-1);
+        let b = _mm_set1_epi8(-1);
+        let mask = 0b01010101_01010101;
+        let r = _mm_mask_cmple_epu8_mask(mask, a, b);
+        assert_eq!(r, 0b01010101_01010101);
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_cmple_epi16_mask() {
+        let a = _mm512_set1_epi16(-1);
+        let b = _mm512_set1_epi16(-1);
+        let m = _mm512_cmple_epi16_mask(a, b);
+        assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_mask_cmple_epi16_mask() {
+        let a = _mm512_set1_epi16(-1);
+        let b = _mm512_set1_epi16(-1);
+        let mask = 0b01010101_01010101_01010101_01010101;
+        let r = _mm512_mask_cmple_epi16_mask(mask, a, b);
+        assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_cmple_epi16_mask() {
+        let a = _mm256_set1_epi16(-1);
+        let b = _mm256_set1_epi16(-1);
+        let m = _mm256_cmple_epi16_mask(a, b);
+        assert_eq!(m, 0b11111111_11111111);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_cmple_epi16_mask() {
+        let a = _mm256_set1_epi16(-1);
+        let b = _mm256_set1_epi16(-1);
+        let mask = 0b01010101_01010101;
+        let r = _mm256_mask_cmple_epi16_mask(mask, a, b);
+        assert_eq!(r, 0b01010101_01010101);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_cmple_epi16_mask() {
+        let a = _mm_set1_epi16(-1);
+        let b = _mm_set1_epi16(-1);
+        let m = _mm_cmple_epi16_mask(a, b);
+        assert_eq!(m, 0b11111111);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_cmple_epi16_mask() {
+        let a = _mm_set1_epi16(-1);
+        let b = _mm_set1_epi16(-1);
+        let mask = 0b01010101;
+        let r = _mm_mask_cmple_epi16_mask(mask, a, b);
+        assert_eq!(r, 0b01010101);
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_cmple_epi8_mask() {
+        let a = _mm512_set1_epi8(-1);
+        let b = _mm512_set1_epi8(-1);
+        let m = _mm512_cmple_epi8_mask(a, b);
+        assert_eq!(
+            m,
+            0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+        );
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_mask_cmple_epi8_mask() {
+        let a = _mm512_set1_epi8(-1);
+        let b = _mm512_set1_epi8(-1);
+        let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+        let r = _mm512_mask_cmple_epi8_mask(mask, a, b);
+        assert_eq!(
+            r,
+            0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+        );
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_cmple_epi8_mask() {
+        let a = _mm256_set1_epi8(-1);
+        let b = _mm256_set1_epi8(-1);
+        let m = _mm256_cmple_epi8_mask(a, b);
+        assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_cmple_epi8_mask() {
+        let a = _mm256_set1_epi8(-1);
+        let b = _mm256_set1_epi8(-1);
+        let mask = 0b01010101_01010101_01010101_01010101;
+        let r = _mm256_mask_cmple_epi8_mask(mask, a, b);
+        assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_cmple_epi8_mask() {
+        let a = _mm_set1_epi8(-1);
+        let b = _mm_set1_epi8(-1);
+        let m = _mm_cmple_epi8_mask(a, b);
+        assert_eq!(m, 0b11111111_11111111);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_cmple_epi8_mask() {
+        let a = _mm_set1_epi8(-1);
+        let b = _mm_set1_epi8(-1);
+        let mask = 0b01010101_01010101;
+        let r = _mm_mask_cmple_epi8_mask(mask, a, b);
+        assert_eq!(r, 0b01010101_01010101);
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_cmpge_epu16_mask() {
+        let a = _mm512_set1_epi16(1);
+        let b = _mm512_set1_epi16(1);
+        let m = _mm512_cmpge_epu16_mask(a, b);
+        assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_mask_cmpge_epu16_mask() {
+        let a = _mm512_set1_epi16(1);
+        let b = _mm512_set1_epi16(1);
+        let mask = 0b01010101_01010101_01010101_01010101;
+        let r = _mm512_mask_cmpge_epu16_mask(mask, a, b);
+        assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_cmpge_epu16_mask() {
+        let a = _mm256_set1_epi16(1);
+        let b = _mm256_set1_epi16(1);
+        let m = _mm256_cmpge_epu16_mask(a, b);
+        assert_eq!(m, 0b11111111_11111111);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_cmpge_epu16_mask() {
+        let a = _mm256_set1_epi16(1);
+        let b = _mm256_set1_epi16(1);
+        let mask = 0b01010101_01010101;
+        let r = _mm256_mask_cmpge_epu16_mask(mask, a, b);
+        assert_eq!(r, 0b01010101_01010101);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_cmpge_epu16_mask() {
+        let a = _mm_set1_epi16(1);
+        let b = _mm_set1_epi16(1);
+        let m = _mm_cmpge_epu16_mask(a, b);
+        assert_eq!(m, 0b11111111);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_cmpge_epu16_mask() {
+        let a = _mm_set1_epi16(1);
+        let b = _mm_set1_epi16(1);
+        let mask = 0b01010101;
+        let r = _mm_mask_cmpge_epu16_mask(mask, a, b);
+        assert_eq!(r, 0b01010101);
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_cmpge_epu8_mask() {
+        let a = _mm512_set1_epi8(1);
+        let b = _mm512_set1_epi8(1);
+        let m = _mm512_cmpge_epu8_mask(a, b);
+        assert_eq!(
+            m,
+            0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+        );
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_mask_cmpge_epu8_mask() {
+        let a = _mm512_set1_epi8(1);
+        let b = _mm512_set1_epi8(1);
         let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmple_epu8_mask(mask, a, b);
+        let r = _mm512_mask_cmpge_epu8_mask(mask, a, b);
         assert_eq!(
             r,
             0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
@@ -13766,95 +14927,95 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmple_epu8_mask() {
-        let a = _mm256_set1_epi8(-1);
-        let b = _mm256_set1_epi8(-1);
-        let m = _mm256_cmple_epu8_mask(a, b);
+    unsafe fn test_mm256_cmpge_epu8_mask() {
+        let a = _mm256_set1_epi8(1);
+        let b = _mm256_set1_epi8(1);
+        let m = _mm256_cmpge_epu8_mask(a, b);
         assert_eq!(m, 0b11111111_11111111_11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmple_epu8_mask() {
-        let a = _mm256_set1_epi8(-1);
-        let b = _mm256_set1_epi8(-1);
+    unsafe fn test_mm256_mask_cmpge_epu8_mask() {
+        let a = _mm256_set1_epi8(1);
+        let b = _mm256_set1_epi8(1);
         let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm256_mask_cmple_epu8_mask(mask, a, b);
+        let r = _mm256_mask_cmpge_epu8_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101_01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmple_epu8_mask() {
-        let a = _mm_set1_epi8(-1);
-        let b = _mm_set1_epi8(-1);
-        let m = _mm_cmple_epu8_mask(a, b);
+    unsafe fn test_mm_cmpge_epu8_mask() {
+        let a = _mm_set1_epi8(1);
+        let b = _mm_set1_epi8(1);
+        let m = _mm_cmpge_epu8_mask(a, b);
         assert_eq!(m, 0b11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmple_epu8_mask() {
-        let a = _mm_set1_epi8(-1);
-        let b = _mm_set1_epi8(-1);
+    unsafe fn test_mm_mask_cmpge_epu8_mask() {
+        let a = _mm_set1_epi8(1);
+        let b = _mm_set1_epi8(1);
         let mask = 0b01010101_01010101;
-        let r = _mm_mask_cmple_epu8_mask(mask, a, b);
+        let r = _mm_mask_cmpge_epu8_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmple_epi16_mask() {
+    unsafe fn test_mm512_cmpge_epi16_mask() {
         let a = _mm512_set1_epi16(-1);
         let b = _mm512_set1_epi16(-1);
-        let m = _mm512_cmple_epi16_mask(a, b);
+        let m = _mm512_cmpge_epi16_mask(a, b);
         assert_eq!(m, 0b11111111_11111111_11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmple_epi16_mask() {
+    unsafe fn test_mm512_mask_cmpge_epi16_mask() {
         let a = _mm512_set1_epi16(-1);
         let b = _mm512_set1_epi16(-1);
         let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmple_epi16_mask(mask, a, b);
+        let r = _mm512_mask_cmpge_epi16_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101_01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmple_epi16_mask() {
+    unsafe fn test_mm256_cmpge_epi16_mask() {
         let a = _mm256_set1_epi16(-1);
         let b = _mm256_set1_epi16(-1);
-        let m = _mm256_cmple_epi16_mask(a, b);
+        let m = _mm256_cmpge_epi16_mask(a, b);
         assert_eq!(m, 0b11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmple_epi16_mask() {
+    unsafe fn test_mm256_mask_cmpge_epi16_mask() {
         let a = _mm256_set1_epi16(-1);
         let b = _mm256_set1_epi16(-1);
         let mask = 0b01010101_01010101;
-        let r = _mm256_mask_cmple_epi16_mask(mask, a, b);
+        let r = _mm256_mask_cmpge_epi16_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmple_epi16_mask() {
+    unsafe fn test_mm_cmpge_epi16_mask() {
         let a = _mm_set1_epi16(-1);
         let b = _mm_set1_epi16(-1);
-        let m = _mm_cmple_epi16_mask(a, b);
+        let m = _mm_cmpge_epi16_mask(a, b);
         assert_eq!(m, 0b11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmple_epi16_mask() {
+    unsafe fn test_mm_mask_cmpge_epi16_mask() {
         let a = _mm_set1_epi16(-1);
         let b = _mm_set1_epi16(-1);
         let mask = 0b01010101;
-        let r = _mm_mask_cmple_epi16_mask(mask, a, b);
+        let r = _mm_mask_cmpge_epi16_mask(mask, a, b);
         assert_eq!(r, 0b01010101);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmple_epi8_mask() {
+    unsafe fn test_mm512_cmpge_epi8_mask() {
         let a = _mm512_set1_epi8(-1);
         let b = _mm512_set1_epi8(-1);
-        let m = _mm512_cmple_epi8_mask(a, b);
+        let m = _mm512_cmpge_epi8_mask(a, b);
         assert_eq!(
             m,
             0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
@@ -13862,11 +15023,11 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmple_epi8_mask() {
+    unsafe fn test_mm512_mask_cmpge_epi8_mask() {
         let a = _mm512_set1_epi8(-1);
         let b = _mm512_set1_epi8(-1);
         let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmple_epi8_mask(mask, a, b);
+        let r = _mm512_mask_cmpge_epi8_mask(mask, a, b);
         assert_eq!(
             r,
             0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
@@ -13874,95 +15035,95 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmple_epi8_mask() {
+    unsafe fn test_mm256_cmpge_epi8_mask() {
         let a = _mm256_set1_epi8(-1);
         let b = _mm256_set1_epi8(-1);
-        let m = _mm256_cmple_epi8_mask(a, b);
+        let m = _mm256_cmpge_epi8_mask(a, b);
         assert_eq!(m, 0b11111111_11111111_11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmple_epi8_mask() {
+    unsafe fn test_mm256_mask_cmpge_epi8_mask() {
         let a = _mm256_set1_epi8(-1);
         let b = _mm256_set1_epi8(-1);
         let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm256_mask_cmple_epi8_mask(mask, a, b);
+        let r = _mm256_mask_cmpge_epi8_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101_01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmple_epi8_mask() {
+    unsafe fn test_mm_cmpge_epi8_mask() {
         let a = _mm_set1_epi8(-1);
         let b = _mm_set1_epi8(-1);
-        let m = _mm_cmple_epi8_mask(a, b);
+        let m = _mm_cmpge_epi8_mask(a, b);
         assert_eq!(m, 0b11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmple_epi8_mask() {
+    unsafe fn test_mm_mask_cmpge_epi8_mask() {
         let a = _mm_set1_epi8(-1);
         let b = _mm_set1_epi8(-1);
         let mask = 0b01010101_01010101;
-        let r = _mm_mask_cmple_epi8_mask(mask, a, b);
+        let r = _mm_mask_cmpge_epi8_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmpge_epu16_mask() {
+    unsafe fn test_mm512_cmpeq_epu16_mask() {
         let a = _mm512_set1_epi16(1);
         let b = _mm512_set1_epi16(1);
-        let m = _mm512_cmpge_epu16_mask(a, b);
+        let m = _mm512_cmpeq_epu16_mask(a, b);
         assert_eq!(m, 0b11111111_11111111_11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmpge_epu16_mask() {
+    unsafe fn test_mm512_mask_cmpeq_epu16_mask() {
         let a = _mm512_set1_epi16(1);
         let b = _mm512_set1_epi16(1);
         let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmpge_epu16_mask(mask, a, b);
+        let r = _mm512_mask_cmpeq_epu16_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101_01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmpge_epu16_mask() {
+    unsafe fn test_mm256_cmpeq_epu16_mask() {
         let a = _mm256_set1_epi16(1);
         let b = _mm256_set1_epi16(1);
-        let m = _mm256_cmpge_epu16_mask(a, b);
+        let m = _mm256_cmpeq_epu16_mask(a, b);
         assert_eq!(m, 0b11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmpge_epu16_mask() {
+    unsafe fn test_mm256_mask_cmpeq_epu16_mask() {
         let a = _mm256_set1_epi16(1);
         let b = _mm256_set1_epi16(1);
         let mask = 0b01010101_01010101;
-        let r = _mm256_mask_cmpge_epu16_mask(mask, a, b);
+        let r = _mm256_mask_cmpeq_epu16_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmpge_epu16_mask() {
+    unsafe fn test_mm_cmpeq_epu16_mask() {
         let a = _mm_set1_epi16(1);
         let b = _mm_set1_epi16(1);
-        let m = _mm_cmpge_epu16_mask(a, b);
+        let m = _mm_cmpeq_epu16_mask(a, b);
         assert_eq!(m, 0b11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmpge_epu16_mask() {
+    unsafe fn test_mm_mask_cmpeq_epu16_mask() {
         let a = _mm_set1_epi16(1);
         let b = _mm_set1_epi16(1);
         let mask = 0b01010101;
-        let r = _mm_mask_cmpge_epu16_mask(mask, a, b);
+        let r = _mm_mask_cmpeq_epu16_mask(mask, a, b);
         assert_eq!(r, 0b01010101);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmpge_epu8_mask() {
+    unsafe fn test_mm512_cmpeq_epu8_mask() {
         let a = _mm512_set1_epi8(1);
         let b = _mm512_set1_epi8(1);
-        let m = _mm512_cmpge_epu8_mask(a, b);
+        let m = _mm512_cmpeq_epu8_mask(a, b);
         assert_eq!(
             m,
             0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
@@ -13970,11 +15131,11 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmpge_epu8_mask() {
+    unsafe fn test_mm512_mask_cmpeq_epu8_mask() {
         let a = _mm512_set1_epi8(1);
         let b = _mm512_set1_epi8(1);
         let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmpge_epu8_mask(mask, a, b);
+        let r = _mm512_mask_cmpeq_epu8_mask(mask, a, b);
         assert_eq!(
             r,
             0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
@@ -13982,95 +15143,203 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmpge_epu8_mask() {
+    unsafe fn test_mm256_cmpeq_epu8_mask() {
         let a = _mm256_set1_epi8(1);
         let b = _mm256_set1_epi8(1);
-        let m = _mm256_cmpge_epu8_mask(a, b);
+        let m = _mm256_cmpeq_epu8_mask(a, b);
         assert_eq!(m, 0b11111111_11111111_11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmpge_epu8_mask() {
+    unsafe fn test_mm256_mask_cmpeq_epu8_mask() {
         let a = _mm256_set1_epi8(1);
         let b = _mm256_set1_epi8(1);
         let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm256_mask_cmpge_epu8_mask(mask, a, b);
+        let r = _mm256_mask_cmpeq_epu8_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101_01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmpge_epu8_mask() {
+    unsafe fn test_mm_cmpeq_epu8_mask() {
         let a = _mm_set1_epi8(1);
         let b = _mm_set1_epi8(1);
-        let m = _mm_cmpge_epu8_mask(a, b);
+        let m = _mm_cmpeq_epu8_mask(a, b);
         assert_eq!(m, 0b11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmpge_epu8_mask() {
+    unsafe fn test_mm_mask_cmpeq_epu8_mask() {
         let a = _mm_set1_epi8(1);
         let b = _mm_set1_epi8(1);
         let mask = 0b01010101_01010101;
-        let r = _mm_mask_cmpge_epu8_mask(mask, a, b);
+        let r = _mm_mask_cmpeq_epu8_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmpge_epi16_mask() {
+    unsafe fn test_mm512_cmpeq_epi16_mask() {
         let a = _mm512_set1_epi16(-1);
         let b = _mm512_set1_epi16(-1);
-        let m = _mm512_cmpge_epi16_mask(a, b);
+        let m = _mm512_cmpeq_epi16_mask(a, b);
         assert_eq!(m, 0b11111111_11111111_11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmpge_epi16_mask() {
+    unsafe fn test_mm512_mask_cmpeq_epi16_mask() {
         let a = _mm512_set1_epi16(-1);
         let b = _mm512_set1_epi16(-1);
         let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmpge_epi16_mask(mask, a, b);
+        let r = _mm512_mask_cmpeq_epi16_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101_01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmpge_epi16_mask() {
+    unsafe fn test_mm256_cmpeq_epi16_mask() {
         let a = _mm256_set1_epi16(-1);
         let b = _mm256_set1_epi16(-1);
-        let m = _mm256_cmpge_epi16_mask(a, b);
+        let m = _mm256_cmpeq_epi16_mask(a, b);
+        assert_eq!(m, 0b11111111_11111111);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_cmpeq_epi16_mask() {
+        let a = _mm256_set1_epi16(-1);
+        let b = _mm256_set1_epi16(-1);
+        let mask = 0b01010101_01010101;
+        let r = _mm256_mask_cmpeq_epi16_mask(mask, a, b);
+        assert_eq!(r, 0b01010101_01010101);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_cmpeq_epi16_mask() {
+        let a = _mm_set1_epi16(-1);
+        let b = _mm_set1_epi16(-1);
+        let m = _mm_cmpeq_epi16_mask(a, b);
+        assert_eq!(m, 0b11111111);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_cmpeq_epi16_mask() {
+        let a = _mm_set1_epi16(-1);
+        let b = _mm_set1_epi16(-1);
+        let mask = 0b01010101;
+        let r = _mm_mask_cmpeq_epi16_mask(mask, a, b);
+        assert_eq!(r, 0b01010101);
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_cmpeq_epi8_mask() {
+        let a = _mm512_set1_epi8(-1);
+        let b = _mm512_set1_epi8(-1);
+        let m = _mm512_cmpeq_epi8_mask(a, b);
+        assert_eq!(
+            m,
+            0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+        );
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_mask_cmpeq_epi8_mask() {
+        let a = _mm512_set1_epi8(-1);
+        let b = _mm512_set1_epi8(-1);
+        let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
+        let r = _mm512_mask_cmpeq_epi8_mask(mask, a, b);
+        assert_eq!(
+            r,
+            0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+        );
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_cmpeq_epi8_mask() {
+        let a = _mm256_set1_epi8(-1);
+        let b = _mm256_set1_epi8(-1);
+        let m = _mm256_cmpeq_epi8_mask(a, b);
+        assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_cmpeq_epi8_mask() {
+        let a = _mm256_set1_epi8(-1);
+        let b = _mm256_set1_epi8(-1);
+        let mask = 0b01010101_01010101_01010101_01010101;
+        let r = _mm256_mask_cmpeq_epi8_mask(mask, a, b);
+        assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_cmpeq_epi8_mask() {
+        let a = _mm_set1_epi8(-1);
+        let b = _mm_set1_epi8(-1);
+        let m = _mm_cmpeq_epi8_mask(a, b);
+        assert_eq!(m, 0b11111111_11111111);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_cmpeq_epi8_mask() {
+        let a = _mm_set1_epi8(-1);
+        let b = _mm_set1_epi8(-1);
+        let mask = 0b01010101_01010101;
+        let r = _mm_mask_cmpeq_epi8_mask(mask, a, b);
+        assert_eq!(r, 0b01010101_01010101);
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_cmpneq_epu16_mask() {
+        let a = _mm512_set1_epi16(2);
+        let b = _mm512_set1_epi16(1);
+        let m = _mm512_cmpneq_epu16_mask(a, b);
+        assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_mask_cmpneq_epu16_mask() {
+        let a = _mm512_set1_epi16(2);
+        let b = _mm512_set1_epi16(1);
+        let mask = 0b01010101_01010101_01010101_01010101;
+        let r = _mm512_mask_cmpneq_epu16_mask(mask, a, b);
+        assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_cmpneq_epu16_mask() {
+        let a = _mm256_set1_epi16(2);
+        let b = _mm256_set1_epi16(1);
+        let m = _mm256_cmpneq_epu16_mask(a, b);
         assert_eq!(m, 0b11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmpge_epi16_mask() {
-        let a = _mm256_set1_epi16(-1);
-        let b = _mm256_set1_epi16(-1);
+    unsafe fn test_mm256_mask_cmpneq_epu16_mask() {
+        let a = _mm256_set1_epi16(2);
+        let b = _mm256_set1_epi16(1);
         let mask = 0b01010101_01010101;
-        let r = _mm256_mask_cmpge_epi16_mask(mask, a, b);
+        let r = _mm256_mask_cmpneq_epu16_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmpge_epi16_mask() {
-        let a = _mm_set1_epi16(-1);
-        let b = _mm_set1_epi16(-1);
-        let m = _mm_cmpge_epi16_mask(a, b);
+    unsafe fn test_mm_cmpneq_epu16_mask() {
+        let a = _mm_set1_epi16(2);
+        let b = _mm_set1_epi16(1);
+        let m = _mm_cmpneq_epu16_mask(a, b);
         assert_eq!(m, 0b11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmpge_epi16_mask() {
-        let a = _mm_set1_epi16(-1);
-        let b = _mm_set1_epi16(-1);
+    unsafe fn test_mm_mask_cmpneq_epu16_mask() {
+        let a = _mm_set1_epi16(2);
+        let b = _mm_set1_epi16(1);
         let mask = 0b01010101;
-        let r = _mm_mask_cmpge_epi16_mask(mask, a, b);
+        let r = _mm_mask_cmpneq_epu16_mask(mask, a, b);
         assert_eq!(r, 0b01010101);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmpge_epi8_mask() {
-        let a = _mm512_set1_epi8(-1);
-        let b = _mm512_set1_epi8(-1);
-        let m = _mm512_cmpge_epi8_mask(a, b);
+    unsafe fn test_mm512_cmpneq_epu8_mask() {
+        let a = _mm512_set1_epi8(2);
+        let b = _mm512_set1_epi8(1);
+        let m = _mm512_cmpneq_epu8_mask(a, b);
         assert_eq!(
             m,
             0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
@@ -14078,11 +15347,11 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmpge_epi8_mask() {
-        let a = _mm512_set1_epi8(-1);
-        let b = _mm512_set1_epi8(-1);
+    unsafe fn test_mm512_mask_cmpneq_epu8_mask() {
+        let a = _mm512_set1_epi8(2);
+        let b = _mm512_set1_epi8(1);
         let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmpge_epi8_mask(mask, a, b);
+        let r = _mm512_mask_cmpneq_epu8_mask(mask, a, b);
         assert_eq!(
             r,
             0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
@@ -14090,95 +15359,95 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmpge_epi8_mask() {
-        let a = _mm256_set1_epi8(-1);
-        let b = _mm256_set1_epi8(-1);
-        let m = _mm256_cmpge_epi8_mask(a, b);
+    unsafe fn test_mm256_cmpneq_epu8_mask() {
+        let a = _mm256_set1_epi8(2);
+        let b = _mm256_set1_epi8(1);
+        let m = _mm256_cmpneq_epu8_mask(a, b);
         assert_eq!(m, 0b11111111_11111111_11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmpge_epi8_mask() {
-        let a = _mm256_set1_epi8(-1);
-        let b = _mm256_set1_epi8(-1);
+    unsafe fn test_mm256_mask_cmpneq_epu8_mask() {
+        let a = _mm256_set1_epi8(2);
+        let b = _mm256_set1_epi8(1);
         let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm256_mask_cmpge_epi8_mask(mask, a, b);
+        let r = _mm256_mask_cmpneq_epu8_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101_01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmpge_epi8_mask() {
-        let a = _mm_set1_epi8(-1);
-        let b = _mm_set1_epi8(-1);
-        let m = _mm_cmpge_epi8_mask(a, b);
+    unsafe fn test_mm_cmpneq_epu8_mask() {
+        let a = _mm_set1_epi8(2);
+        let b = _mm_set1_epi8(1);
+        let m = _mm_cmpneq_epu8_mask(a, b);
         assert_eq!(m, 0b11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmpge_epi8_mask() {
-        let a = _mm_set1_epi8(-1);
-        let b = _mm_set1_epi8(-1);
+    unsafe fn test_mm_mask_cmpneq_epu8_mask() {
+        let a = _mm_set1_epi8(2);
+        let b = _mm_set1_epi8(1);
         let mask = 0b01010101_01010101;
-        let r = _mm_mask_cmpge_epi8_mask(mask, a, b);
+        let r = _mm_mask_cmpneq_epu8_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmpeq_epu16_mask() {
+    unsafe fn test_mm512_cmpneq_epi16_mask() {
         let a = _mm512_set1_epi16(1);
-        let b = _mm512_set1_epi16(1);
-        let m = _mm512_cmpeq_epu16_mask(a, b);
+        let b = _mm512_set1_epi16(-1);
+        let m = _mm512_cmpneq_epi16_mask(a, b);
         assert_eq!(m, 0b11111111_11111111_11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmpeq_epu16_mask() {
+    unsafe fn test_mm512_mask_cmpneq_epi16_mask() {
         let a = _mm512_set1_epi16(1);
-        let b = _mm512_set1_epi16(1);
+        let b = _mm512_set1_epi16(-1);
         let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmpeq_epu16_mask(mask, a, b);
+        let r = _mm512_mask_cmpneq_epi16_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101_01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmpeq_epu16_mask() {
+    unsafe fn test_mm256_cmpneq_epi16_mask() {
         let a = _mm256_set1_epi16(1);
-        let b = _mm256_set1_epi16(1);
-        let m = _mm256_cmpeq_epu16_mask(a, b);
+        let b = _mm256_set1_epi16(-1);
+        let m = _mm256_cmpneq_epi16_mask(a, b);
         assert_eq!(m, 0b11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmpeq_epu16_mask() {
+    unsafe fn test_mm256_mask_cmpneq_epi16_mask() {
         let a = _mm256_set1_epi16(1);
-        let b = _mm256_set1_epi16(1);
+        let b = _mm256_set1_epi16(-1);
         let mask = 0b01010101_01010101;
-        let r = _mm256_mask_cmpeq_epu16_mask(mask, a, b);
+        let r = _mm256_mask_cmpneq_epi16_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmpeq_epu16_mask() {
+    unsafe fn test_mm_cmpneq_epi16_mask() {
         let a = _mm_set1_epi16(1);
-        let b = _mm_set1_epi16(1);
-        let m = _mm_cmpeq_epu16_mask(a, b);
+        let b = _mm_set1_epi16(-1);
+        let m = _mm_cmpneq_epi16_mask(a, b);
         assert_eq!(m, 0b11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmpeq_epu16_mask() {
+    unsafe fn test_mm_mask_cmpneq_epi16_mask() {
         let a = _mm_set1_epi16(1);
-        let b = _mm_set1_epi16(1);
+        let b = _mm_set1_epi16(-1);
         let mask = 0b01010101;
-        let r = _mm_mask_cmpeq_epu16_mask(mask, a, b);
+        let r = _mm_mask_cmpneq_epi16_mask(mask, a, b);
         assert_eq!(r, 0b01010101);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmpeq_epu8_mask() {
+    unsafe fn test_mm512_cmpneq_epi8_mask() {
         let a = _mm512_set1_epi8(1);
-        let b = _mm512_set1_epi8(1);
-        let m = _mm512_cmpeq_epu8_mask(a, b);
+        let b = _mm512_set1_epi8(-1);
+        let m = _mm512_cmpneq_epi8_mask(a, b);
         assert_eq!(
             m,
             0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
@@ -14186,11 +15455,11 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmpeq_epu8_mask() {
+    unsafe fn test_mm512_mask_cmpneq_epi8_mask() {
         let a = _mm512_set1_epi8(1);
-        let b = _mm512_set1_epi8(1);
+        let b = _mm512_set1_epi8(-1);
         let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmpeq_epu8_mask(mask, a, b);
+        let r = _mm512_mask_cmpneq_epi8_mask(mask, a, b);
         assert_eq!(
             r,
             0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
@@ -14198,95 +15467,95 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmpeq_epu8_mask() {
+    unsafe fn test_mm256_cmpneq_epi8_mask() {
         let a = _mm256_set1_epi8(1);
-        let b = _mm256_set1_epi8(1);
-        let m = _mm256_cmpeq_epu8_mask(a, b);
+        let b = _mm256_set1_epi8(-1);
+        let m = _mm256_cmpneq_epi8_mask(a, b);
         assert_eq!(m, 0b11111111_11111111_11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmpeq_epu8_mask() {
+    unsafe fn test_mm256_mask_cmpneq_epi8_mask() {
         let a = _mm256_set1_epi8(1);
-        let b = _mm256_set1_epi8(1);
+        let b = _mm256_set1_epi8(-1);
         let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm256_mask_cmpeq_epu8_mask(mask, a, b);
+        let r = _mm256_mask_cmpneq_epi8_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101_01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmpeq_epu8_mask() {
+    unsafe fn test_mm_cmpneq_epi8_mask() {
         let a = _mm_set1_epi8(1);
-        let b = _mm_set1_epi8(1);
-        let m = _mm_cmpeq_epu8_mask(a, b);
+        let b = _mm_set1_epi8(-1);
+        let m = _mm_cmpneq_epi8_mask(a, b);
         assert_eq!(m, 0b11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmpeq_epu8_mask() {
+    unsafe fn test_mm_mask_cmpneq_epi8_mask() {
         let a = _mm_set1_epi8(1);
-        let b = _mm_set1_epi8(1);
+        let b = _mm_set1_epi8(-1);
         let mask = 0b01010101_01010101;
-        let r = _mm_mask_cmpeq_epu8_mask(mask, a, b);
+        let r = _mm_mask_cmpneq_epi8_mask(mask, a, b);
         assert_eq!(r, 0b01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmpeq_epi16_mask() {
-        let a = _mm512_set1_epi16(-1);
-        let b = _mm512_set1_epi16(-1);
-        let m = _mm512_cmpeq_epi16_mask(a, b);
+    unsafe fn test_mm512_cmp_epu16_mask() {
+        let a = _mm512_set1_epi16(0);
+        let b = _mm512_set1_epi16(1);
+        let m = _mm512_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b);
         assert_eq!(m, 0b11111111_11111111_11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmpeq_epi16_mask() {
-        let a = _mm512_set1_epi16(-1);
-        let b = _mm512_set1_epi16(-1);
+    unsafe fn test_mm512_mask_cmp_epu16_mask() {
+        let a = _mm512_set1_epi16(0);
+        let b = _mm512_set1_epi16(1);
         let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmpeq_epi16_mask(mask, a, b);
+        let r = _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b);
         assert_eq!(r, 0b01010101_01010101_01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmpeq_epi16_mask() {
-        let a = _mm256_set1_epi16(-1);
-        let b = _mm256_set1_epi16(-1);
-        let m = _mm256_cmpeq_epi16_mask(a, b);
+    unsafe fn test_mm256_cmp_epu16_mask() {
+        let a = _mm256_set1_epi16(0);
+        let b = _mm256_set1_epi16(1);
+        let m = _mm256_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b);
         assert_eq!(m, 0b11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmpeq_epi16_mask() {
-        let a = _mm256_set1_epi16(-1);
-        let b = _mm256_set1_epi16(-1);
+    unsafe fn test_mm256_mask_cmp_epu16_mask() {
+        let a = _mm256_set1_epi16(0);
+        let b = _mm256_set1_epi16(1);
         let mask = 0b01010101_01010101;
-        let r = _mm256_mask_cmpeq_epi16_mask(mask, a, b);
+        let r = _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b);
         assert_eq!(r, 0b01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmpeq_epi16_mask() {
-        let a = _mm_set1_epi16(-1);
-        let b = _mm_set1_epi16(-1);
-        let m = _mm_cmpeq_epi16_mask(a, b);
+    unsafe fn test_mm_cmp_epu16_mask() {
+        let a = _mm_set1_epi16(0);
+        let b = _mm_set1_epi16(1);
+        let m = _mm_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b);
         assert_eq!(m, 0b11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmpeq_epi16_mask() {
-        let a = _mm_set1_epi16(-1);
-        let b = _mm_set1_epi16(-1);
+    unsafe fn test_mm_mask_cmp_epu16_mask() {
+        let a = _mm_set1_epi16(0);
+        let b = _mm_set1_epi16(1);
         let mask = 0b01010101;
-        let r = _mm_mask_cmpeq_epi16_mask(mask, a, b);
+        let r = _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b);
         assert_eq!(r, 0b01010101);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmpeq_epi8_mask() {
-        let a = _mm512_set1_epi8(-1);
-        let b = _mm512_set1_epi8(-1);
-        let m = _mm512_cmpeq_epi8_mask(a, b);
+    unsafe fn test_mm512_cmp_epu8_mask() {
+        let a = _mm512_set1_epi8(0);
+        let b = _mm512_set1_epi8(1);
+        let m = _mm512_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b);
         assert_eq!(
             m,
             0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
@@ -14294,11 +15563,11 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmpeq_epi8_mask() {
-        let a = _mm512_set1_epi8(-1);
-        let b = _mm512_set1_epi8(-1);
+    unsafe fn test_mm512_mask_cmp_epu8_mask() {
+        let a = _mm512_set1_epi8(0);
+        let b = _mm512_set1_epi8(1);
         let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmpeq_epi8_mask(mask, a, b);
+        let r = _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b);
         assert_eq!(
             r,
             0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
@@ -14306,95 +15575,95 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmpeq_epi8_mask() {
-        let a = _mm256_set1_epi8(-1);
-        let b = _mm256_set1_epi8(-1);
-        let m = _mm256_cmpeq_epi8_mask(a, b);
+    unsafe fn test_mm256_cmp_epu8_mask() {
+        let a = _mm256_set1_epi8(0);
+        let b = _mm256_set1_epi8(1);
+        let m = _mm256_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b);
         assert_eq!(m, 0b11111111_11111111_11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmpeq_epi8_mask() {
-        let a = _mm256_set1_epi8(-1);
-        let b = _mm256_set1_epi8(-1);
+    unsafe fn test_mm256_mask_cmp_epu8_mask() {
+        let a = _mm256_set1_epi8(0);
+        let b = _mm256_set1_epi8(1);
         let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm256_mask_cmpeq_epi8_mask(mask, a, b);
+        let r = _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b);
         assert_eq!(r, 0b01010101_01010101_01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmpeq_epi8_mask() {
-        let a = _mm_set1_epi8(-1);
-        let b = _mm_set1_epi8(-1);
-        let m = _mm_cmpeq_epi8_mask(a, b);
+    unsafe fn test_mm_cmp_epu8_mask() {
+        let a = _mm_set1_epi8(0);
+        let b = _mm_set1_epi8(1);
+        let m = _mm_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b);
         assert_eq!(m, 0b11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmpeq_epi8_mask() {
-        let a = _mm_set1_epi8(-1);
-        let b = _mm_set1_epi8(-1);
+    unsafe fn test_mm_mask_cmp_epu8_mask() {
+        let a = _mm_set1_epi8(0);
+        let b = _mm_set1_epi8(1);
         let mask = 0b01010101_01010101;
-        let r = _mm_mask_cmpeq_epi8_mask(mask, a, b);
+        let r = _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b);
         assert_eq!(r, 0b01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmpneq_epu16_mask() {
-        let a = _mm512_set1_epi16(2);
+    unsafe fn test_mm512_cmp_epi16_mask() {
+        let a = _mm512_set1_epi16(0);
         let b = _mm512_set1_epi16(1);
-        let m = _mm512_cmpneq_epu16_mask(a, b);
+        let m = _mm512_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b);
         assert_eq!(m, 0b11111111_11111111_11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmpneq_epu16_mask() {
-        let a = _mm512_set1_epi16(2);
+    unsafe fn test_mm512_mask_cmp_epi16_mask() {
+        let a = _mm512_set1_epi16(0);
         let b = _mm512_set1_epi16(1);
         let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmpneq_epu16_mask(mask, a, b);
+        let r = _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b);
         assert_eq!(r, 0b01010101_01010101_01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmpneq_epu16_mask() {
-        let a = _mm256_set1_epi16(2);
+    unsafe fn test_mm256_cmp_epi16_mask() {
+        let a = _mm256_set1_epi16(0);
         let b = _mm256_set1_epi16(1);
-        let m = _mm256_cmpneq_epu16_mask(a, b);
+        let m = _mm256_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b);
         assert_eq!(m, 0b11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmpneq_epu16_mask() {
-        let a = _mm256_set1_epi16(2);
+    unsafe fn test_mm256_mask_cmp_epi16_mask() {
+        let a = _mm256_set1_epi16(0);
         let b = _mm256_set1_epi16(1);
         let mask = 0b01010101_01010101;
-        let r = _mm256_mask_cmpneq_epu16_mask(mask, a, b);
+        let r = _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b);
         assert_eq!(r, 0b01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmpneq_epu16_mask() {
-        let a = _mm_set1_epi16(2);
+    unsafe fn test_mm_cmp_epi16_mask() {
+        let a = _mm_set1_epi16(0);
         let b = _mm_set1_epi16(1);
-        let m = _mm_cmpneq_epu16_mask(a, b);
+        let m = _mm_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b);
         assert_eq!(m, 0b11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmpneq_epu16_mask() {
-        let a = _mm_set1_epi16(2);
+    unsafe fn test_mm_mask_cmp_epi16_mask() {
+        let a = _mm_set1_epi16(0);
         let b = _mm_set1_epi16(1);
         let mask = 0b01010101;
-        let r = _mm_mask_cmpneq_epu16_mask(mask, a, b);
+        let r = _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b);
         assert_eq!(r, 0b01010101);
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmpneq_epu8_mask() {
-        let a = _mm512_set1_epi8(2);
+    unsafe fn test_mm512_cmp_epi8_mask() {
+        let a = _mm512_set1_epi8(0);
         let b = _mm512_set1_epi8(1);
-        let m = _mm512_cmpneq_epu8_mask(a, b);
+        let m = _mm512_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b);
         assert_eq!(
             m,
             0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
@@ -14402,11 +15671,11 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmpneq_epu8_mask() {
-        let a = _mm512_set1_epi8(2);
+    unsafe fn test_mm512_mask_cmp_epi8_mask() {
+        let a = _mm512_set1_epi8(0);
         let b = _mm512_set1_epi8(1);
         let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmpneq_epu8_mask(mask, a, b);
+        let r = _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b);
         assert_eq!(
             r,
             0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
@@ -14414,361 +15683,527 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmpneq_epu8_mask() {
-        let a = _mm256_set1_epi8(2);
+    unsafe fn test_mm256_cmp_epi8_mask() {
+        let a = _mm256_set1_epi8(0);
         let b = _mm256_set1_epi8(1);
-        let m = _mm256_cmpneq_epu8_mask(a, b);
+        let m = _mm256_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b);
         assert_eq!(m, 0b11111111_11111111_11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmpneq_epu8_mask() {
-        let a = _mm256_set1_epi8(2);
+    unsafe fn test_mm256_mask_cmp_epi8_mask() {
+        let a = _mm256_set1_epi8(0);
         let b = _mm256_set1_epi8(1);
         let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm256_mask_cmpneq_epu8_mask(mask, a, b);
+        let r = _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b);
         assert_eq!(r, 0b01010101_01010101_01010101_01010101);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmpneq_epu8_mask() {
-        let a = _mm_set1_epi8(2);
+    unsafe fn test_mm_cmp_epi8_mask() {
+        let a = _mm_set1_epi8(0);
         let b = _mm_set1_epi8(1);
-        let m = _mm_cmpneq_epu8_mask(a, b);
+        let m = _mm_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b);
         assert_eq!(m, 0b11111111_11111111);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmpneq_epu8_mask() {
-        let a = _mm_set1_epi8(2);
+    unsafe fn test_mm_mask_cmp_epi8_mask() {
+        let a = _mm_set1_epi8(0);
         let b = _mm_set1_epi8(1);
         let mask = 0b01010101_01010101;
-        let r = _mm_mask_cmpneq_epu8_mask(mask, a, b);
+        let r = _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b);
         assert_eq!(r, 0b01010101_01010101);
     }
 
-    #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmpneq_epi16_mask() {
-        let a = _mm512_set1_epi16(1);
-        let b = _mm512_set1_epi16(-1);
-        let m = _mm512_cmpneq_epi16_mask(a, b);
-        assert_eq!(m, 0b11111111_11111111_11111111_11111111);
-    }
-
-    #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmpneq_epi16_mask() {
-        let a = _mm512_set1_epi16(1);
-        let b = _mm512_set1_epi16(-1);
-        let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmpneq_epi16_mask(mask, a, b);
-        assert_eq!(r, 0b01010101_01010101_01010101_01010101);
-    }
-
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmpneq_epi16_mask() {
+    unsafe fn test_mm256_reduce_add_epi16() {
         let a = _mm256_set1_epi16(1);
-        let b = _mm256_set1_epi16(-1);
-        let m = _mm256_cmpneq_epi16_mask(a, b);
-        assert_eq!(m, 0b11111111_11111111);
+        let e = _mm256_reduce_add_epi16(a);
+        assert_eq!(16, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmpneq_epi16_mask() {
+    unsafe fn test_mm256_mask_reduce_add_epi16() {
         let a = _mm256_set1_epi16(1);
-        let b = _mm256_set1_epi16(-1);
-        let mask = 0b01010101_01010101;
-        let r = _mm256_mask_cmpneq_epi16_mask(mask, a, b);
-        assert_eq!(r, 0b01010101_01010101);
+        let e = _mm256_mask_reduce_add_epi16(0b11111111_00000000, a);
+        assert_eq!(8, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmpneq_epi16_mask() {
+    unsafe fn test_mm_reduce_add_epi16() {
         let a = _mm_set1_epi16(1);
-        let b = _mm_set1_epi16(-1);
-        let m = _mm_cmpneq_epi16_mask(a, b);
-        assert_eq!(m, 0b11111111);
+        let e = _mm_reduce_add_epi16(a);
+        assert_eq!(8, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmpneq_epi16_mask() {
+    unsafe fn test_mm_mask_reduce_add_epi16() {
         let a = _mm_set1_epi16(1);
-        let b = _mm_set1_epi16(-1);
-        let mask = 0b01010101;
-        let r = _mm_mask_cmpneq_epi16_mask(mask, a, b);
-        assert_eq!(r, 0b01010101);
+        let e = _mm_mask_reduce_add_epi16(0b11110000, a);
+        assert_eq!(4, e);
     }
 
-    #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmpneq_epi8_mask() {
-        let a = _mm512_set1_epi8(1);
-        let b = _mm512_set1_epi8(-1);
-        let m = _mm512_cmpneq_epi8_mask(a, b);
-        assert_eq!(
-            m,
-            0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_reduce_add_epi8() {
+        let a = _mm256_set1_epi8(1);
+        let e = _mm256_reduce_add_epi8(a);
+        assert_eq!(32, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_reduce_add_epi8() {
+        let a = _mm256_set1_epi8(1);
+        let e = _mm256_mask_reduce_add_epi8(0b11111111_00000000_11111111_00000000, a);
+        assert_eq!(16, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_reduce_add_epi8() {
+        let a = _mm_set1_epi8(1);
+        let e = _mm_reduce_add_epi8(a);
+        assert_eq!(16, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_reduce_add_epi8() {
+        let a = _mm_set1_epi8(1);
+        let e = _mm_mask_reduce_add_epi8(0b11111111_00000000, a);
+        assert_eq!(8, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_reduce_and_epi16() {
+        let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+        let e = _mm256_reduce_and_epi16(a);
+        assert_eq!(0, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_reduce_and_epi16() {
+        let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+        let e = _mm256_mask_reduce_and_epi16(0b11111111_00000000, a);
+        assert_eq!(1, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_reduce_and_epi16() {
+        let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2);
+        let e = _mm_reduce_and_epi16(a);
+        assert_eq!(0, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_reduce_and_epi16() {
+        let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2);
+        let e = _mm_mask_reduce_and_epi16(0b11110000, a);
+        assert_eq!(1, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_reduce_and_epi8() {
+        let a = _mm256_set_epi8(
+            1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
+            2, 2, 2,
         );
+        let e = _mm256_reduce_and_epi8(a);
+        assert_eq!(0, e);
     }
 
-    #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmpneq_epi8_mask() {
-        let a = _mm512_set1_epi8(1);
-        let b = _mm512_set1_epi8(-1);
-        let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmpneq_epi8_mask(mask, a, b);
-        assert_eq!(
-            r,
-            0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_reduce_and_epi8() {
+        let a = _mm256_set_epi8(
+            1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
+            2, 2, 2,
         );
+        let e = _mm256_mask_reduce_and_epi8(0b11111111_00000000_11111111_00000000, a);
+        assert_eq!(1, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmpneq_epi8_mask() {
-        let a = _mm256_set1_epi8(1);
-        let b = _mm256_set1_epi8(-1);
-        let m = _mm256_cmpneq_epi8_mask(a, b);
-        assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+    unsafe fn test_mm_reduce_and_epi8() {
+        let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+        let e = _mm_reduce_and_epi8(a);
+        assert_eq!(0, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmpneq_epi8_mask() {
-        let a = _mm256_set1_epi8(1);
-        let b = _mm256_set1_epi8(-1);
-        let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm256_mask_cmpneq_epi8_mask(mask, a, b);
-        assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+    unsafe fn test_mm_mask_reduce_and_epi8() {
+        let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+        let e = _mm_mask_reduce_and_epi8(0b11111111_00000000, a);
+        assert_eq!(1, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmpneq_epi8_mask() {
-        let a = _mm_set1_epi8(1);
-        let b = _mm_set1_epi8(-1);
-        let m = _mm_cmpneq_epi8_mask(a, b);
-        assert_eq!(m, 0b11111111_11111111);
+    unsafe fn test_mm256_reduce_mul_epi16() {
+        let a = _mm256_set_epi16(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1);
+        let e = _mm256_reduce_mul_epi16(a);
+        assert_eq!(256, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmpneq_epi8_mask() {
-        let a = _mm_set1_epi8(1);
-        let b = _mm_set1_epi8(-1);
-        let mask = 0b01010101_01010101;
-        let r = _mm_mask_cmpneq_epi8_mask(mask, a, b);
-        assert_eq!(r, 0b01010101_01010101);
+    unsafe fn test_mm256_mask_reduce_mul_epi16() {
+        let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+        let e = _mm256_mask_reduce_mul_epi16(0b11111111_00000000, a);
+        assert_eq!(1, e);
     }
 
-    #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmp_epu16_mask() {
-        let a = _mm512_set1_epi16(0);
-        let b = _mm512_set1_epi16(1);
-        let m = _mm512_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b);
-        assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_reduce_mul_epi16() {
+        let a = _mm_set_epi16(2, 2, 2, 2, 1, 1, 1, 1);
+        let e = _mm_reduce_mul_epi16(a);
+        assert_eq!(16, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_reduce_mul_epi16() {
+        let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2);
+        let e = _mm_mask_reduce_mul_epi16(0b11110000, a);
+        assert_eq!(1, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_reduce_mul_epi8() {
+        let a = _mm256_set_epi8(
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+            2, 2, 2,
+        );
+        let e = _mm256_reduce_mul_epi8(a);
+        assert_eq!(64, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_reduce_mul_epi8() {
+        let a = _mm256_set_epi8(
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+            2, 2, 2,
+        );
+        let e = _mm256_mask_reduce_mul_epi8(0b11111111_00000000_11111111_00000000, a);
+        assert_eq!(1, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_reduce_mul_epi8() {
+        let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2);
+        let e = _mm_reduce_mul_epi8(a);
+        assert_eq!(8, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_reduce_mul_epi8() {
+        let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2);
+        let e = _mm_mask_reduce_mul_epi8(0b11111111_00000000, a);
+        assert_eq!(1, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_reduce_max_epi16() {
+        let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: i16 = _mm256_reduce_max_epi16(a);
+        assert_eq!(15, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_reduce_max_epi16() {
+        let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: i16 = _mm256_mask_reduce_max_epi16(0b11111111_00000000, a);
+        assert_eq!(7, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_reduce_max_epi16() {
+        let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: i16 = _mm_reduce_max_epi16(a);
+        assert_eq!(7, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_reduce_max_epi16() {
+        let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: i16 = _mm_mask_reduce_max_epi16(0b11110000, a);
+        assert_eq!(3, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_reduce_max_epi8() {
+        let a = _mm256_set_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        let e: i8 = _mm256_reduce_max_epi8(a);
+        assert_eq!(31, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_reduce_max_epi8() {
+        let a = _mm256_set_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        let e: i8 = _mm256_mask_reduce_max_epi8(0b1111111111111111_0000000000000000, a);
+        assert_eq!(15, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_reduce_max_epi8() {
+        let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: i8 = _mm_reduce_max_epi8(a);
+        assert_eq!(15, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_reduce_max_epi8() {
+        let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: i8 = _mm_mask_reduce_max_epi8(0b11111111_00000000, a);
+        assert_eq!(7, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_reduce_max_epu16() {
+        let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: u16 = _mm256_reduce_max_epu16(a);
+        assert_eq!(15, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_reduce_max_epu16() {
+        let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: u16 = _mm256_mask_reduce_max_epu16(0b11111111_00000000, a);
+        assert_eq!(7, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_reduce_max_epu16() {
+        let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: u16 = _mm_reduce_max_epu16(a);
+        assert_eq!(7, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_reduce_max_epu16() {
+        let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: u16 = _mm_mask_reduce_max_epu16(0b11110000, a);
+        assert_eq!(3, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_reduce_max_epu8() {
+        let a = _mm256_set_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        let e: u8 = _mm256_reduce_max_epu8(a);
+        assert_eq!(31, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_reduce_max_epu8() {
+        let a = _mm256_set_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        let e: u8 = _mm256_mask_reduce_max_epu8(0b1111111111111111_0000000000000000, a);
+        assert_eq!(15, e);
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_reduce_max_epu8() {
+        let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: u8 = _mm_reduce_max_epu8(a);
+        assert_eq!(15, e);
     }
 
-    #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmp_epu16_mask() {
-        let a = _mm512_set1_epi16(0);
-        let b = _mm512_set1_epi16(1);
-        let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b);
-        assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_reduce_max_epu8() {
+        let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: u8 = _mm_mask_reduce_max_epu8(0b11111111_00000000, a);
+        assert_eq!(7, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmp_epu16_mask() {
-        let a = _mm256_set1_epi16(0);
-        let b = _mm256_set1_epi16(1);
-        let m = _mm256_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b);
-        assert_eq!(m, 0b11111111_11111111);
+    unsafe fn test_mm256_reduce_min_epi16() {
+        let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: i16 = _mm256_reduce_min_epi16(a);
+        assert_eq!(0, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmp_epu16_mask() {
-        let a = _mm256_set1_epi16(0);
-        let b = _mm256_set1_epi16(1);
-        let mask = 0b01010101_01010101;
-        let r = _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b);
-        assert_eq!(r, 0b01010101_01010101);
+    unsafe fn test_mm256_mask_reduce_min_epi16() {
+        let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: i16 = _mm256_mask_reduce_min_epi16(0b11111111_00000000, a);
+        assert_eq!(0, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmp_epu16_mask() {
-        let a = _mm_set1_epi16(0);
-        let b = _mm_set1_epi16(1);
-        let m = _mm_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b);
-        assert_eq!(m, 0b11111111);
+    unsafe fn test_mm_reduce_min_epi16() {
+        let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: i16 = _mm_reduce_min_epi16(a);
+        assert_eq!(0, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmp_epu16_mask() {
-        let a = _mm_set1_epi16(0);
-        let b = _mm_set1_epi16(1);
-        let mask = 0b01010101;
-        let r = _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b);
-        assert_eq!(r, 0b01010101);
+    unsafe fn test_mm_mask_reduce_min_epi16() {
+        let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: i16 = _mm_mask_reduce_min_epi16(0b11110000, a);
+        assert_eq!(0, e);
     }
 
-    #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmp_epu8_mask() {
-        let a = _mm512_set1_epi8(0);
-        let b = _mm512_set1_epi8(1);
-        let m = _mm512_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b);
-        assert_eq!(
-            m,
-            0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_reduce_min_epi8() {
+        let a = _mm256_set_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31,
         );
+        let e: i8 = _mm256_reduce_min_epi8(a);
+        assert_eq!(0, e);
     }
 
-    #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmp_epu8_mask() {
-        let a = _mm512_set1_epi8(0);
-        let b = _mm512_set1_epi8(1);
-        let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b);
-        assert_eq!(
-            r,
-            0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_reduce_min_epi8() {
+        let a = _mm256_set_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31,
         );
+        let e: i8 = _mm256_mask_reduce_min_epi8(0b1111111111111111_0000000000000000, a);
+        assert_eq!(0, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmp_epu8_mask() {
-        let a = _mm256_set1_epi8(0);
-        let b = _mm256_set1_epi8(1);
-        let m = _mm256_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b);
-        assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+    unsafe fn test_mm_reduce_min_epi8() {
+        let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: i8 = _mm_reduce_min_epi8(a);
+        assert_eq!(0, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmp_epu8_mask() {
-        let a = _mm256_set1_epi8(0);
-        let b = _mm256_set1_epi8(1);
-        let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b);
-        assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+    unsafe fn test_mm_mask_reduce_min_epi8() {
+        let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: i8 = _mm_mask_reduce_min_epi8(0b11111111_00000000, a);
+        assert_eq!(0, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmp_epu8_mask() {
-        let a = _mm_set1_epi8(0);
-        let b = _mm_set1_epi8(1);
-        let m = _mm_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b);
-        assert_eq!(m, 0b11111111_11111111);
+    unsafe fn test_mm256_reduce_min_epu16() {
+        let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: u16 = _mm256_reduce_min_epu16(a);
+        assert_eq!(0, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmp_epu8_mask() {
-        let a = _mm_set1_epi8(0);
-        let b = _mm_set1_epi8(1);
-        let mask = 0b01010101_01010101;
-        let r = _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b);
-        assert_eq!(r, 0b01010101_01010101);
+    unsafe fn test_mm256_mask_reduce_min_epu16() {
+        let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: u16 = _mm256_mask_reduce_min_epu16(0b11111111_00000000, a);
+        assert_eq!(0, e);
     }
 
-    #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmp_epi16_mask() {
-        let a = _mm512_set1_epi16(0);
-        let b = _mm512_set1_epi16(1);
-        let m = _mm512_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b);
-        assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_reduce_min_epu16() {
+        let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: u16 = _mm_reduce_min_epu16(a);
+        assert_eq!(0, e);
     }
 
-    #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmp_epi16_mask() {
-        let a = _mm512_set1_epi16(0);
-        let b = _mm512_set1_epi16(1);
-        let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b);
-        assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_reduce_min_epu16() {
+        let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        let e: u16 = _mm_mask_reduce_min_epu16(0b11110000, a);
+        assert_eq!(0, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmp_epi16_mask() {
-        let a = _mm256_set1_epi16(0);
-        let b = _mm256_set1_epi16(1);
-        let m = _mm256_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b);
-        assert_eq!(m, 0b11111111_11111111);
+    unsafe fn test_mm256_reduce_min_epu8() {
+        let a = _mm256_set_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        let e: u8 = _mm256_reduce_min_epu8(a);
+        assert_eq!(0, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmp_epi16_mask() {
-        let a = _mm256_set1_epi16(0);
-        let b = _mm256_set1_epi16(1);
-        let mask = 0b01010101_01010101;
-        let r = _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b);
-        assert_eq!(r, 0b01010101_01010101);
+    unsafe fn test_mm256_mask_reduce_min_epu8() {
+        let a = _mm256_set_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31,
+        );
+        let e: u8 = _mm256_mask_reduce_min_epu8(0b1111111111111111_0000000000000000, a);
+        assert_eq!(0, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmp_epi16_mask() {
-        let a = _mm_set1_epi16(0);
-        let b = _mm_set1_epi16(1);
-        let m = _mm_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b);
-        assert_eq!(m, 0b11111111);
+    unsafe fn test_mm_reduce_min_epu8() {
+        let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: u8 = _mm_reduce_min_epu8(a);
+        assert_eq!(0, e);
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmp_epi16_mask() {
-        let a = _mm_set1_epi16(0);
-        let b = _mm_set1_epi16(1);
-        let mask = 0b01010101;
-        let r = _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b);
-        assert_eq!(r, 0b01010101);
+    unsafe fn test_mm_mask_reduce_min_epu8() {
+        let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let e: u8 = _mm_mask_reduce_min_epu8(0b11111111_00000000, a);
+        assert_eq!(0, e); // only the high 8 mask bits are set; expects minimum 0
     }
 
-    #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_cmp_epi8_mask() {
-        let a = _mm512_set1_epi8(0);
-        let b = _mm512_set1_epi8(1);
-        let m = _mm512_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b);
-        assert_eq!(
-            m,
-            0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
-        );
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_reduce_or_epi16() {
+        let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+        let e = _mm256_reduce_or_epi16(a);
+        assert_eq!(3, e); // 1 | 2 == 3 across all sixteen words
     }
 
-    #[simd_test(enable = "avx512bw")]
-    unsafe fn test_mm512_mask_cmp_epi8_mask() {
-        let a = _mm512_set1_epi8(0);
-        let b = _mm512_set1_epi8(1);
-        let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101;
-        let r = _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b);
-        assert_eq!(
-            r,
-            0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101
-        );
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm256_mask_reduce_or_epi16() {
+        let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+        let e = _mm256_mask_reduce_or_epi16(0b11111111_00000000, a);
+        assert_eq!(1, e); // high 8 mask bits set; expects only the 1-valued words ORed
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_cmp_epi8_mask() {
-        let a = _mm256_set1_epi8(0);
-        let b = _mm256_set1_epi8(1);
-        let m = _mm256_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b);
-        assert_eq!(m, 0b11111111_11111111_11111111_11111111);
+    unsafe fn test_mm_reduce_or_epi16() {
+        let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2);
+        let e = _mm_reduce_or_epi16(a);
+        assert_eq!(3, e); // 1 | 2 == 3 across all eight words
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm256_mask_cmp_epi8_mask() {
-        let a = _mm256_set1_epi8(0);
-        let b = _mm256_set1_epi8(1);
-        let mask = 0b01010101_01010101_01010101_01010101;
-        let r = _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b);
-        assert_eq!(r, 0b01010101_01010101_01010101_01010101);
+    unsafe fn test_mm_mask_reduce_or_epi16() {
+        let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2);
+        let e = _mm_mask_reduce_or_epi16(0b11110000, a);
+        assert_eq!(1, e); // high 4 mask bits set; expects only the 1-valued words ORed
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_cmp_epi8_mask() {
-        let a = _mm_set1_epi8(0);
-        let b = _mm_set1_epi8(1);
-        let m = _mm_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b);
-        assert_eq!(m, 0b11111111_11111111);
+    unsafe fn test_mm256_reduce_or_epi8() {
+        let a = _mm256_set_epi8(
+            1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
+            2, 2, 2,
+        );
+        let e = _mm256_reduce_or_epi8(a);
+        assert_eq!(3, e); // 1 | 2 == 3 across all thirty-two bytes
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    unsafe fn test_mm_mask_cmp_epi8_mask() {
-        let a = _mm_set1_epi8(0);
-        let b = _mm_set1_epi8(1);
-        let mask = 0b01010101_01010101;
-        let r = _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b);
-        assert_eq!(r, 0b01010101_01010101);
+    unsafe fn test_mm256_mask_reduce_or_epi8() {
+        let a = _mm256_set_epi8(
+            1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
+            2, 2, 2,
+        );
+        let e = _mm256_mask_reduce_or_epi8(0b11111111_00000000_11111111_00000000, a);
+        assert_eq!(1, e); // mask bits 8..=15 and 24..=31 set; expects only 1s ORed
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_reduce_or_epi8() {
+        let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+        let e = _mm_reduce_or_epi8(a);
+        assert_eq!(3, e); // 1 | 2 == 3 across all sixteen bytes
+    }
+
+    #[simd_test(enable = "avx512bw,avx512vl")]
+    unsafe fn test_mm_mask_reduce_or_epi8() {
+        let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
+        let e = _mm_mask_reduce_or_epi8(0b11111111_00000000, a);
+        assert_eq!(1, e); // high 8 mask bits set; expects only the 1-valued bytes ORed
     }
 
     #[simd_test(enable = "avx512bw")]
@@ -18679,6 +20114,22 @@ mod tests {
         assert_eq_m128i(r, e);
     }
 
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_cvtmask32_u32() {
+        let a: __mmask32 = 0b11001100_00110011_01100110_10011001;
+        let r = _cvtmask32_u32(a);
+        let e: u32 = 0b11001100_00110011_01100110_10011001;
+        assert_eq!(r, e); // e is the same bit pattern as a
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_cvtu32_mask32() {
+        let a: u32 = 0b11001100_00110011_01100110_10011001;
+        let r = _cvtu32_mask32(a);
+        let e: __mmask32 = 0b11001100_00110011_01100110_10011001;
+        assert_eq!(r, e); // e is the same bit pattern as a
+    }
+
     #[simd_test(enable = "avx512bw")]
     unsafe fn test_kadd_mask32() {
         let a: __mmask32 = 11;
@@ -18820,6 +20271,160 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_kortest_mask32_u8() {
+        let a: __mmask32 = 0b0110100101101001_0110100101101001;
+        let b: __mmask32 = 0b1011011010110110_1011011010110110;
+        let mut all_ones: u8 = 0;
+        let r = _kortest_mask32_u8(a, b, &mut all_ones);
+        assert_eq!(r, 0); // a | b is non-zero
+        assert_eq!(all_ones, 1); // a | b has all 32 bits set
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_kortest_mask64_u8() {
+        let a: __mmask64 = 0b0110100101101001_0110100101101001;
+        let b: __mmask64 = 0b1011011010110110_1011011010110110;
+        let mut all_ones: u8 = 0;
+        let r = _kortest_mask64_u8(a, b, &mut all_ones);
+        assert_eq!(r, 0); // a | b is non-zero
+        assert_eq!(all_ones, 0); // the upper 32 bits of a | b are clear
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_kortestc_mask32_u8() {
+        let a: __mmask32 = 0b0110100101101001_0110100101101001;
+        let b: __mmask32 = 0b1011011010110110_1011011010110110;
+        let r = _kortestc_mask32_u8(a, b);
+        assert_eq!(r, 1); // a | b has all 32 bits set
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_kortestc_mask64_u8() {
+        let a: __mmask64 = 0b0110100101101001_0110100101101001;
+        let b: __mmask64 = 0b1011011010110110_1011011010110110;
+        let r = _kortestc_mask64_u8(a, b);
+        assert_eq!(r, 0); // the upper 32 bits of a | b are clear
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_kortestz_mask32_u8() {
+        let a: __mmask32 = 0b0110100101101001_0110100101101001;
+        let b: __mmask32 = 0b1011011010110110_1011011010110110;
+        let r = _kortestz_mask32_u8(a, b);
+        assert_eq!(r, 0); // a | b is non-zero
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_kortestz_mask64_u8() {
+        let a: __mmask64 = 0b0110100101101001_0110100101101001;
+        let b: __mmask64 = 0b1011011010110110_1011011010110110;
+        let r = _kortestz_mask64_u8(a, b);
+        assert_eq!(r, 0); // a | b is non-zero
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_kshiftli_mask32() {
+        let a: __mmask32 = 0b0110100101101001_0110100101101001;
+        let r = _kshiftli_mask32::<3>(a);
+        let e: __mmask32 = 0b0100101101001011_0100101101001000;
+        assert_eq!(r, e); // e == a << 3 with the top three bits shifted out
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_kshiftli_mask64() {
+        let a: __mmask64 = 0b0110100101101001_0110100101101001;
+        let r = _kshiftli_mask64::<3>(a);
+        let e: __mmask64 = 0b0110100101101001011_0100101101001000;
+        assert_eq!(r, e); // e == a << 3; no set bit leaves the 64-bit mask
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_kshiftri_mask32() {
+        let a: __mmask32 = 0b0110100101101001_0110100101101001;
+        let r = _kshiftri_mask32::<3>(a);
+        let e: __mmask32 = 0b0000110100101101_0010110100101101;
+        assert_eq!(r, e); // e == a >> 3 with zeros shifted in at the top
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_kshiftri_mask64() {
+        let a: __mmask64 = 0b0110100101101001011_0100101101001000;
+        let r = _kshiftri_mask64::<3>(a);
+        let e: __mmask64 = 0b0110100101101001_0110100101101001;
+        assert_eq!(r, e); // e == a >> 3
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_ktest_mask32_u8() {
+        let a: __mmask32 = 0b0110100100111100_0110100100111100;
+        let b: __mmask32 = 0b1001011011000011_1001011011000011;
+        let mut and_not: u8 = 0;
+        let r = _ktest_mask32_u8(a, b, &mut and_not);
+        assert_eq!(r, 1); // a & b == 0 (b is the bitwise complement of a)
+        assert_eq!(and_not, 0); // !a & b == b, which is non-zero
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_ktestc_mask32_u8() {
+        let a: __mmask32 = 0b0110100100111100_0110100100111100;
+        let b: __mmask32 = 0b1001011011000011_1001011011000011;
+        let r = _ktestc_mask32_u8(a, b);
+        assert_eq!(r, 0); // !a & b == b, which is non-zero
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_ktestz_mask32_u8() {
+        let a: __mmask32 = 0b0110100100111100_0110100100111100;
+        let b: __mmask32 = 0b1001011011000011_1001011011000011;
+        let r = _ktestz_mask32_u8(a, b);
+        assert_eq!(r, 1); // a & b == 0 (b is the bitwise complement of a)
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_ktest_mask64_u8() {
+        let a: __mmask64 = 0b0110100100111100_0110100100111100;
+        let b: __mmask64 = 0b1001011011000011_1001011011000011;
+        let mut and_not: u8 = 0;
+        let r = _ktest_mask64_u8(a, b, &mut and_not);
+        assert_eq!(r, 1); // a & b == 0 (no bit is set in both masks)
+        assert_eq!(and_not, 0); // !a & b == b, which is non-zero
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_ktestc_mask64_u8() {
+        let a: __mmask64 = 0b0110100100111100_0110100100111100;
+        let b: __mmask64 = 0b1001011011000011_1001011011000011;
+        let r = _ktestc_mask64_u8(a, b);
+        assert_eq!(r, 0); // !a & b == b, which is non-zero
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_ktestz_mask64_u8() {
+        let a: __mmask64 = 0b0110100100111100_0110100100111100;
+        let b: __mmask64 = 0b1001011011000011_1001011011000011;
+        let r = _ktestz_mask64_u8(a, b);
+        assert_eq!(r, 1); // a & b == 0 (no bit is set in both masks)
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_kunpackw() {
+        let a: u32 = 0x00110011;
+        let b: u32 = 0x00001011;
+        let r = _mm512_kunpackw(a, b);
+        let e: u32 = 0x00111011; // ((a & 0xFFFF) << 16) | (b & 0xFFFF)
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_mm512_kunpackd() {
+        let a: u64 = 0x11001100_00110011;
+        let b: u64 = 0x00101110_00001011;
+        let r = _mm512_kunpackd(a, b);
+        let e: u64 = 0x00110011_00001011; // ((a & 0xFFFF_FFFF) << 32) | (b & 0xFFFF_FFFF)
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "avx512bw")]
     unsafe fn test_mm512_cvtepi16_epi8() {
         let a = _mm512_set1_epi16(2);
diff --git a/crates/core_arch/src/x86_64/avx512bw.rs b/crates/core_arch/src/x86_64/avx512bw.rs
new file mode 100644
index 0000000000..798fc4adf6
--- /dev/null
+++ b/crates/core_arch/src/x86_64/avx512bw.rs
@@ -0,0 +1,45 @@
+use crate::core_arch::x86::*;
+
+/// Convert 64-bit mask `a` into an integer value, and store the result in `dst`.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtmask64_u64)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _cvtmask64_u64(a: __mmask64) -> u64 {
+    a // returned as-is: the mask value is not modified
+}
+
+/// Convert integer value `a` into a 64-bit mask, and store the result in `k`.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu64_mask64)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _cvtu64_mask64(a: u64) -> __mmask64 {
+    a // returned as-is: the integer value is not modified
+}
+
+#[cfg(test)]
+mod tests {
+
+    use stdarch_test::simd_test;
+
+    use crate::core_arch::{x86::*, x86_64::*};
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_cvtmask64_u64() {
+        let a: __mmask64 = 0xF00F_A55A_CC33_6699; // bits set in both 32-bit halves
+        let r = _cvtmask64_u64(a);
+        let e: u64 = 0xF00F_A55A_CC33_6699;
+        assert_eq!(r, e); // all 64 bits must survive the conversion
+    }
+
+    #[simd_test(enable = "avx512bw")]
+    unsafe fn test_cvtu64_mask64() {
+        let a: u64 = 0xF00F_A55A_CC33_6699; // bits set in both 32-bit halves
+        let r = _cvtu64_mask64(a);
+        let e: __mmask64 = 0xF00F_A55A_CC33_6699;
+        assert_eq!(r, e); // all 64 bits must survive the conversion
+    }
+}
diff --git a/crates/core_arch/src/x86_64/mod.rs b/crates/core_arch/src/x86_64/mod.rs
index ff46373d90..fb7bce6871 100644
--- a/crates/core_arch/src/x86_64/mod.rs
+++ b/crates/core_arch/src/x86_64/mod.rs
@@ -46,6 +46,10 @@ mod avx512f;
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub use self::avx512f::*;
 
+mod avx512bw;
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub use self::avx512bw::*;
+
 mod bswap;
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub use self::bswap::*;