diff --git a/crates/core_arch/missing-x86.md b/crates/core_arch/missing-x86.md index 16f6c58cbb..0916befe04 100644 --- a/crates/core_arch/missing-x86.md +++ b/crates/core_arch/missing-x86.md @@ -51,102 +51,6 @@

-
["AVX512BW"]

- - * [ ] [`_cvtmask32_u32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtmask32_u32) - * [ ] [`_cvtmask64_u64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtmask64_u64) - * [ ] [`_cvtu32_mask32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtu32_mask32) - * [ ] [`_cvtu64_mask64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cvtu64_mask64) - * [ ] [`_kortest_mask32_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortest_mask32_u8) - * [ ] [`_kortest_mask64_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortest_mask64_u8) - * [ ] [`_kortestc_mask32_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortestc_mask32_u8) - * [ ] [`_kortestc_mask64_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortestc_mask64_u8) - * [ ] [`_kortestz_mask32_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortestz_mask32_u8) - * [ ] [`_kortestz_mask64_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kortestz_mask64_u8) - * [ ] [`_kshiftli_mask32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kshiftli_mask32) - * [ ] [`_kshiftli_mask64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kshiftli_mask64) - * [ ] [`_kshiftri_mask32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kshiftri_mask32) - * [ ] [`_kshiftri_mask64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_kshiftri_mask64) - * [ ] [`_ktest_mask32_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_ktest_mask32_u8) - * [ ] [`_ktest_mask64_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_ktest_mask64_u8) - * [ ] [`_ktestc_mask32_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_ktestc_mask32_u8) - * [ ] 
[`_ktestc_mask64_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_ktestc_mask64_u8) - * [ ] [`_ktestz_mask32_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_ktestz_mask32_u8) - * [ ] [`_ktestz_mask64_u8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_ktestz_mask64_u8) - * [ ] [`_mm512_kunpackd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kunpackd) - * [ ] [`_mm512_kunpackw`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kunpackw) -

- - -
["AVX512BW", "AVX512VL"]

- - * [ ] [`_mm256_mask_reduce_add_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_add_epi16) - * [ ] [`_mm256_mask_reduce_add_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_add_epi8) - * [ ] [`_mm256_mask_reduce_and_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_and_epi16) - * [ ] [`_mm256_mask_reduce_and_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_and_epi8) - * [ ] [`_mm256_mask_reduce_max_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_max_epi16) - * [ ] [`_mm256_mask_reduce_max_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_max_epi8) - * [ ] [`_mm256_mask_reduce_max_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_max_epu16) - * [ ] [`_mm256_mask_reduce_max_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_max_epu8) - * [ ] [`_mm256_mask_reduce_min_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_min_epi16) - * [ ] [`_mm256_mask_reduce_min_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_min_epi8) - * [ ] [`_mm256_mask_reduce_min_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_min_epu16) - * [ ] [`_mm256_mask_reduce_min_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_min_epu8) - * [ ] [`_mm256_mask_reduce_mul_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_mul_epi16) - * [ ] [`_mm256_mask_reduce_mul_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_mul_epi8) - * [ ] 
[`_mm256_mask_reduce_or_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_or_epi16) - * [ ] [`_mm256_mask_reduce_or_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_or_epi8) - * [ ] [`_mm256_reduce_add_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_epi16) - * [ ] [`_mm256_reduce_add_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_epi8) - * [ ] [`_mm256_reduce_and_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_and_epi16) - * [ ] [`_mm256_reduce_and_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_and_epi8) - * [ ] [`_mm256_reduce_max_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_epi16) - * [ ] [`_mm256_reduce_max_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_epi8) - * [ ] [`_mm256_reduce_max_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_epu16) - * [ ] [`_mm256_reduce_max_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_epu8) - * [ ] [`_mm256_reduce_min_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_epi16) - * [ ] [`_mm256_reduce_min_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_epi8) - * [ ] [`_mm256_reduce_min_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_epu16) - * [ ] [`_mm256_reduce_min_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_epu8) - * [ ] [`_mm256_reduce_mul_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_epi16) - * [ ] 
[`_mm256_reduce_mul_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_epi8) - * [ ] [`_mm256_reduce_or_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_or_epi16) - * [ ] [`_mm256_reduce_or_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_or_epi8) - * [ ] [`_mm_mask_reduce_add_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_add_epi16) - * [ ] [`_mm_mask_reduce_add_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_add_epi8) - * [ ] [`_mm_mask_reduce_and_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_and_epi16) - * [ ] [`_mm_mask_reduce_and_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_and_epi8) - * [ ] [`_mm_mask_reduce_max_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_max_epi16) - * [ ] [`_mm_mask_reduce_max_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_max_epi8) - * [ ] [`_mm_mask_reduce_max_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_max_epu16) - * [ ] [`_mm_mask_reduce_max_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_max_epu8) - * [ ] [`_mm_mask_reduce_min_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_min_epi16) - * [ ] [`_mm_mask_reduce_min_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_min_epi8) - * [ ] [`_mm_mask_reduce_min_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_min_epu16) - * [ ] [`_mm_mask_reduce_min_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_min_epu8) - * [ ] 
[`_mm_mask_reduce_mul_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_mul_epi16) - * [ ] [`_mm_mask_reduce_mul_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_mul_epi8) - * [ ] [`_mm_mask_reduce_or_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_or_epi16) - * [ ] [`_mm_mask_reduce_or_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_or_epi8) - * [ ] [`_mm_reduce_add_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_epi16) - * [ ] [`_mm_reduce_add_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_epi8) - * [ ] [`_mm_reduce_and_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_and_epi16) - * [ ] [`_mm_reduce_and_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_and_epi8) - * [ ] [`_mm_reduce_max_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_epi16) - * [ ] [`_mm_reduce_max_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_epi8) - * [ ] [`_mm_reduce_max_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_epu16) - * [ ] [`_mm_reduce_max_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_epu8) - * [ ] [`_mm_reduce_min_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_epi16) - * [ ] [`_mm_reduce_min_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_epi8) - * [ ] [`_mm_reduce_min_epu16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_epu16) - * [ ] [`_mm_reduce_min_epu8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_epu8) - * [ ] 
[`_mm_reduce_mul_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_epi16) - * [ ] [`_mm_reduce_mul_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_epi8) - * [ ] [`_mm_reduce_or_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_or_epi16) - * [ ] [`_mm_reduce_or_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_or_epi8) -

- -
["AVX512_FP16"]

* [ ] [`_mm256_castpd_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph) diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs index 91fef37895..4c637f49f3 100644 --- a/crates/core_arch/src/simd.rs +++ b/crates/core_arch/src/simd.rs @@ -743,3 +743,142 @@ simd_ty!( x6, x7 ); + +// 1024-bit wide types: +simd_ty!( + u16x64[u16]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31, + x32, + x33, + x34, + x35, + x36, + x37, + x38, + x39, + x40, + x41, + x42, + x43, + x44, + x45, + x46, + x47, + x48, + x49, + x50, + x51, + x52, + x53, + x54, + x55, + x56, + x57, + x58, + x59, + x60, + x61, + x62, + x63 +); +simd_ty!( + i32x32[i32]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31 +); +simd_ty!( + u32x32[u32]: + x0, + x1, + x2, + x3, + x4, + x5, + x6, + x7, + x8, + x9, + x10, + x11, + x12, + x13, + x14, + x15, + x16, + x17, + x18, + x19, + x20, + x21, + x22, + x23, + x24, + x25, + x26, + x27, + x28, + x29, + x30, + x31 +); diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index dd74d11786..66f6ee1259 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -2,7 +2,7 @@ use crate::{ arch::asm, core_arch::{simd::*, x86::*}, intrinsics::simd::*, - mem, ptr, + ptr, }; #[cfg(test)] @@ -17,11 +17,8 @@ use stdarch_test::assert_instr; #[cfg_attr(test, assert_instr(vpabsw))] pub unsafe fn _mm512_abs_epi16(a: __m512i) -> __m512i { let a = a.as_i16x32(); - // all-0 is a properly initialized i16x32 - let zero: i16x32 = mem::zeroed(); - let sub = simd_sub(zero, a); - let cmp: i16x32 = simd_gt(a, zero); - transmute(simd_select(cmp, 
a, sub)) + let cmp: i16x32 = simd_gt(a, i16x32::splat(0)); + transmute(simd_select(cmp, a, simd_neg(a))) } /// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -108,11 +105,8 @@ pub unsafe fn _mm_maskz_abs_epi16(k: __mmask8, a: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(vpabsb))] pub unsafe fn _mm512_abs_epi8(a: __m512i) -> __m512i { let a = a.as_i8x64(); - // all-0 is a properly initialized i8x64 - let zero: i8x64 = mem::zeroed(); - let sub = simd_sub(zero, a); - let cmp: i8x64 = simd_gt(a, zero); - transmute(simd_select(cmp, a, sub)) + let cmp: i8x64 = simd_gt(a, i8x64::splat(0)); + transmute(simd_select(cmp, a, simd_neg(a))) } /// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -370,12 +364,7 @@ pub unsafe fn _mm_maskz_add_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusw))] pub unsafe fn _mm512_adds_epu16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpaddusw( - a.as_u16x32(), - b.as_u16x32(), - _mm512_setzero_si512().as_u16x32(), - 0b11111111_11111111_11111111_11111111, - )) + transmute(simd_saturating_add(a.as_u16x32(), b.as_u16x32())) } /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -391,7 +380,8 @@ pub unsafe fn _mm512_mask_adds_epu16( a: __m512i, b: __m512i, ) -> __m512i { - transmute(vpaddusw(a.as_u16x32(), b.as_u16x32(), src.as_u16x32(), k)) + let add = _mm512_adds_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, add, src.as_u16x32())) } /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -402,12 +392,8 @@ pub unsafe fn _mm512_mask_adds_epu16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusw))] pub unsafe fn _mm512_maskz_adds_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - transmute(vpaddusw( - a.as_u16x32(), - b.as_u16x32(), - _mm512_setzero_si512().as_u16x32(), - k, - )) + let add = _mm512_adds_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, add, u16x32::splat(0))) } /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -423,12 +409,8 @@ pub unsafe fn _mm256_mask_adds_epu16( a: __m256i, b: __m256i, ) -> __m256i { - transmute(vpaddusw256( - a.as_u16x16(), - b.as_u16x16(), - src.as_u16x16(), - k, - )) + let add = _mm256_adds_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, add, src.as_u16x16())) } /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -439,12 +421,8 @@ pub unsafe fn _mm256_mask_adds_epu16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusw))] pub unsafe fn _mm256_maskz_adds_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - transmute(vpaddusw256( - a.as_u16x16(), - b.as_u16x16(), - _mm256_setzero_si256().as_u16x16(), - k, - )) + let add = _mm256_adds_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, add, u16x16::splat(0))) } /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -455,7 +433,8 @@ pub unsafe fn _mm256_maskz_adds_epu16(k: __mmask16, a: __m256i, b: __m256i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusw))] pub unsafe fn _mm_mask_adds_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - transmute(vpaddusw128(a.as_u16x8(), b.as_u16x8(), src.as_u16x8(), k)) + let add = _mm_adds_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, add, src.as_u16x8())) } /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -466,12 +445,8 @@ pub unsafe fn _mm_mask_adds_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusw))] pub unsafe fn _mm_maskz_adds_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - transmute(vpaddusw128( - a.as_u16x8(), - b.as_u16x8(), - _mm_setzero_si128().as_u16x8(), - k, - )) + let add = _mm_adds_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, add, u16x8::splat(0))) } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst. 
@@ -482,12 +457,7 @@ pub unsafe fn _mm_maskz_adds_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] pub unsafe fn _mm512_adds_epu8(a: __m512i, b: __m512i) -> __m512i { - transmute(vpaddusb( - a.as_u8x64(), - b.as_u8x64(), - _mm512_setzero_si512().as_u8x64(), - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - )) + transmute(simd_saturating_add(a.as_u8x64(), b.as_u8x64())) } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -498,7 +468,8 @@ pub unsafe fn _mm512_adds_epu8(a: __m512i, b: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] pub unsafe fn _mm512_mask_adds_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - transmute(vpaddusb(a.as_u8x64(), b.as_u8x64(), src.as_u8x64(), k)) + let add = _mm512_adds_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, add, src.as_u8x64())) } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -509,12 +480,8 @@ pub unsafe fn _mm512_mask_adds_epu8(src: __m512i, k: __mmask64, a: __m512i, b: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] pub unsafe fn _mm512_maskz_adds_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - transmute(vpaddusb( - a.as_u8x64(), - b.as_u8x64(), - _mm512_setzero_si512().as_u8x64(), - k, - )) + let add = _mm512_adds_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, add, u8x64::splat(0))) } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -525,7 +492,8 @@ pub unsafe fn _mm512_maskz_adds_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] pub unsafe fn _mm256_mask_adds_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - transmute(vpaddusb256(a.as_u8x32(), b.as_u8x32(), src.as_u8x32(), k)) + let add = _mm256_adds_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, add, src.as_u8x32())) } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -536,12 +504,8 @@ pub unsafe fn _mm256_mask_adds_epu8(src: __m256i, k: __mmask32, a: __m256i, b: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] pub unsafe fn _mm256_maskz_adds_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - transmute(vpaddusb256( - a.as_u8x32(), - b.as_u8x32(), - _mm256_setzero_si256().as_u8x32(), - k, - )) + let add = _mm256_adds_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, add, u8x32::splat(0))) } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -552,7 +516,8 @@ pub unsafe fn _mm256_maskz_adds_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] pub unsafe fn _mm_mask_adds_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - transmute(vpaddusb128(a.as_u8x16(), b.as_u8x16(), src.as_u8x16(), k)) + let add = _mm_adds_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, add, src.as_u8x16())) } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -563,12 +528,8 @@ pub unsafe fn _mm_mask_adds_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] pub unsafe fn _mm_maskz_adds_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - transmute(vpaddusb128( - a.as_u8x16(), - b.as_u8x16(), - _mm_setzero_si128().as_u8x16(), - k, - )) + let add = _mm_adds_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, add, u8x16::splat(0))) } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst. 
@@ -579,12 +540,7 @@ pub unsafe fn _mm_maskz_adds_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsw))] pub unsafe fn _mm512_adds_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpaddsw( - a.as_i16x32(), - b.as_i16x32(), - _mm512_setzero_si512().as_i16x32(), - 0b11111111_11111111_11111111_11111111, - )) + transmute(simd_saturating_add(a.as_i16x32(), b.as_i16x32())) } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -600,7 +556,8 @@ pub unsafe fn _mm512_mask_adds_epi16( a: __m512i, b: __m512i, ) -> __m512i { - transmute(vpaddsw(a.as_i16x32(), b.as_i16x32(), src.as_i16x32(), k)) + let add = _mm512_adds_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, src.as_i16x32())) } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -611,12 +568,8 @@ pub unsafe fn _mm512_mask_adds_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsw))] pub unsafe fn _mm512_maskz_adds_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - transmute(vpaddsw( - a.as_i16x32(), - b.as_i16x32(), - _mm512_setzero_si512().as_i16x32(), - k, - )) + let add = _mm512_adds_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, i16x32::splat(0))) } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -632,7 +585,8 @@ pub unsafe fn _mm256_mask_adds_epi16( a: __m256i, b: __m256i, ) -> __m256i { - transmute(vpaddsw256(a.as_i16x16(), b.as_i16x16(), src.as_i16x16(), k)) + let add = _mm256_adds_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, src.as_i16x16())) } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -643,12 +597,8 @@ pub unsafe fn _mm256_mask_adds_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsw))] pub unsafe fn _mm256_maskz_adds_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - transmute(vpaddsw256( - a.as_i16x16(), - b.as_i16x16(), - _mm256_setzero_si256().as_i16x16(), - k, - )) + let add = _mm256_adds_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, i16x16::splat(0))) } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -659,7 +609,8 @@ pub unsafe fn _mm256_maskz_adds_epi16(k: __mmask16, a: __m256i, b: __m256i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsw))] pub unsafe fn _mm_mask_adds_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - transmute(vpaddsw128(a.as_i16x8(), b.as_i16x8(), src.as_i16x8(), k)) + let add = _mm_adds_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, src.as_i16x8())) } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -670,12 +621,8 @@ pub unsafe fn _mm_mask_adds_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsw))] pub unsafe fn _mm_maskz_adds_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - transmute(vpaddsw128( - a.as_i16x8(), - b.as_i16x8(), - _mm_setzero_si128().as_i16x8(), - k, - )) + let add = _mm_adds_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, i16x8::splat(0))) } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst. @@ -686,12 +633,7 @@ pub unsafe fn _mm_maskz_adds_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] pub unsafe fn _mm512_adds_epi8(a: __m512i, b: __m512i) -> __m512i { - transmute(vpaddsb( - a.as_i8x64(), - b.as_i8x64(), - _mm512_setzero_si512().as_i8x64(), - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - )) + transmute(simd_saturating_add(a.as_i8x64(), b.as_i8x64())) } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -702,7 +644,8 @@ pub unsafe fn _mm512_adds_epi8(a: __m512i, b: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] pub unsafe fn _mm512_mask_adds_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - transmute(vpaddsb(a.as_i8x64(), b.as_i8x64(), src.as_i8x64(), k)) + let add = _mm512_adds_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, src.as_i8x64())) } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -713,12 +656,8 @@ pub unsafe fn _mm512_mask_adds_epi8(src: __m512i, k: __mmask64, a: __m512i, b: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] pub unsafe fn _mm512_maskz_adds_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - transmute(vpaddsb( - a.as_i8x64(), - b.as_i8x64(), - _mm512_setzero_si512().as_i8x64(), - k, - )) + let add = _mm512_adds_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, i8x64::splat(0))) } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -729,7 +668,8 @@ pub unsafe fn _mm512_maskz_adds_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] pub unsafe fn _mm256_mask_adds_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - transmute(vpaddsb256(a.as_i8x32(), b.as_i8x32(), src.as_i8x32(), k)) + let add = _mm256_adds_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, src.as_i8x32())) } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -740,12 +680,8 @@ pub unsafe fn _mm256_mask_adds_epi8(src: __m256i, k: __mmask32, a: __m256i, b: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] pub unsafe fn _mm256_maskz_adds_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - transmute(vpaddsb256( - a.as_i8x32(), - b.as_i8x32(), - _mm256_setzero_si256().as_i8x32(), - k, - )) + let add = _mm256_adds_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, i8x32::splat(0))) } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -756,7 +692,8 @@ pub unsafe fn _mm256_maskz_adds_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] pub unsafe fn _mm_mask_adds_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - transmute(vpaddsb128(a.as_i8x16(), b.as_i8x16(), src.as_i8x16(), k)) + let add = _mm_adds_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, src.as_i8x16())) } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -767,12 +704,8 @@ pub unsafe fn _mm_mask_adds_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] pub unsafe fn _mm_maskz_adds_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - transmute(vpaddsb128( - a.as_i8x16(), - b.as_i8x16(), - _mm_setzero_si128().as_i8x16(), - k, - )) + let add = _mm_adds_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, i8x16::splat(0))) } /// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst. 
@@ -955,12 +888,7 @@ pub unsafe fn _mm_maskz_sub_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusw))] pub unsafe fn _mm512_subs_epu16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpsubusw( - a.as_u16x32(), - b.as_u16x32(), - _mm512_setzero_si512().as_u16x32(), - 0b11111111_11111111_11111111_11111111, - )) + transmute(simd_saturating_sub(a.as_u16x32(), b.as_u16x32())) } /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -976,7 +904,8 @@ pub unsafe fn _mm512_mask_subs_epu16( a: __m512i, b: __m512i, ) -> __m512i { - transmute(vpsubusw(a.as_u16x32(), b.as_u16x32(), src.as_u16x32(), k)) + let sub = _mm512_subs_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, sub, src.as_u16x32())) } /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -987,12 +916,8 @@ pub unsafe fn _mm512_mask_subs_epu16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusw))] pub unsafe fn _mm512_maskz_subs_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - transmute(vpsubusw( - a.as_u16x32(), - b.as_u16x32(), - _mm512_setzero_si512().as_u16x32(), - k, - )) + let sub = _mm512_subs_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, sub, u16x32::splat(0))) } /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1008,12 +933,8 @@ pub unsafe fn _mm256_mask_subs_epu16( a: __m256i, b: __m256i, ) -> __m256i { - transmute(vpsubusw256( - a.as_u16x16(), - b.as_u16x16(), - src.as_u16x16(), - k, - )) + let sub = _mm256_subs_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, sub, src.as_u16x16())) } /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1024,12 +945,8 @@ pub unsafe fn _mm256_mask_subs_epu16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusw))] pub unsafe fn _mm256_maskz_subs_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - transmute(vpsubusw256( - a.as_u16x16(), - b.as_u16x16(), - _mm256_setzero_si256().as_u16x16(), - k, - )) + let sub = _mm256_subs_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, sub, u16x16::splat(0))) } /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1040,7 +957,8 @@ pub unsafe fn _mm256_maskz_subs_epu16(k: __mmask16, a: __m256i, b: __m256i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusw))] pub unsafe fn _mm_mask_subs_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - transmute(vpsubusw128(a.as_u16x8(), b.as_u16x8(), src.as_u16x8(), k)) + let sub = _mm_subs_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, sub, src.as_u16x8())) } /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1051,12 +969,8 @@ pub unsafe fn _mm_mask_subs_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusw))] pub unsafe fn _mm_maskz_subs_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - transmute(vpsubusw128( - a.as_u16x8(), - b.as_u16x8(), - _mm_setzero_si128().as_u16x8(), - k, - )) + let sub = _mm_subs_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, sub, u16x8::splat(0))) } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst. @@ -1067,12 +981,7 @@ pub unsafe fn _mm_maskz_subs_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] pub unsafe fn _mm512_subs_epu8(a: __m512i, b: __m512i) -> __m512i { - transmute(vpsubusb( - a.as_u8x64(), - b.as_u8x64(), - _mm512_setzero_si512().as_u8x64(), - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - )) + transmute(simd_saturating_sub(a.as_u8x64(), b.as_u8x64())) } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1083,7 +992,8 @@ pub unsafe fn _mm512_subs_epu8(a: __m512i, b: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] pub unsafe fn _mm512_mask_subs_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - transmute(vpsubusb(a.as_u8x64(), b.as_u8x64(), src.as_u8x64(), k)) + let sub = _mm512_subs_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, sub, src.as_u8x64())) } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1094,12 +1004,8 @@ pub unsafe fn _mm512_mask_subs_epu8(src: __m512i, k: __mmask64, a: __m512i, b: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] pub unsafe fn _mm512_maskz_subs_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - transmute(vpsubusb( - a.as_u8x64(), - b.as_u8x64(), - _mm512_setzero_si512().as_u8x64(), - k, - )) + let sub = _mm512_subs_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, sub, u8x64::splat(0))) } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1110,7 +1016,8 @@ pub unsafe fn _mm512_maskz_subs_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] pub unsafe fn _mm256_mask_subs_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - transmute(vpsubusb256(a.as_u8x32(), b.as_u8x32(), src.as_u8x32(), k)) + let sub = _mm256_subs_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, sub, src.as_u8x32())) } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1121,12 +1028,8 @@ pub unsafe fn _mm256_mask_subs_epu8(src: __m256i, k: __mmask32, a: __m256i, b: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] pub unsafe fn _mm256_maskz_subs_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - transmute(vpsubusb256( - a.as_u8x32(), - b.as_u8x32(), - _mm256_setzero_si256().as_u8x32(), - k, - )) + let sub = _mm256_subs_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, sub, u8x32::splat(0))) } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1137,7 +1040,8 @@ pub unsafe fn _mm256_maskz_subs_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] pub unsafe fn _mm_mask_subs_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - transmute(vpsubusb128(a.as_u8x16(), b.as_u8x16(), src.as_u8x16(), k)) + let sub = _mm_subs_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, sub, src.as_u8x16())) } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1148,12 +1052,8 @@ pub unsafe fn _mm_mask_subs_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] pub unsafe fn _mm_maskz_subs_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - transmute(vpsubusb128( - a.as_u8x16(), - b.as_u8x16(), - _mm_setzero_si128().as_u8x16(), - k, - )) + let sub = _mm_subs_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, sub, u8x16::splat(0))) } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst. 
@@ -1164,12 +1064,7 @@ pub unsafe fn _mm_maskz_subs_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsw))] pub unsafe fn _mm512_subs_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpsubsw( - a.as_i16x32(), - b.as_i16x32(), - _mm512_setzero_si512().as_i16x32(), - 0b11111111_11111111_11111111_11111111, - )) + transmute(simd_saturating_sub(a.as_i16x32(), b.as_i16x32())) } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1185,7 +1080,8 @@ pub unsafe fn _mm512_mask_subs_epi16( a: __m512i, b: __m512i, ) -> __m512i { - transmute(vpsubsw(a.as_i16x32(), b.as_i16x32(), src.as_i16x32(), k)) + let sub = _mm512_subs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, src.as_i16x32())) } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1196,12 +1092,8 @@ pub unsafe fn _mm512_mask_subs_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsw))] pub unsafe fn _mm512_maskz_subs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - transmute(vpsubsw( - a.as_i16x32(), - b.as_i16x32(), - _mm512_setzero_si512().as_i16x32(), - k, - )) + let sub = _mm512_subs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, i16x32::splat(0))) } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1217,7 +1109,8 @@ pub unsafe fn _mm256_mask_subs_epi16( a: __m256i, b: __m256i, ) -> __m256i { - transmute(vpsubsw256(a.as_i16x16(), b.as_i16x16(), src.as_i16x16(), k)) + let sub = _mm256_subs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, src.as_i16x16())) } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1228,12 +1121,8 @@ pub unsafe fn _mm256_mask_subs_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsw))] pub unsafe fn _mm256_maskz_subs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - transmute(vpsubsw256( - a.as_i16x16(), - b.as_i16x16(), - _mm256_setzero_si256().as_i16x16(), - k, - )) + let sub = _mm256_subs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, i16x16::splat(0))) } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1244,7 +1133,8 @@ pub unsafe fn _mm256_maskz_subs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsw))] pub unsafe fn _mm_mask_subs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - transmute(vpsubsw128(a.as_i16x8(), b.as_i16x8(), src.as_i16x8(), k)) + let sub = _mm_subs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, src.as_i16x8())) } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1255,12 +1145,8 @@ pub unsafe fn _mm_mask_subs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsw))] pub unsafe fn _mm_maskz_subs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - transmute(vpsubsw128( - a.as_i16x8(), - b.as_i16x8(), - _mm_setzero_si128().as_i16x8(), - k, - )) + let sub = _mm_subs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, i16x8::splat(0))) } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst. @@ -1271,12 +1157,7 @@ pub unsafe fn _mm_maskz_subs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] pub unsafe fn _mm512_subs_epi8(a: __m512i, b: __m512i) -> __m512i { - transmute(vpsubsb( - a.as_i8x64(), - b.as_i8x64(), - _mm512_setzero_si512().as_i8x64(), - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - )) + transmute(simd_saturating_sub(a.as_i8x64(), b.as_i8x64())) } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1287,7 +1168,8 @@ pub unsafe fn _mm512_subs_epi8(a: __m512i, b: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] pub unsafe fn _mm512_mask_subs_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - transmute(vpsubsb(a.as_i8x64(), b.as_i8x64(), src.as_i8x64(), k)) + let sub = _mm512_subs_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, src.as_i8x64())) } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1298,12 +1180,8 @@ pub unsafe fn _mm512_mask_subs_epi8(src: __m512i, k: __mmask64, a: __m512i, b: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] pub unsafe fn _mm512_maskz_subs_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - transmute(vpsubsb( - a.as_i8x64(), - b.as_i8x64(), - _mm512_setzero_si512().as_i8x64(), - k, - )) + let sub = _mm512_subs_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, i8x64::splat(0))) } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1314,7 +1192,8 @@ pub unsafe fn _mm512_maskz_subs_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] pub unsafe fn _mm256_mask_subs_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - transmute(vpsubsb256(a.as_i8x32(), b.as_i8x32(), src.as_i8x32(), k)) + let sub = _mm256_subs_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, sub, src.as_i8x32())) } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1325,12 +1204,8 @@ pub unsafe fn _mm256_mask_subs_epi8(src: __m256i, k: __mmask32, a: __m256i, b: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] pub unsafe fn _mm256_maskz_subs_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - transmute(vpsubsb256( - a.as_i8x32(), - b.as_i8x32(), - _mm256_setzero_si256().as_i8x32(), - k, - )) + let sub = _mm256_subs_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, sub, i8x32::splat(0))) } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1341,7 +1216,8 @@ pub unsafe fn _mm256_maskz_subs_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] pub unsafe fn _mm_mask_subs_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - transmute(vpsubsb128(a.as_i8x16(), b.as_i8x16(), src.as_i8x16(), k)) + let sub = _mm_subs_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, sub, src.as_i8x16())) } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1352,12 +1228,8 @@ pub unsafe fn _mm_mask_subs_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] pub unsafe fn _mm_maskz_subs_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - transmute(vpsubsb128( - a.as_i8x16(), - b.as_i8x16(), - _mm_setzero_si128().as_i8x16(), - k, - )) + let sub = _mm_subs_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, sub, i8x16::splat(0))) } /// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst. 
@@ -1368,7 +1240,10 @@ pub unsafe fn _mm_maskz_subs_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhuw))] pub unsafe fn _mm512_mulhi_epu16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmulhuw(a.as_u16x32(), b.as_u16x32())) + let a = simd_cast::<_, u32x32>(a.as_u16x32()); + let b = simd_cast::<_, u32x32>(b.as_u16x32()); + let r = simd_shr(simd_mul(a, b), u32x32::splat(16)); + transmute(simd_cast::<u32x32, u16x32>(r)) } /// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1464,7 +1339,10 @@ pub unsafe fn _mm_maskz_mulhi_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhw))] pub unsafe fn _mm512_mulhi_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmulhw(a.as_i16x32(), b.as_i16x32())) + let a = simd_cast::<_, i32x32>(a.as_i16x32()); + let b = simd_cast::<_, i32x32>(b.as_i16x32()); + let r = simd_shr(simd_mul(a, b), i32x32::splat(16)); + transmute(simd_cast::<i32x32, i16x32>(r)) } /// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1752,7 +1630,9 @@ pub unsafe fn _mm_maskz_mullo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuw))] pub unsafe fn _mm512_max_epu16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaxuw(a.as_u16x32(), b.as_u16x32())) + let a = a.as_u16x32(); + let b = b.as_u16x32(); + transmute(simd_select::<i16x32, _>(simd_gt(a, b), a, b)) } /// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1838,7 +1718,9 @@ pub unsafe fn _mm_maskz_max_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxub))] pub unsafe fn _mm512_max_epu8(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaxub(a.as_u8x64(), b.as_u8x64())) + let a = a.as_u8x64(); + let b = b.as_u8x64(); + transmute(simd_select::<i8x64, _>(simd_gt(a, b), a, b)) } /// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1924,7 +1806,9 @@ pub unsafe fn _mm_maskz_max_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsw))] pub unsafe fn _mm512_max_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaxsw(a.as_i16x32(), b.as_i16x32())) + let a = a.as_i16x32(); + let b = b.as_i16x32(); + transmute(simd_select::<i16x32, _>(simd_gt(a, b), a, b)) } /// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -2010,7 +1894,9 @@ pub unsafe fn _mm_maskz_max_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsb))] pub unsafe fn _mm512_max_epi8(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaxsb(a.as_i8x64(), b.as_i8x64())) + let a = a.as_i8x64(); + let b = b.as_i8x64(); + transmute(simd_select::<i8x64, _>(simd_gt(a, b), a, b)) } /// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2096,7 +1982,9 @@ pub unsafe fn _mm_maskz_max_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuw))] pub unsafe fn _mm512_min_epu16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpminuw(a.as_u16x32(), b.as_u16x32())) + let a = a.as_u16x32(); + let b = b.as_u16x32(); + transmute(simd_select::<i16x32, _>(simd_lt(a, b), a, b)) } /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2182,7 +2070,9 @@ pub unsafe fn _mm_maskz_min_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminub))] pub unsafe fn _mm512_min_epu8(a: __m512i, b: __m512i) -> __m512i { - transmute(vpminub(a.as_u8x64(), b.as_u8x64())) + let a = a.as_u8x64(); + let b = b.as_u8x64(); + transmute(simd_select::<i8x64, _>(simd_lt(a, b), a, b)) } /// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -2268,7 +2158,9 @@ pub unsafe fn _mm_maskz_min_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsw))] pub unsafe fn _mm512_min_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpminsw(a.as_i16x32(), b.as_i16x32())) + let a = a.as_i16x32(); + let b = b.as_i16x32(); + transmute(simd_select::<i16x32, _>(simd_lt(a, b), a, b)) } /// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2354,7 +2246,9 @@ pub unsafe fn _mm_maskz_min_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsb))] pub unsafe fn _mm512_min_epi8(a: __m512i, b: __m512i) -> __m512i { - transmute(vpminsb(a.as_i8x64(), b.as_i8x64())) + let a = a.as_i8x64(); + let b = b.as_i8x64(); + transmute(simd_select::<i8x64, _>(simd_lt(a, b), a, b)) } /// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2451,7 +2345,7 @@ pub unsafe fn _mm512_cmplt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmplt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_cmplt_epu16_mask(a, b) & k1 + _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k. 
@@ -2473,7 +2367,7 @@ pub unsafe fn _mm256_cmplt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmplt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_cmplt_epu16_mask(a, b) & k1 + _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k. @@ -2495,7 +2389,7 @@ pub unsafe fn _mm_cmplt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmplt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmplt_epu16_mask(a, b) & k1 + _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k. @@ -2517,7 +2411,7 @@ pub unsafe fn _mm512_cmplt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmplt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_cmplt_epu8_mask(a, b) & k1 + _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k. @@ -2539,7 +2433,7 @@ pub unsafe fn _mm256_cmplt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmplt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_cmplt_epu8_mask(a, b) & k1 + _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k. 
@@ -2561,7 +2455,7 @@ pub unsafe fn _mm_cmplt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmplt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_cmplt_epu8_mask(a, b) & k1 + _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k. @@ -2583,7 +2477,7 @@ pub unsafe fn _mm512_cmplt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmplt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_cmplt_epi16_mask(a, b) & k1 + _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k. @@ -2605,7 +2499,7 @@ pub unsafe fn _mm256_cmplt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmplt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_cmplt_epi16_mask(a, b) & k1 + _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k. @@ -2627,7 +2521,7 @@ pub unsafe fn _mm_cmplt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmplt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmplt_epi16_mask(a, b) & k1 + _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k. 
@@ -2649,7 +2543,7 @@ pub unsafe fn _mm512_cmplt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmplt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_cmplt_epi8_mask(a, b) & k1 + _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k. @@ -2671,7 +2565,7 @@ pub unsafe fn _mm256_cmplt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmplt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_cmplt_epi8_mask(a, b) & k1 + _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k. @@ -2693,7 +2587,7 @@ pub unsafe fn _mm_cmplt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmplt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_cmplt_epi8_mask(a, b) & k1 + _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -2715,7 +2609,7 @@ pub unsafe fn _mm512_cmpgt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpgt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_cmpgt_epu16_mask(a, b) & k1 + _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k. 
@@ -2737,7 +2631,7 @@ pub unsafe fn _mm256_cmpgt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpgt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_cmpgt_epu16_mask(a, b) & k1 + _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -2759,7 +2653,7 @@ pub unsafe fn _mm_cmpgt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpgt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpgt_epu16_mask(a, b) & k1 + _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -2781,7 +2675,7 @@ pub unsafe fn _mm512_cmpgt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpgt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_cmpgt_epu8_mask(a, b) & k1 + _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -2803,7 +2697,7 @@ pub unsafe fn _mm256_cmpgt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpgt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_cmpgt_epu8_mask(a, b) & k1 + _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k. 
@@ -2825,7 +2719,7 @@ pub unsafe fn _mm_cmpgt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpgt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_cmpgt_epu8_mask(a, b) & k1 + _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -2847,7 +2741,7 @@ pub unsafe fn _mm512_cmpgt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpgt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_cmpgt_epi16_mask(a, b) & k1 + _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -2869,7 +2763,7 @@ pub unsafe fn _mm256_cmpgt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpgt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_cmpgt_epi16_mask(a, b) & k1 + _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -2891,7 +2785,7 @@ pub unsafe fn _mm_cmpgt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpgt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpgt_epi16_mask(a, b) & k1 + _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k. 
@@ -2913,7 +2807,7 @@ pub unsafe fn _mm512_cmpgt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpgt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_cmpgt_epi8_mask(a, b) & k1 + _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -2935,7 +2829,7 @@ pub unsafe fn _mm256_cmpgt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpgt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_cmpgt_epi8_mask(a, b) & k1 + _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k. @@ -2957,7 +2851,7 @@ pub unsafe fn _mm_cmpgt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpgt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_cmpgt_epi8_mask(a, b) & k1 + _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
@@ -2979,7 +2873,7 @@ pub unsafe fn _mm512_cmple_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmple_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_cmple_epu16_mask(a, b) & k1 + _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -3001,7 +2895,7 @@ pub unsafe fn _mm256_cmple_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmple_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_cmple_epu16_mask(a, b) & k1 + _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -3023,7 +2917,7 @@ pub unsafe fn _mm_cmple_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmple_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmple_epu16_mask(a, b) & k1 + _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
@@ -3045,7 +2939,7 @@ pub unsafe fn _mm512_cmple_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmple_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_cmple_epu8_mask(a, b) & k1 + _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -3067,7 +2961,7 @@ pub unsafe fn _mm256_cmple_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmple_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_cmple_epu8_mask(a, b) & k1 + _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -3089,7 +2983,7 @@ pub unsafe fn _mm_cmple_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmple_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_cmple_epu8_mask(a, b) & k1 + _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
@@ -3111,7 +3005,7 @@ pub unsafe fn _mm512_cmple_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmple_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_cmple_epi16_mask(a, b) & k1 + _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -3133,7 +3027,7 @@ pub unsafe fn _mm256_cmple_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmple_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_cmple_epi16_mask(a, b) & k1 + _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -3155,7 +3049,7 @@ pub unsafe fn _mm_cmple_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmple_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmple_epi16_mask(a, b) & k1 + _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. 
@@ -3177,7 +3071,7 @@ pub unsafe fn _mm512_cmple_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmple_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_cmple_epi8_mask(a, b) & k1 + _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -3199,7 +3093,7 @@ pub unsafe fn _mm256_cmple_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmple_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_cmple_epi8_mask(a, b) & k1 + _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k. @@ -3221,7 +3115,7 @@ pub unsafe fn _mm_cmple_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmple_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_cmple_epi8_mask(a, b) & k1 + _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. 
@@ -3243,7 +3137,7 @@ pub unsafe fn _mm512_cmpge_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpge_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_cmpge_epu16_mask(a, b) & k1 + _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -3265,7 +3159,7 @@ pub unsafe fn _mm256_cmpge_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpge_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_cmpge_epu16_mask(a, b) & k1 + _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -3287,7 +3181,7 @@ pub unsafe fn _mm_cmpge_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpge_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpge_epu16_mask(a, b) & k1 + _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. 
@@ -3309,7 +3203,7 @@ pub unsafe fn _mm512_cmpge_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpge_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_cmpge_epu8_mask(a, b) & k1 + _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -3331,7 +3225,7 @@ pub unsafe fn _mm256_cmpge_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpge_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_cmpge_epu8_mask(a, b) & k1 + _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -3353,7 +3247,7 @@ pub unsafe fn _mm_cmpge_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpge_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_cmpge_epu8_mask(a, b) & k1 + _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. 
@@ -3375,7 +3269,7 @@ pub unsafe fn _mm512_cmpge_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpge_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_cmpge_epi16_mask(a, b) & k1 + _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -3397,7 +3291,7 @@ pub unsafe fn _mm256_cmpge_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpge_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_cmpge_epi16_mask(a, b) & k1 + _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -3419,7 +3313,7 @@ pub unsafe fn _mm_cmpge_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpge_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpge_epi16_mask(a, b) & k1 + _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. 
@@ -3441,7 +3335,7 @@ pub unsafe fn _mm512_cmpge_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpge_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_cmpge_epi8_mask(a, b) & k1 + _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -3463,7 +3357,7 @@ pub unsafe fn _mm256_cmpge_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpge_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_cmpge_epi8_mask(a, b) & k1 + _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k. @@ -3485,7 +3379,7 @@ pub unsafe fn _mm_cmpge_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpge_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_cmpge_epi8_mask(a, b) & k1 + _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k. 
@@ -3507,7 +3401,7 @@ pub unsafe fn _mm512_cmpeq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpeq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_cmpeq_epu16_mask(a, b) & k1 + _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k. @@ -3529,7 +3423,7 @@ pub unsafe fn _mm256_cmpeq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpeq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_cmpeq_epu16_mask(a, b) & k1 + _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k. @@ -3551,7 +3445,7 @@ pub unsafe fn _mm_cmpeq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpeq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpeq_epu16_mask(a, b) & k1 + _mm_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k. @@ -3573,7 +3467,7 @@ pub unsafe fn _mm512_cmpeq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpeq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_cmpeq_epu8_mask(a, b) & k1 + _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k. 
@@ -3595,7 +3489,7 @@ pub unsafe fn _mm256_cmpeq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpeq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_cmpeq_epu8_mask(a, b) & k1 + _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k. @@ -3617,7 +3511,7 @@ pub unsafe fn _mm_cmpeq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpeq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_cmpeq_epu8_mask(a, b) & k1 + _mm_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k. @@ -3639,7 +3533,7 @@ pub unsafe fn _mm512_cmpeq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpeq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_cmpeq_epi16_mask(a, b) & k1 + _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k. @@ -3661,7 +3555,7 @@ pub unsafe fn _mm256_cmpeq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpeq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_cmpeq_epi16_mask(a, b) & k1 + _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k. 
@@ -3683,7 +3577,7 @@ pub unsafe fn _mm_cmpeq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpeq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpeq_epi16_mask(a, b) & k1 + _mm_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k. @@ -3705,7 +3599,7 @@ pub unsafe fn _mm512_cmpeq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpeq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_cmpeq_epi8_mask(a, b) & k1 + _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k. @@ -3727,7 +3621,7 @@ pub unsafe fn _mm256_cmpeq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpeq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_cmpeq_epi8_mask(a, b) & k1 + _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k. @@ -3749,7 +3643,7 @@ pub unsafe fn _mm_cmpeq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpeq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_cmpeq_epi8_mask(a, b) & k1 + _mm_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k. 
@@ -3771,7 +3665,7 @@ pub unsafe fn _mm512_cmpneq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpneq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_cmpneq_epu16_mask(a, b) & k1 + _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -3793,7 +3687,7 @@ pub unsafe fn _mm256_cmpneq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpneq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_cmpneq_epu16_mask(a, b) & k1 + _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -3815,7 +3709,7 @@ pub unsafe fn _mm_cmpneq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpneq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpneq_epu16_mask(a, b) & k1 + _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -3837,7 +3731,7 @@ pub unsafe fn _mm512_cmpneq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpneq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_cmpneq_epu8_mask(a, b) & k1 + _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k. 
@@ -3859,7 +3753,7 @@ pub unsafe fn _mm256_cmpneq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpneq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_cmpneq_epu8_mask(a, b) & k1 + _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -3881,7 +3775,7 @@ pub unsafe fn _mm_cmpneq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpneq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_cmpneq_epu8_mask(a, b) & k1 + _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -3903,7 +3797,7 @@ pub unsafe fn _mm512_cmpneq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpneq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { - _mm512_cmpneq_epi16_mask(a, b) & k1 + _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -3925,7 +3819,7 @@ pub unsafe fn _mm256_cmpneq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpneq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { - _mm256_cmpneq_epi16_mask(a, b) & k1 + _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k. 
@@ -3947,7 +3841,7 @@ pub unsafe fn _mm_cmpneq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpneq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { - _mm_cmpneq_epi16_mask(a, b) & k1 + _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -3969,7 +3863,7 @@ pub unsafe fn _mm512_cmpneq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpneq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { - _mm512_cmpneq_epi8_mask(a, b) & k1 + _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k. @@ -3991,7 +3885,7 @@ pub unsafe fn _mm256_cmpneq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm256_mask_cmpneq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { - _mm256_cmpneq_epi8_mask(a, b) & k1 + _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k. 
@@ -4013,7 +3907,7 @@ pub unsafe fn _mm_cmpneq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm_mask_cmpneq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { - _mm_cmpneq_epi8_mask(a, b) & k1 + _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by `IMM8`, and store the results in mask vector k. @@ -4028,7 +3922,17 @@ pub unsafe fn _mm512_cmp_epu16_mask(a: __m512i, b: __m512i) -> static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x32(); let b = b.as_u16x32(); - vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111) + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x32::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x32::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4047,7 +3951,18 @@ pub unsafe fn _mm512_mask_cmp_epu16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x32(); let b = b.as_u16x32(); - vpcmpuw(a, b, IMM8, k1) + let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::splat(0)); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x32::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i16x32::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -4062,7 +3977,17 @@ pub unsafe fn _mm256_cmp_epu16_mask(a: __m256i, b: __m256i) -> static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x16(); let b = b.as_u16x16(); - vpcmpuw256(a, b, IMM8, 0b11111111_11111111) + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x16::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4081,7 +4006,18 @@ pub unsafe fn _mm256_mask_cmp_epu16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x16(); let b = b.as_u16x16(); - vpcmpuw256(a, b, IMM8, k1) + let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::splat(0)); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x16::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i16x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -4096,7 +4032,17 @@ pub unsafe fn _mm_cmp_epu16_mask(a: __m128i, b: __m128i) -> __m static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x8(); let b = b.as_u16x8(); - vpcmpuw128(a, b, IMM8, 0b11111111) + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x8::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4115,7 +4061,18 @@ pub unsafe fn _mm_mask_cmp_epu16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u16x8(); let b = b.as_u16x8(); - vpcmpuw128(a, b, IMM8, k1) + let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::splat(0)); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x8::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i16x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -4130,12 +4087,17 @@ pub unsafe fn _mm512_cmp_epu8_mask(a: __m512i, b: __m512i) -> _ static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x64(); let b = b.as_u8x64(); - vpcmpub( - a, - b, - IMM8, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - ) + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x64::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x64::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4154,7 +4116,18 @@ pub unsafe fn _mm512_mask_cmp_epu8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x64(); let b = b.as_u8x64(); - vpcmpub(a, b, IMM8, k1) + let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::splat(0)); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x64::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i8x64::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -4169,7 +4142,17 @@ pub unsafe fn _mm256_cmp_epu8_mask(a: __m256i, b: __m256i) -> _ static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x32(); let b = b.as_u8x32(); - vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111) + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x32::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x32::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4188,7 +4171,18 @@ pub unsafe fn _mm256_mask_cmp_epu8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x32(); let b = b.as_u8x32(); - vpcmpub256(a, b, IMM8, k1) + let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::splat(0)); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x32::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i8x32::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -4203,7 +4197,17 @@ pub unsafe fn _mm_cmp_epu8_mask(a: __m128i, b: __m128i) -> __mm static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x16(); let b = b.as_u8x16(); - vpcmpub128(a, b, IMM8, 0b11111111_11111111) + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x16::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4222,7 +4226,18 @@ pub unsafe fn _mm_mask_cmp_epu8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_u8x16(); let b = b.as_u8x16(); - vpcmpub128(a, b, IMM8, k1) + let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::splat(0)); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x16::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i8x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -4237,7 +4252,17 @@ pub unsafe fn _mm512_cmp_epi16_mask(a: __m512i, b: __m512i) -> static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x32(); let b = b.as_i16x32(); - vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111) + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x32::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x32::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4256,7 +4281,18 @@ pub unsafe fn _mm512_mask_cmp_epi16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x32(); let b = b.as_i16x32(); - vpcmpw(a, b, IMM8, k1) + let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::splat(0)); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x32::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i16x32::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -4271,7 +4307,17 @@ pub unsafe fn _mm256_cmp_epi16_mask(a: __m256i, b: __m256i) -> static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x16(); let b = b.as_i16x16(); - vpcmpw256(a, b, IMM8, 0b11111111_11111111) + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x16::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4290,7 +4336,18 @@ pub unsafe fn _mm256_mask_cmp_epi16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x16(); let b = b.as_i16x16(); - vpcmpw256(a, b, IMM8, k1) + let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::splat(0)); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x16::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i16x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -4305,7 +4362,17 @@ pub unsafe fn _mm_cmp_epi16_mask(a: __m128i, b: __m128i) -> __m static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x8(); let b = b.as_i16x8(); - vpcmpw128(a, b, IMM8, 0b11111111) + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x8::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4324,7 +4391,18 @@ pub unsafe fn _mm_mask_cmp_epi16_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i16x8(); let b = b.as_i16x8(); - vpcmpw128(a, b, IMM8, k1) + let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::splat(0)); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x8::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i16x8::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -4339,12 +4417,17 @@ pub unsafe fn _mm512_cmp_epi8_mask(a: __m512i, b: __m512i) -> _ static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x64(); let b = b.as_i8x64(); - vpcmpb( - a, - b, - IMM8, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - ) + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x64::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x64::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4363,7 +4446,18 @@ pub unsafe fn _mm512_mask_cmp_epi8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x64(); let b = b.as_i8x64(); - vpcmpb(a, b, IMM8, k1) + let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::splat(0)); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x64::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i8x64::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -4378,7 +4472,17 @@ pub unsafe fn _mm256_cmp_epi8_mask(a: __m256i, b: __m256i) -> _ static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x32(); let b = b.as_i8x32(); - vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111) + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x32::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x32::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4397,7 +4501,18 @@ pub unsafe fn _mm256_mask_cmp_epi8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x32(); let b = b.as_i8x32(); - vpcmpb256(a, b, IMM8, k1) + let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::splat(0)); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x32::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i8x32::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -4412,7 +4527,17 @@ pub unsafe fn _mm_cmp_epi8_mask(a: __m128i, b: __m128i) -> __mm static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x16(); let b = b.as_i8x16(); - vpcmpb128(a, b, IMM8, 0b11111111_11111111) + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x16::splat(0), + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x16::splat(-1), + }; + simd_bitmask(r) } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -4431,139 +4556,838 @@ pub unsafe fn _mm_mask_cmp_epi8_mask( static_assert_uimm_bits!(IMM8, 3); let a = a.as_i8x16(); let b = b.as_i8x16(); - vpcmpb128(a, b, IMM8, k1) + let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::splat(0)); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x16::splat(0), + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => i8x16::splat(-1), + }; + simd_bitmask(r) } -/// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// Reduce the packed 16-bit integers in a by addition. Returns the sum of all elements in a. 
/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi16&expand=3368) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_add_epi16) #[inline] -#[target_feature(enable = "avx512bw")] +#[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 -pub unsafe fn _mm512_loadu_epi16(mem_addr: *const i16) -> __m512i { - ptr::read_unaligned(mem_addr as *const __m512i) +pub unsafe fn _mm256_reduce_add_epi16(a: __m256i) -> i16 { + simd_reduce_add_unordered(a.as_i16x16()) } -/// Load 256-bits (composed of 16 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi16&expand=3365) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_add_epi16) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 -pub unsafe fn _mm256_loadu_epi16(mem_addr: *const i16) -> __m256i { - ptr::read_unaligned(mem_addr as *const __m256i) +pub unsafe fn _mm256_mask_reduce_add_epi16(k: __mmask16, a: __m256i) -> i16 { + simd_reduce_add_unordered(simd_select_bitmask( + k, + a.as_i16x16(), + _mm256_setzero_si256().as_i16x16(), + )) } -/// Load 128-bits (composed of 8 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// Reduce the packed 16-bit integers in a by addition. 
Returns the sum of all elements in a. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi16&expand=3362) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_add_epi16) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 -pub unsafe fn _mm_loadu_epi16(mem_addr: *const i16) -> __m128i { - ptr::read_unaligned(mem_addr as *const __m128i) +pub unsafe fn _mm_reduce_add_epi16(a: __m128i) -> i16 { + simd_reduce_add_unordered(a.as_i16x8()) } -/// Load 512-bits (composed of 64 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi8&expand=3395) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_add_epi16) #[inline] -#[target_feature(enable = "avx512bw")] +#[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 -pub unsafe fn _mm512_loadu_epi8(mem_addr: *const i8) -> __m512i { - ptr::read_unaligned(mem_addr as *const __m512i) +pub unsafe fn _mm_mask_reduce_add_epi16(k: __mmask8, a: __m128i) -> i16 { + simd_reduce_add_unordered(simd_select_bitmask( + k, + a.as_i16x8(), + _mm_setzero_si128().as_i16x8(), + )) } -/// Load 256-bits (composed of 32 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// Reduce the packed 8-bit integers in a by addition. 
Returns the sum of all elements in a. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi8&expand=3392) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_add_epi8) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 -pub unsafe fn _mm256_loadu_epi8(mem_addr: *const i8) -> __m256i { - ptr::read_unaligned(mem_addr as *const __m256i) +pub unsafe fn _mm256_reduce_add_epi8(a: __m256i) -> i8 { + simd_reduce_add_unordered(a.as_i8x32()) } -/// Load 128-bits (composed of 16 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi8&expand=3389) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_add_epi8) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 -pub unsafe fn _mm_loadu_epi8(mem_addr: *const i8) -> __m128i { - ptr::read_unaligned(mem_addr as *const __m128i) +pub unsafe fn _mm256_mask_reduce_add_epi8(k: __mmask32, a: __m256i) -> i8 { + simd_reduce_add_unordered(simd_select_bitmask( + k, + a.as_i8x32(), + _mm256_setzero_si256().as_i8x32(), + )) } -/// Store 512-bits (composed of 32 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// Reduce the packed 8-bit integers in a by addition. Returns the sum of all elements in a. 
/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi16&expand=5622) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_add_epi8) #[inline] -#[target_feature(enable = "avx512bw")] +#[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 -pub unsafe fn _mm512_storeu_epi16(mem_addr: *mut i16, a: __m512i) { - ptr::write_unaligned(mem_addr as *mut __m512i, a); +pub unsafe fn _mm_reduce_add_epi8(a: __m128i) -> i8 { + simd_reduce_add_unordered(a.as_i8x16()) } -/// Store 256-bits (composed of 16 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi16&expand=5620) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_add_epi8) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 -pub unsafe fn _mm256_storeu_epi16(mem_addr: *mut i16, a: __m256i) { - ptr::write_unaligned(mem_addr as *mut __m256i, a); +pub unsafe fn _mm_mask_reduce_add_epi8(k: __mmask16, a: __m128i) -> i8 { + simd_reduce_add_unordered(simd_select_bitmask( + k, + a.as_i8x16(), + _mm_setzero_si128().as_i8x16(), + )) } -/// Store 128-bits (composed of 8 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// Reduce the packed 16-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. 
/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi16&expand=5618) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_and_epi16) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 -pub unsafe fn _mm_storeu_epi16(mem_addr: *mut i16, a: __m128i) { - ptr::write_unaligned(mem_addr as *mut __m128i, a); +pub unsafe fn _mm256_reduce_and_epi16(a: __m256i) -> i16 { + simd_reduce_and(a.as_i16x16()) } -/// Store 512-bits (composed of 64 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// Reduce the packed 16-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi8&expand=5640) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_and_epi16) #[inline] -#[target_feature(enable = "avx512bw")] +#[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 -pub unsafe fn _mm512_storeu_epi8(mem_addr: *mut i8, a: __m512i) { - ptr::write_unaligned(mem_addr as *mut __m512i, a); +pub unsafe fn _mm256_mask_reduce_and_epi16(k: __mmask16, a: __m256i) -> i16 { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i16x16(), + _mm256_set1_epi64x(-1).as_i16x16(), + )) } -/// Store 256-bits (composed of 32 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// Reduce the packed 16-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. 
/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi8&expand=5638) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_and_epi16) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 -pub unsafe fn _mm256_storeu_epi8(mem_addr: *mut i8, a: __m256i) { - ptr::write_unaligned(mem_addr as *mut __m256i, a); +pub unsafe fn _mm_reduce_and_epi16(a: __m128i) -> i16 { + simd_reduce_and(a.as_i16x8()) } -/// Store 128-bits (composed of 16 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// Reduce the packed 16-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi8&expand=5636) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_and_epi16) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 -pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) { - ptr::write_unaligned(mem_addr as *mut __m128i, a); +pub unsafe fn _mm_mask_reduce_and_epi16(k: __mmask8, a: __m128i) -> i16 { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i16x8(), + _mm_set1_epi64x(-1).as_i16x8(), + )) +} + +/// Reduce the packed 8-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_and_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_reduce_and_epi8(a: __m256i) -> i8 { + simd_reduce_and(a.as_i8x32()) +} + +/// Reduce the packed 8-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_and_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_reduce_and_epi8(k: __mmask32, a: __m256i) -> i8 { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i8x32(), + _mm256_set1_epi64x(-1).as_i8x32(), + )) +} + +/// Reduce the packed 8-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_and_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_reduce_and_epi8(a: __m128i) -> i8 { + simd_reduce_and(a.as_i8x16()) +} + +/// Reduce the packed 8-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_and_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_reduce_and_epi8(k: __mmask16, a: __m128i) -> i8 { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i8x16(), + _mm_set1_epi64x(-1).as_i8x16(), + )) +} + +/// Reduce the packed 16-bit integers in a by maximum. 
Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_reduce_max_epi16(a: __m256i) -> i16 { + simd_reduce_max(a.as_i16x16()) +} + +/// Reduce the packed 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_reduce_max_epi16(k: __mmask16, a: __m256i) -> i16 { + simd_reduce_max(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(-32768))) +} + +/// Reduce the packed 16-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_reduce_max_epi16(a: __m128i) -> i16 { + simd_reduce_max(a.as_i16x8()) +} + +/// Reduce the packed 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_reduce_max_epi16(k: __mmask8, a: __m128i) -> i16 { + simd_reduce_max(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(-32768))) +} + +/// Reduce the packed 8-bit integers in a by maximum. 
Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_reduce_max_epi8(a: __m256i) -> i8 { + simd_reduce_max(a.as_i8x32()) +} + +/// Reduce the packed 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_reduce_max_epi8(k: __mmask32, a: __m256i) -> i8 { + simd_reduce_max(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(-128))) +} + +/// Reduce the packed 8-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_reduce_max_epi8(a: __m128i) -> i8 { + simd_reduce_max(a.as_i8x16()) +} + +/// Reduce the packed 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_reduce_max_epi8(k: __mmask16, a: __m128i) -> i8 { + simd_reduce_max(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(-128))) +} + +/// Reduce the packed unsigned 16-bit integers in a by maximum. 
Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_reduce_max_epu16(a: __m256i) -> u16 { + simd_reduce_max(a.as_u16x16()) +} + +/// Reduce the packed unsigned 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_reduce_max_epu16(k: __mmask16, a: __m256i) -> u16 { + simd_reduce_max(simd_select_bitmask(k, a.as_u16x16(), u16x16::splat(0))) +} + +/// Reduce the packed unsigned 16-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_reduce_max_epu16(a: __m128i) -> u16 { + simd_reduce_max(a.as_u16x8()) +} + +/// Reduce the packed unsigned 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_reduce_max_epu16(k: __mmask8, a: __m128i) -> u16 { + simd_reduce_max(simd_select_bitmask(k, a.as_u16x8(), u16x8::splat(0))) +} + +/// Reduce the packed unsigned 8-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_max_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_reduce_max_epu8(a: __m256i) -> u8 { + simd_reduce_max(a.as_u8x32()) +} + +/// Reduce the packed unsigned 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_max_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_reduce_max_epu8(k: __mmask32, a: __m256i) -> u8 { + simd_reduce_max(simd_select_bitmask(k, a.as_u8x32(), u8x32::splat(0))) +} + +/// Reduce the packed unsigned 8-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_max_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_reduce_max_epu8(a: __m128i) -> u8 { + simd_reduce_max(a.as_u8x16()) +} + +/// Reduce the packed unsigned 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_max_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_reduce_max_epu8(k: __mmask16, a: __m128i) -> u8 { + simd_reduce_max(simd_select_bitmask(k, a.as_u8x16(), u8x16::splat(0))) +} + +/// Reduce the packed 16-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_reduce_min_epi16(a: __m256i) -> i16 { + simd_reduce_min(a.as_i16x16()) +} + +/// Reduce the packed 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_reduce_min_epi16(k: __mmask16, a: __m256i) -> i16 { + simd_reduce_min(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(0x7fff))) +} + +/// Reduce the packed 16-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_reduce_min_epi16(a: __m128i) -> i16 { + simd_reduce_min(a.as_i16x8()) +} + +/// Reduce the packed 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_reduce_min_epi16(k: __mmask8, a: __m128i) -> i16 { + simd_reduce_min(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(0x7fff))) +} + +/// Reduce the packed 8-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_reduce_min_epi8(a: __m256i) -> i8 { + simd_reduce_min(a.as_i8x32()) +} + +/// Reduce the packed 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_reduce_min_epi8(k: __mmask32, a: __m256i) -> i8 { + simd_reduce_min(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(0x7f))) +} + +/// Reduce the packed 8-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_reduce_min_epi8(a: __m128i) -> i8 { + simd_reduce_min(a.as_i8x16()) +} + +/// Reduce the packed 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_reduce_min_epi8(k: __mmask16, a: __m128i) -> i8 { + simd_reduce_min(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(0x7f))) +} + +/// Reduce the packed unsigned 16-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_reduce_min_epu16(a: __m256i) -> u16 { + simd_reduce_min(a.as_u16x16()) +} + +/// Reduce the packed unsigned 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_reduce_min_epu16(k: __mmask16, a: __m256i) -> u16 { + simd_reduce_min(simd_select_bitmask(k, a.as_u16x16(), u16x16::splat(0xffff))) +} + +/// Reduce the packed unsigned 16-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_reduce_min_epu16(a: __m128i) -> u16 { + simd_reduce_min(a.as_u16x8()) +} + +/// Reduce the packed unsigned 16-bit integers in a by minimum using mask k. 
Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epu16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_reduce_min_epu16(k: __mmask8, a: __m128i) -> u16 { + simd_reduce_min(simd_select_bitmask(k, a.as_u16x8(), u16x8::splat(0xffff))) +} + +/// Reduce the packed unsigned 8-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_min_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_reduce_min_epu8(a: __m256i) -> u8 { + simd_reduce_min(a.as_u8x32()) +} + +/// Reduce the packed unsigned 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_min_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_reduce_min_epu8(k: __mmask32, a: __m256i) -> u8 { + simd_reduce_min(simd_select_bitmask(k, a.as_u8x32(), u8x32::splat(0xff))) +} + +/// Reduce the packed unsigned 8-bit integers in a by minimum. Returns the minimum of all elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_min_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_reduce_min_epu8(a: __m128i) -> u8 { + simd_reduce_min(a.as_u8x16()) +} + +/// Reduce the packed unsigned 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_min_epu8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_reduce_min_epu8(k: __mmask16, a: __m128i) -> u8 { + simd_reduce_min(simd_select_bitmask(k, a.as_u8x16(), u8x16::splat(0xff))) +} + +/// Reduce the packed 16-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_mul_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_reduce_mul_epi16(a: __m256i) -> i16 { + simd_reduce_mul_unordered(a.as_i16x16()) +} + +/// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_mul_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_reduce_mul_epi16(k: __mmask16, a: __m256i) -> i16 { + simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(1))) +} + +/// Reduce the packed 16-bit integers in a by multiplication. 
Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_mul_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_reduce_mul_epi16(a: __m128i) -> i16 { + simd_reduce_mul_unordered(a.as_i16x8()) +} + +/// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_mul_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_reduce_mul_epi16(k: __mmask8, a: __m128i) -> i16 { + simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(1))) +} + +/// Reduce the packed 8-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_mul_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_reduce_mul_epi8(a: __m256i) -> i8 { + simd_reduce_mul_unordered(a.as_i8x32()) +} + +/// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_mul_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_reduce_mul_epi8(k: __mmask32, a: __m256i) -> i8 { + simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(1))) +} + +/// Reduce the packed 8-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_mul_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_reduce_mul_epi8(a: __m128i) -> i8 { + simd_reduce_mul_unordered(a.as_i8x16()) +} + +/// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_mul_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_reduce_mul_epi8(k: __mmask16, a: __m128i) -> i8 { + simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(1))) +} + +/// Reduce the packed 16-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_or_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_reduce_or_epi16(a: __m256i) -> i16 { + simd_reduce_or(a.as_i16x16()) +} + +/// Reduce the packed 16-bit integers in a by bitwise OR using mask k. 
Returns the bitwise OR of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_or_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_reduce_or_epi16(k: __mmask16, a: __m256i) -> i16 { + simd_reduce_or(simd_select_bitmask( + k, + a.as_i16x16(), + _mm256_setzero_si256().as_i16x16(), + )) +} + +/// Reduce the packed 16-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_or_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_reduce_or_epi16(a: __m128i) -> i16 { + simd_reduce_or(a.as_i16x8()) +} + +/// Reduce the packed 16-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_or_epi16) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_reduce_or_epi16(k: __mmask8, a: __m128i) -> i16 { + simd_reduce_or(simd_select_bitmask( + k, + a.as_i16x8(), + _mm_setzero_si128().as_i16x8(), + )) +} + +/// Reduce the packed 8-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_reduce_or_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_reduce_or_epi8(a: __m256i) -> i8 { + simd_reduce_or(a.as_i8x32()) +} + +/// Reduce the packed 8-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_reduce_or_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_mask_reduce_or_epi8(k: __mmask32, a: __m256i) -> i8 { + simd_reduce_or(simd_select_bitmask( + k, + a.as_i8x32(), + _mm256_setzero_si256().as_i8x32(), + )) +} + +/// Reduce the packed 8-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_reduce_or_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_reduce_or_epi8(a: __m128i) -> i8 { + simd_reduce_or(a.as_i8x16()) +} + +/// Reduce the packed 8-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_reduce_or_epi8) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_mask_reduce_or_epi8(k: __mmask16, a: __m128i) -> i8 { + simd_reduce_or(simd_select_bitmask( + k, + a.as_i8x16(), + _mm_setzero_si128().as_i8x16(), + )) +} + +/// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi16&expand=3368) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm512_loadu_epi16(mem_addr: *const i16) -> __m512i { + ptr::read_unaligned(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 16 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi16&expand=3365) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm256_loadu_epi16(mem_addr: *const i16) -> __m256i { + ptr::read_unaligned(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 8 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi16&expand=3362) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm_loadu_epi16(mem_addr: *const i16) -> __m128i { + ptr::read_unaligned(mem_addr as *const __m128i) +} + +/// Load 512-bits (composed of 64 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_loadu_epi8&expand=3395) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm512_loadu_epi8(mem_addr: *const i8) -> __m512i { + ptr::read_unaligned(mem_addr as *const __m512i) +} + +/// Load 256-bits (composed of 32 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_epi8&expand=3392) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm256_loadu_epi8(mem_addr: *const i8) -> __m256i { + ptr::read_unaligned(mem_addr as *const __m256i) +} + +/// Load 128-bits (composed of 16 packed 8-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_epi8&expand=3389) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm_loadu_epi8(mem_addr: *const i8) -> __m128i { + ptr::read_unaligned(mem_addr as *const __m128i) +} + +/// Store 512-bits (composed of 32 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi16&expand=5622) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm512_storeu_epi16(mem_addr: *mut i16, a: __m512i) { + ptr::write_unaligned(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 16 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi16&expand=5620) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm256_storeu_epi16(mem_addr: *mut i16, a: __m256i) { + ptr::write_unaligned(mem_addr as *mut __m256i, a); +} + +/// Store 128-bits (composed of 8 packed 16-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi16&expand=5618) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu16 +pub unsafe fn _mm_storeu_epi16(mem_addr: *mut i16, a: __m128i) { + ptr::write_unaligned(mem_addr as *mut __m128i, a); +} + +/// Store 512-bits (composed of 64 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_storeu_epi8&expand=5640) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm512_storeu_epi8(mem_addr: *mut i8, a: __m512i) { + ptr::write_unaligned(mem_addr as *mut __m512i, a); +} + +/// Store 256-bits (composed of 32 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_epi8&expand=5638) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm256_storeu_epi8(mem_addr: *mut i8, a: __m256i) { + ptr::write_unaligned(mem_addr as *mut __m256i, a); +} + +/// Store 128-bits (composed of 16 packed 8-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_epi8&expand=5636) +#[inline] +#[target_feature(enable = "avx512bw,avx512vl")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu8 +pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) { + ptr::write_unaligned(mem_addr as *mut __m128i, a); } /// Load packed 16-bit integers from memory into dst using writemask k @@ -5505,7 +6329,10 @@ pub unsafe fn _mm_maskz_packus_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgw))] pub unsafe fn _mm512_avg_epu16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpavgw(a.as_u16x32(), b.as_u16x32())) + let a = simd_cast::<_, u32x32>(a.as_u16x32()); + let b = simd_cast::<_, u32x32>(b.as_u16x32()); + let r = simd_shr(simd_add(simd_add(a, b), u32x32::splat(1)), u32x32::splat(1)); + transmute(simd_cast::<_, u16x32>(r)) } /// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5591,7 +6418,10 @@ pub unsafe fn _mm_maskz_avg_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgb))] pub unsafe fn _mm512_avg_epu8(a: __m512i, b: __m512i) -> __m512i { - transmute(vpavgb(a.as_u8x64(), b.as_u8x64())) + let a = simd_cast::<_, u16x64>(a.as_u8x64()); + let b = simd_cast::<_, u16x64>(b.as_u8x64()); + let r = simd_shr(simd_add(simd_add(a, b), u16x64::splat(1)), u16x64::splat(1)); + transmute(simd_cast::<_, u8x64>(r)) } /// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -9221,6 +10051,26 @@ pub unsafe fn _mm_movm_epi8(k: __mmask16) -> __m128i {
     transmute(simd_select_bitmask(k, one, zero))
 }
 
+/// Convert 32-bit mask a into an integer value, and store the result in dst.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtmask32_u32)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _cvtmask32_u32(a: __mmask32) -> u32 {
+    a
+}
+
+/// Convert integer value a into a 32-bit mask, and store the result in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu32_mask32)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _cvtu32_mask32(a: u32) -> __mmask32 {
+    a
+}
+
 /// Add 32-bit masks in a and b, and store the result in k.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kadd_mask32&expand=3207)
@@ -9257,108 +10107,314 @@ pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
 #[inline]
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
-    a & b
+pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 {
+    a & b
+}
+
+/// Compute the bitwise NOT of 32-bit mask a, and store the result in k.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask32&expand=3234)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 {
+    !a
+}
+
+/// Compute the bitwise NOT of 64-bit mask a, and store the result in k.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask64&expand=3235) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 { + !a +} + +/// Compute the bitwise NOT of 32-bit masks a and then AND with b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask32&expand=3219) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + _knot_mask32(a) & b +} + +/// Compute the bitwise NOT of 64-bit masks a and then AND with b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask64&expand=3220) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + _knot_mask64(a) & b +} + +/// Compute the bitwise OR of 32-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask32&expand=3240) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + a | b +} + +/// Compute the bitwise OR of 64-bit masks a and b, and store the result in k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask64&expand=3241) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + a | b +} + +/// Compute the bitwise XOR of 32-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask32&expand=3292) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + a ^ b +} + +/// Compute the bitwise XOR of 64-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask64&expand=3293) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + a ^ b +} + +/// Compute the bitwise XNOR of 32-bit masks a and b, and store the result in k. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask32&expand=3286) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { + _knot_mask32(a ^ b) +} + +/// Compute the bitwise XNOR of 64-bit masks a and b, and store the result in k. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask64&expand=3287) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { + _knot_mask64(a ^ b) +} + +/// Compute the bitwise OR of 32-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask32_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kortest_mask32_u8(a: __mmask32, b: __mmask32, all_ones: *mut u8) -> u8 { + let tmp = _kor_mask32(a, b); + *all_ones = (tmp == 0xffffffff) as u8; + (tmp == 0) as u8 +} + +/// Compute the bitwise OR of 64-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. If the result is all ones, store 1 in all_ones, otherwise store 0 in all_ones. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortest_mask64_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kortest_mask64_u8(a: __mmask64, b: __mmask64, all_ones: *mut u8) -> u8 { + let tmp = _kor_mask64(a, b); + *all_ones = (tmp == 0xffffffff_ffffffff) as u8; + (tmp == 0) as u8 +} + +/// Compute the bitwise OR of 32-bit masks a and b. If the result is all ones, store 1 in dst, otherwise +/// store 0 in dst. 
+/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask32_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kortestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { + (_kor_mask32(a, b) == 0xffffffff) as u8 +} + +/// Compute the bitwise OR of 64-bit masks a and b. If the result is all ones, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestc_mask64_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kortestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { + (_kor_mask64(a, b) == 0xffffffff_ffffffff) as u8 +} + +/// Compute the bitwise OR of 32-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask32_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kortestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { + (_kor_mask32(a, b) == 0) as u8 +} + +/// Compute the bitwise OR of 64-bit masks a and b. If the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kortestz_mask64_u8) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _kortestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { + (_kor_mask64(a, b) == 0) as u8 +} + +/// Shift the bits of 32-bit mask a left by count while shifting in zeros, and store the least significant 32 bits of the result in k. 
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask32)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kshiftli_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
+    a.checked_shl(COUNT).unwrap_or(0) // a count of 32 or more clears every bit instead of overflowing the shift
+}
+
+/// Shift the bits of 64-bit mask a left by count while shifting in zeros, and store the least significant 64 bits of the result in k.
+///
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftli_mask64)
+#[inline]
+#[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _kshiftli_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
+    a.checked_shl(COUNT).unwrap_or(0) // a count of 64 or more clears every bit instead of overflowing the shift
 }
 
-/// Compute the bitwise NOT of 32-bit mask a, and store the result in k.
+/// Shift the bits of 32-bit mask a right by count while shifting in zeros, and store the least significant 32 bits of the result in k.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask32&expand=3234)
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask32)
 #[inline]
 #[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 {
-    a ^ 0b11111111_11111111_11111111_11111111
+pub unsafe fn _kshiftri_mask32<const COUNT: u32>(a: __mmask32) -> __mmask32 {
+    a.checked_shr(COUNT).unwrap_or(0) // a count of 32 or more clears every bit instead of overflowing the shift
 }
 
-/// Compute the bitwise NOT of 64-bit mask a, and store the result in k.
+/// Shift the bits of 64-bit mask a right by count while shifting in zeros, and store the least significant 64 bits of the result in k.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_knot_mask64&expand=3235)
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kshiftri_mask64)
 #[inline]
 #[target_feature(enable = "avx512bw")]
+#[rustc_legacy_const_generics(1)]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 {
-    a ^ 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111
+pub unsafe fn _kshiftri_mask64<const COUNT: u32>(a: __mmask64) -> __mmask64 {
+    a.checked_shr(COUNT).unwrap_or(0) // a count of 64 or more clears every bit instead of overflowing the shift
 }
 
-/// Compute the bitwise NOT of 32-bit masks a and then AND with b, and store the result in k.
+/// Compute the bitwise AND of 32-bit masks a and b, and if the result is all zeros, store 1 in dst,
+/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
+/// zeros, store 1 in and_not, otherwise store 0 in and_not.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask32&expand=3219)
+/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask32_u8)
 #[inline]
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
-pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 {
-    _knot_mask32(a) & b
+pub unsafe fn _ktest_mask32_u8(a: __mmask32, b: __mmask32, and_not: *mut u8) -> u8 {
+    *and_not = (_kandn_mask32(a, b) == 0) as u8;
+    (_kand_mask32(a, b) == 0) as u8
 }
 
-/// Compute the bitwise NOT of 64-bit masks a and then AND with b, and store the result in k.
+/// Compute the bitwise AND of 64-bit masks a and b, and if the result is all zeros, store 1 in dst,
+/// otherwise store 0 in dst. Compute the bitwise NOT of a and then AND with b, if the result is all
+/// zeros, store 1 in and_not, otherwise store 0 in and_not.
/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kandn_mask64&expand=3220) +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktest_mask64_u8) #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - _knot_mask64(a) & b +pub unsafe fn _ktest_mask64_u8(a: __mmask64, b: __mmask64, and_not: *mut u8) -> u8 { + *and_not = (_kandn_mask64(a, b) == 0) as u8; + (_kand_mask64(a, b) == 0) as u8 } -/// Compute the bitwise OR of 32-bit masks a and b, and store the result in k. +/// Compute the bitwise NOT of 32-bit mask a and then AND with 32-bit mask b, if the result is all +/// zeros, store 1 in dst, otherwise store 0 in dst. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask32&expand=3240) +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask32_u8) #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - a | b +pub unsafe fn _ktestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { + (_kandn_mask32(a, b) == 0) as u8 } -/// Compute the bitwise OR of 64-bit masks a and b, and store the result in k. +/// Compute the bitwise NOT of 64-bit mask a and then AND with 64-bit mask b, if the result is all +/// zeros, store 1 in dst, otherwise store 0 in dst. 
/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kor_mask64&expand=3241) +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestc_mask64_u8) #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - a | b +pub unsafe fn _ktestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { + (_kandn_mask64(a, b) == 0) as u8 } -/// Compute the bitwise XOR of 32-bit masks a and b, and store the result in k. +/// Compute the bitwise AND of 32-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask32&expand=3292) +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask32_u8) #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - a ^ b +pub unsafe fn _ktestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { + (_kand_mask32(a, b) == 0) as u8 } -/// Compute the bitwise XOR of 64-bit masks a and b, and store the result in k. +/// Compute the bitwise AND of 64-bit masks a and b, if the result is all zeros, store 1 in dst, otherwise +/// store 0 in dst. 
/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxor_mask64&expand=3293) +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_ktestz_mask64_u8) #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - a ^ b +pub unsafe fn _ktestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { + (_kand_mask64(a, b) == 0) as u8 } -/// Compute the bitwise XNOR of 32-bit masks a and b, and store the result in k. +/// Unpack and interleave 16 bits from masks a and b, and store the 32-bit result in k. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask32&expand=3286) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=512_kunpackw) #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { - _knot_mask32(a ^ b) +#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckwd +pub unsafe fn _mm512_kunpackw(a: __mmask32, b: __mmask32) -> __mmask32 { + ((a & 0xffff) << 16) | (b & 0xffff) } -/// Compute the bitwise XNOR of 64-bit masks a and b, and store the result in k. +/// Unpack and interleave 32 bits from masks a and b, and store the 64-bit result in k. 
/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_kxnor_mask64&expand=3287) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=512_kunpackd) #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { - _knot_mask64(a ^ b) +#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckdq +pub unsafe fn _mm512_kunpackd(a: __mmask64, b: __mmask64) -> __mmask64 { + ((a & 0xffffffff) << 32) | (b & 0xffffffff) } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. @@ -10589,115 +11645,9 @@ pub unsafe fn _mm_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.avx512.mask.paddus.w.512"] - fn vpaddusw(a: u16x32, b: u16x32, src: u16x32, mask: u32) -> u16x32; - #[link_name = "llvm.x86.avx512.mask.paddus.w.256"] - fn vpaddusw256(a: u16x16, b: u16x16, src: u16x16, mask: u16) -> u16x16; - #[link_name = "llvm.x86.avx512.mask.paddus.w.128"] - fn vpaddusw128(a: u16x8, b: u16x8, src: u16x8, mask: u8) -> u16x8; - - #[link_name = "llvm.x86.avx512.mask.paddus.b.512"] - fn vpaddusb(a: u8x64, b: u8x64, src: u8x64, mask: u64) -> u8x64; - #[link_name = "llvm.x86.avx512.mask.paddus.b.256"] - fn vpaddusb256(a: u8x32, b: u8x32, src: u8x32, mask: u32) -> u8x32; - #[link_name = "llvm.x86.avx512.mask.paddus.b.128"] - fn vpaddusb128(a: u8x16, b: u8x16, src: u8x16, mask: u16) -> u8x16; - - #[link_name = "llvm.x86.avx512.mask.padds.w.512"] - fn vpaddsw(a: i16x32, b: i16x32, src: i16x32, mask: u32) -> i16x32; - #[link_name = "llvm.x86.avx512.mask.padds.w.256"] - fn vpaddsw256(a: i16x16, b: i16x16, src: i16x16, mask: u16) -> i16x16; - #[link_name = "llvm.x86.avx512.mask.padds.w.128"] - fn vpaddsw128(a: i16x8, 
b: i16x8, src: i16x8, mask: u8) -> i16x8; - - #[link_name = "llvm.x86.avx512.mask.padds.b.512"] - fn vpaddsb(a: i8x64, b: i8x64, src: i8x64, mask: u64) -> i8x64; - #[link_name = "llvm.x86.avx512.mask.padds.b.256"] - fn vpaddsb256(a: i8x32, b: i8x32, src: i8x32, mask: u32) -> i8x32; - #[link_name = "llvm.x86.avx512.mask.padds.b.128"] - fn vpaddsb128(a: i8x16, b: i8x16, src: i8x16, mask: u16) -> i8x16; - - #[link_name = "llvm.x86.avx512.mask.psubus.w.512"] - fn vpsubusw(a: u16x32, b: u16x32, src: u16x32, mask: u32) -> u16x32; - #[link_name = "llvm.x86.avx512.mask.psubus.w.256"] - fn vpsubusw256(a: u16x16, b: u16x16, src: u16x16, mask: u16) -> u16x16; - #[link_name = "llvm.x86.avx512.mask.psubus.w.128"] - fn vpsubusw128(a: u16x8, b: u16x8, src: u16x8, mask: u8) -> u16x8; - - #[link_name = "llvm.x86.avx512.mask.psubus.b.512"] - fn vpsubusb(a: u8x64, b: u8x64, src: u8x64, mask: u64) -> u8x64; - #[link_name = "llvm.x86.avx512.mask.psubus.b.256"] - fn vpsubusb256(a: u8x32, b: u8x32, src: u8x32, mask: u32) -> u8x32; - #[link_name = "llvm.x86.avx512.mask.psubus.b.128"] - fn vpsubusb128(a: u8x16, b: u8x16, src: u8x16, mask: u16) -> u8x16; - - #[link_name = "llvm.x86.avx512.mask.psubs.w.512"] - fn vpsubsw(a: i16x32, b: i16x32, src: i16x32, mask: u32) -> i16x32; - #[link_name = "llvm.x86.avx512.mask.psubs.w.256"] - fn vpsubsw256(a: i16x16, b: i16x16, src: i16x16, mask: u16) -> i16x16; - #[link_name = "llvm.x86.avx512.mask.psubs.w.128"] - fn vpsubsw128(a: i16x8, b: i16x8, src: i16x8, mask: u8) -> i16x8; - - #[link_name = "llvm.x86.avx512.mask.psubs.b.512"] - fn vpsubsb(a: i8x64, b: i8x64, src: i8x64, mask: u64) -> i8x64; - #[link_name = "llvm.x86.avx512.mask.psubs.b.256"] - fn vpsubsb256(a: i8x32, b: i8x32, src: i8x32, mask: u32) -> i8x32; - #[link_name = "llvm.x86.avx512.mask.psubs.b.128"] - fn vpsubsb128(a: i8x16, b: i8x16, src: i8x16, mask: u16) -> i8x16; - - #[link_name = "llvm.x86.avx512.pmulhu.w.512"] - fn vpmulhuw(a: u16x32, b: u16x32) -> u16x32; - #[link_name = 
"llvm.x86.avx512.pmulh.w.512"] - fn vpmulhw(a: i16x32, b: i16x32) -> i16x32; #[link_name = "llvm.x86.avx512.pmul.hr.sw.512"] fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.mask.ucmp.w.512"] - fn vpcmpuw(a: u16x32, b: u16x32, op: i32, mask: u32) -> u32; - #[link_name = "llvm.x86.avx512.mask.ucmp.w.256"] - fn vpcmpuw256(a: u16x16, b: u16x16, op: i32, mask: u16) -> u16; - #[link_name = "llvm.x86.avx512.mask.ucmp.w.128"] - fn vpcmpuw128(a: u16x8, b: u16x8, op: i32, mask: u8) -> u8; - - #[link_name = "llvm.x86.avx512.mask.ucmp.b.512"] - fn vpcmpub(a: u8x64, b: u8x64, op: i32, mask: u64) -> u64; - #[link_name = "llvm.x86.avx512.mask.ucmp.b.256"] - fn vpcmpub256(a: u8x32, b: u8x32, op: i32, mask: u32) -> u32; - #[link_name = "llvm.x86.avx512.mask.ucmp.b.128"] - fn vpcmpub128(a: u8x16, b: u8x16, op: i32, mask: u16) -> u16; - - #[link_name = "llvm.x86.avx512.mask.cmp.w.512"] - fn vpcmpw(a: i16x32, b: i16x32, op: i32, mask: u32) -> u32; - #[link_name = "llvm.x86.avx512.mask.cmp.w.256"] - fn vpcmpw256(a: i16x16, b: i16x16, op: i32, mask: u16) -> u16; - #[link_name = "llvm.x86.avx512.mask.cmp.w.128"] - fn vpcmpw128(a: i16x8, b: i16x8, op: i32, mask: u8) -> u8; - - #[link_name = "llvm.x86.avx512.mask.cmp.b.512"] - fn vpcmpb(a: i8x64, b: i8x64, op: i32, mask: u64) -> u64; - #[link_name = "llvm.x86.avx512.mask.cmp.b.256"] - fn vpcmpb256(a: i8x32, b: i8x32, op: i32, mask: u32) -> u32; - #[link_name = "llvm.x86.avx512.mask.cmp.b.128"] - fn vpcmpb128(a: i8x16, b: i8x16, op: i32, mask: u16) -> u16; - - #[link_name = "llvm.x86.avx512.mask.pmaxu.w.512"] - fn vpmaxuw(a: u16x32, b: u16x32) -> u16x32; - #[link_name = "llvm.x86.avx512.mask.pmaxu.b.512"] - fn vpmaxub(a: u8x64, b: u8x64) -> u8x64; - #[link_name = "llvm.x86.avx512.mask.pmaxs.w.512"] - fn vpmaxsw(a: i16x32, b: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.mask.pmaxs.b.512"] - fn vpmaxsb(a: i8x64, b: i8x64) -> i8x64; - - #[link_name = "llvm.x86.avx512.mask.pminu.w.512"] - fn vpminuw(a: 
u16x32, b: u16x32) -> u16x32; - #[link_name = "llvm.x86.avx512.mask.pminu.b.512"] - fn vpminub(a: u8x64, b: u8x64) -> u8x64; - #[link_name = "llvm.x86.avx512.mask.pmins.w.512"] - fn vpminsw(a: i16x32, b: i16x32) -> i16x32; - #[link_name = "llvm.x86.avx512.mask.pmins.b.512"] - fn vpminsb(a: i8x64, b: i8x64) -> i8x64; - #[link_name = "llvm.x86.avx512.pmaddw.d.512"] fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16; #[link_name = "llvm.x86.avx512.pmaddubs.w.512"] @@ -10712,11 +11662,6 @@ extern "C" { #[link_name = "llvm.x86.avx512.packuswb.512"] fn vpackuswb(a: i16x32, b: i16x32) -> u8x64; - #[link_name = "llvm.x86.avx512.pavg.w.512"] - fn vpavgw(a: u16x32, b: u16x32) -> u16x32; - #[link_name = "llvm.x86.avx512.pavg.b.512"] - fn vpavgb(a: u8x64, b: u8x64) -> u8x64; - #[link_name = "llvm.x86.avx512.psll.w.512"] fn vpsllw(a: i16x32, count: i16x8) -> i16x32; @@ -13754,11 +14699,227 @@ mod tests { } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmple_epu8_mask() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(-1); + unsafe fn test_mm512_mask_cmple_epu8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmple_epu8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmple_epu8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmple_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmple_epu8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmple_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable 
= "avx512bw,avx512vl")] + unsafe fn test_mm_cmple_epu8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let m = _mm_cmple_epu8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmple_epu8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmple_epu8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmple_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmple_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmple_epi16_mask() { + let a = _mm512_set1_epi16(-1); + let b = _mm512_set1_epi16(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmple_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmple_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmple_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmple_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmple_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmple_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let m = _mm_cmple_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmple_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = 
_mm_mask_cmple_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmple_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmple_epi8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmple_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmple_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmple_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmple_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmple_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmple_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmple_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let m = _mm_cmple_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmple_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmple_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpge_epu16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let m = 
_mm512_cmpge_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpge_epu16_mask() { + let a = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpge_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpge_epu16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmpge_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpge_epu16_mask() { + let a = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpge_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpge_epu16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let m = _mm_cmpge_epu16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpge_epu16_mask() { + let a = _mm_set1_epi16(1); + let b = _mm_set1_epi16(1); + let mask = 0b01010101; + let r = _mm_mask_cmpge_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpge_epu8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmpge_epu8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpge_epu8_mask() { + let a = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmple_epu8_mask(mask, a, b); + let r = 
_mm512_mask_cmpge_epu8_mask(mask, a, b); assert_eq!( r, 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 @@ -13766,95 +14927,95 @@ mod tests { } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmple_epu8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); - let m = _mm256_cmple_epu8_mask(a, b); + unsafe fn test_mm256_cmpge_epu8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmpge_epu8_mask(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmple_epu8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); + unsafe fn test_mm256_mask_cmpge_epu8_mask() { + let a = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmple_epu8_mask(mask, a, b); + let r = _mm256_mask_cmpge_epu8_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmple_epu8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); - let m = _mm_cmple_epu8_mask(a, b); + unsafe fn test_mm_cmpge_epu8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); + let m = _mm_cmpge_epu8_mask(a, b); assert_eq!(m, 0b11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmple_epu8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); + unsafe fn test_mm_mask_cmpge_epu8_mask() { + let a = _mm_set1_epi8(1); + let b = _mm_set1_epi8(1); let mask = 0b01010101_01010101; - let r = _mm_mask_cmple_epu8_mask(mask, a, b); + let r = _mm_mask_cmpge_epu8_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmple_epi16_mask() { + unsafe fn test_mm512_cmpge_epi16_mask() { let a = _mm512_set1_epi16(-1); let b = _mm512_set1_epi16(-1); - let m = 
_mm512_cmple_epi16_mask(a, b); + let m = _mm512_cmpge_epi16_mask(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmple_epi16_mask() { + unsafe fn test_mm512_mask_cmpge_epi16_mask() { let a = _mm512_set1_epi16(-1); let b = _mm512_set1_epi16(-1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmple_epi16_mask(mask, a, b); + let r = _mm512_mask_cmpge_epi16_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmple_epi16_mask() { + unsafe fn test_mm256_cmpge_epi16_mask() { let a = _mm256_set1_epi16(-1); let b = _mm256_set1_epi16(-1); - let m = _mm256_cmple_epi16_mask(a, b); + let m = _mm256_cmpge_epi16_mask(a, b); assert_eq!(m, 0b11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmple_epi16_mask() { + unsafe fn test_mm256_mask_cmpge_epi16_mask() { let a = _mm256_set1_epi16(-1); let b = _mm256_set1_epi16(-1); let mask = 0b01010101_01010101; - let r = _mm256_mask_cmple_epi16_mask(mask, a, b); + let r = _mm256_mask_cmpge_epi16_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmple_epi16_mask() { + unsafe fn test_mm_cmpge_epi16_mask() { let a = _mm_set1_epi16(-1); let b = _mm_set1_epi16(-1); - let m = _mm_cmple_epi16_mask(a, b); + let m = _mm_cmpge_epi16_mask(a, b); assert_eq!(m, 0b11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmple_epi16_mask() { + unsafe fn test_mm_mask_cmpge_epi16_mask() { let a = _mm_set1_epi16(-1); let b = _mm_set1_epi16(-1); let mask = 0b01010101; - let r = _mm_mask_cmple_epi16_mask(mask, a, b); + let r = _mm_mask_cmpge_epi16_mask(mask, a, b); assert_eq!(r, 0b01010101); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmple_epi8_mask() { + unsafe fn test_mm512_cmpge_epi8_mask() { let a = _mm512_set1_epi8(-1); 
let b = _mm512_set1_epi8(-1); - let m = _mm512_cmple_epi8_mask(a, b); + let m = _mm512_cmpge_epi8_mask(a, b); assert_eq!( m, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 @@ -13862,11 +15023,11 @@ mod tests { } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmple_epi8_mask() { + unsafe fn test_mm512_mask_cmpge_epi8_mask() { let a = _mm512_set1_epi8(-1); let b = _mm512_set1_epi8(-1); let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmple_epi8_mask(mask, a, b); + let r = _mm512_mask_cmpge_epi8_mask(mask, a, b); assert_eq!( r, 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 @@ -13874,95 +15035,95 @@ mod tests { } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmple_epi8_mask() { + unsafe fn test_mm256_cmpge_epi8_mask() { let a = _mm256_set1_epi8(-1); let b = _mm256_set1_epi8(-1); - let m = _mm256_cmple_epi8_mask(a, b); + let m = _mm256_cmpge_epi8_mask(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmple_epi8_mask() { + unsafe fn test_mm256_mask_cmpge_epi8_mask() { let a = _mm256_set1_epi8(-1); let b = _mm256_set1_epi8(-1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmple_epi8_mask(mask, a, b); + let r = _mm256_mask_cmpge_epi8_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmple_epi8_mask() { + unsafe fn test_mm_cmpge_epi8_mask() { let a = _mm_set1_epi8(-1); let b = _mm_set1_epi8(-1); - let m = _mm_cmple_epi8_mask(a, b); + let m = _mm_cmpge_epi8_mask(a, b); assert_eq!(m, 0b11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmple_epi8_mask() { + unsafe fn test_mm_mask_cmpge_epi8_mask() { let a = _mm_set1_epi8(-1); let b = _mm_set1_epi8(-1); let mask = 0b01010101_01010101; - let r = 
_mm_mask_cmple_epi8_mask(mask, a, b); + let r = _mm_mask_cmpge_epi8_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpge_epu16_mask() { + unsafe fn test_mm512_cmpeq_epu16_mask() { let a = _mm512_set1_epi16(1); let b = _mm512_set1_epi16(1); - let m = _mm512_cmpge_epu16_mask(a, b); + let m = _mm512_cmpeq_epu16_mask(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpge_epu16_mask() { + unsafe fn test_mm512_mask_cmpeq_epu16_mask() { let a = _mm512_set1_epi16(1); let b = _mm512_set1_epi16(1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpge_epu16_mask(mask, a, b); + let r = _mm512_mask_cmpeq_epu16_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpge_epu16_mask() { + unsafe fn test_mm256_cmpeq_epu16_mask() { let a = _mm256_set1_epi16(1); let b = _mm256_set1_epi16(1); - let m = _mm256_cmpge_epu16_mask(a, b); + let m = _mm256_cmpeq_epu16_mask(a, b); assert_eq!(m, 0b11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epu16_mask() { + unsafe fn test_mm256_mask_cmpeq_epu16_mask() { let a = _mm256_set1_epi16(1); let b = _mm256_set1_epi16(1); let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpge_epu16_mask(mask, a, b); + let r = _mm256_mask_cmpeq_epu16_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpge_epu16_mask() { + unsafe fn test_mm_cmpeq_epu16_mask() { let a = _mm_set1_epi16(1); let b = _mm_set1_epi16(1); - let m = _mm_cmpge_epu16_mask(a, b); + let m = _mm_cmpeq_epu16_mask(a, b); assert_eq!(m, 0b11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpge_epu16_mask() { + unsafe fn test_mm_mask_cmpeq_epu16_mask() { let a = _mm_set1_epi16(1); let b = 
_mm_set1_epi16(1); let mask = 0b01010101; - let r = _mm_mask_cmpge_epu16_mask(mask, a, b); + let r = _mm_mask_cmpeq_epu16_mask(mask, a, b); assert_eq!(r, 0b01010101); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpge_epu8_mask() { + unsafe fn test_mm512_cmpeq_epu8_mask() { let a = _mm512_set1_epi8(1); let b = _mm512_set1_epi8(1); - let m = _mm512_cmpge_epu8_mask(a, b); + let m = _mm512_cmpeq_epu8_mask(a, b); assert_eq!( m, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 @@ -13970,11 +15131,11 @@ mod tests { } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpge_epu8_mask() { + unsafe fn test_mm512_mask_cmpeq_epu8_mask() { let a = _mm512_set1_epi8(1); let b = _mm512_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpge_epu8_mask(mask, a, b); + let r = _mm512_mask_cmpeq_epu8_mask(mask, a, b); assert_eq!( r, 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 @@ -13982,95 +15143,203 @@ mod tests { } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpge_epu8_mask() { + unsafe fn test_mm256_cmpeq_epu8_mask() { let a = _mm256_set1_epi8(1); let b = _mm256_set1_epi8(1); - let m = _mm256_cmpge_epu8_mask(a, b); + let m = _mm256_cmpeq_epu8_mask(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epu8_mask() { + unsafe fn test_mm256_mask_cmpeq_epu8_mask() { let a = _mm256_set1_epi8(1); let b = _mm256_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpge_epu8_mask(mask, a, b); + let r = _mm256_mask_cmpeq_epu8_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpge_epu8_mask() { + unsafe fn test_mm_cmpeq_epu8_mask() { let a = _mm_set1_epi8(1); let b = _mm_set1_epi8(1); - let m = _mm_cmpge_epu8_mask(a, 
b); + let m = _mm_cmpeq_epu8_mask(a, b); assert_eq!(m, 0b11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpge_epu8_mask() { + unsafe fn test_mm_mask_cmpeq_epu8_mask() { let a = _mm_set1_epi8(1); let b = _mm_set1_epi8(1); let mask = 0b01010101_01010101; - let r = _mm_mask_cmpge_epu8_mask(mask, a, b); + let r = _mm_mask_cmpeq_epu8_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpge_epi16_mask() { + unsafe fn test_mm512_cmpeq_epi16_mask() { let a = _mm512_set1_epi16(-1); let b = _mm512_set1_epi16(-1); - let m = _mm512_cmpge_epi16_mask(a, b); + let m = _mm512_cmpeq_epi16_mask(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpge_epi16_mask() { + unsafe fn test_mm512_mask_cmpeq_epi16_mask() { let a = _mm512_set1_epi16(-1); let b = _mm512_set1_epi16(-1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpge_epi16_mask(mask, a, b); + let r = _mm512_mask_cmpeq_epi16_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpge_epi16_mask() { + unsafe fn test_mm256_cmpeq_epi16_mask() { let a = _mm256_set1_epi16(-1); let b = _mm256_set1_epi16(-1); - let m = _mm256_cmpge_epi16_mask(a, b); + let m = _mm256_cmpeq_epi16_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epi16_mask() { + let a = _mm256_set1_epi16(-1); + let b = _mm256_set1_epi16(-1); + let mask = 0b01010101_01010101; + let r = _mm256_mask_cmpeq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpeq_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let m = _mm_cmpeq_epi16_mask(a, b); + assert_eq!(m, 0b11111111); + } + + 
#[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epi16_mask() { + let a = _mm_set1_epi16(-1); + let b = _mm_set1_epi16(-1); + let mask = 0b01010101; + let r = _mm_mask_cmpeq_epi16_mask(mask, a, b); + assert_eq!(r, 0b01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpeq_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmpeq_epi8_mask(a, b); + assert_eq!( + m, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + ); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpeq_epi8_mask() { + let a = _mm512_set1_epi8(-1); + let b = _mm512_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpeq_epi8_mask(mask, a, b); + assert_eq!( + r, + 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + ); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpeq_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmpeq_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_cmpeq_epi8_mask() { + let a = _mm256_set1_epi8(-1); + let b = _mm256_set1_epi8(-1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm256_mask_cmpeq_epi8_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_cmpeq_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let m = _mm_cmpeq_epi8_mask(a, b); + assert_eq!(m, 0b11111111_11111111); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_cmpeq_epi8_mask() { + let a = _mm_set1_epi8(-1); + let b = _mm_set1_epi8(-1); + let mask = 0b01010101_01010101; + let r = _mm_mask_cmpeq_epi8_mask(mask, a, b); + assert_eq!(r, 
0b01010101_01010101); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_cmpneq_epu16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmpneq_epu16_mask(a, b); + assert_eq!(m, 0b11111111_11111111_11111111_11111111); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_mask_cmpneq_epu16_mask() { + let a = _mm512_set1_epi16(2); + let b = _mm512_set1_epi16(1); + let mask = 0b01010101_01010101_01010101_01010101; + let r = _mm512_mask_cmpneq_epu16_mask(mask, a, b); + assert_eq!(r, 0b01010101_01010101_01010101_01010101); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_cmpneq_epu16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmpneq_epu16_mask(a, b); assert_eq!(m, 0b11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epi16_mask() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(-1); + unsafe fn test_mm256_mask_cmpneq_epu16_mask() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(1); let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpge_epi16_mask(mask, a, b); + let r = _mm256_mask_cmpneq_epu16_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpge_epi16_mask() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(-1); - let m = _mm_cmpge_epi16_mask(a, b); + unsafe fn test_mm_cmpneq_epu16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(1); + let m = _mm_cmpneq_epu16_mask(a, b); assert_eq!(m, 0b11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpge_epi16_mask() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(-1); + unsafe fn test_mm_mask_cmpneq_epu16_mask() { + let a = _mm_set1_epi16(2); + let b = _mm_set1_epi16(1); let mask = 0b01010101; - let r = _mm_mask_cmpge_epi16_mask(mask, a, b); + let r = _mm_mask_cmpneq_epu16_mask(mask, a, 
b); assert_eq!(r, 0b01010101); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpge_epi8_mask() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(-1); - let m = _mm512_cmpge_epi8_mask(a, b); + unsafe fn test_mm512_cmpneq_epu8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmpneq_epu8_mask(a, b); assert_eq!( m, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 @@ -14078,11 +15347,11 @@ mod tests { } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpge_epi8_mask() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(-1); + unsafe fn test_mm512_mask_cmpneq_epu8_mask() { + let a = _mm512_set1_epi8(2); + let b = _mm512_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpge_epi8_mask(mask, a, b); + let r = _mm512_mask_cmpneq_epu8_mask(mask, a, b); assert_eq!( r, 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 @@ -14090,95 +15359,95 @@ mod tests { } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpge_epi8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); - let m = _mm256_cmpge_epi8_mask(a, b); + unsafe fn test_mm256_cmpneq_epu8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmpneq_epu8_mask(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpge_epi8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); + unsafe fn test_mm256_mask_cmpneq_epu8_mask() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpge_epi8_mask(mask, a, b); + let r = _mm256_mask_cmpneq_epu8_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn 
test_mm_cmpge_epi8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); - let m = _mm_cmpge_epi8_mask(a, b); + unsafe fn test_mm_cmpneq_epu8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(1); + let m = _mm_cmpneq_epu8_mask(a, b); assert_eq!(m, 0b11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpge_epi8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); + unsafe fn test_mm_mask_cmpneq_epu8_mask() { + let a = _mm_set1_epi8(2); + let b = _mm_set1_epi8(1); let mask = 0b01010101_01010101; - let r = _mm_mask_cmpge_epi8_mask(mask, a, b); + let r = _mm_mask_cmpneq_epu8_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpeq_epu16_mask() { + unsafe fn test_mm512_cmpneq_epi16_mask() { let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); - let m = _mm512_cmpeq_epu16_mask(a, b); + let b = _mm512_set1_epi16(-1); + let m = _mm512_cmpneq_epi16_mask(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpeq_epu16_mask() { + unsafe fn test_mm512_mask_cmpneq_epi16_mask() { let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(1); + let b = _mm512_set1_epi16(-1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpeq_epu16_mask(mask, a, b); + let r = _mm512_mask_cmpneq_epi16_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpeq_epu16_mask() { + unsafe fn test_mm256_cmpneq_epi16_mask() { let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); - let m = _mm256_cmpeq_epu16_mask(a, b); + let b = _mm256_set1_epi16(-1); + let m = _mm256_cmpneq_epi16_mask(a, b); assert_eq!(m, 0b11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epu16_mask() { + unsafe fn test_mm256_mask_cmpneq_epi16_mask() { 
let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(1); + let b = _mm256_set1_epi16(-1); let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpeq_epu16_mask(mask, a, b); + let r = _mm256_mask_cmpneq_epi16_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpeq_epu16_mask() { + unsafe fn test_mm_cmpneq_epi16_mask() { let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); - let m = _mm_cmpeq_epu16_mask(a, b); + let b = _mm_set1_epi16(-1); + let m = _mm_cmpneq_epi16_mask(a, b); assert_eq!(m, 0b11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epu16_mask() { + unsafe fn test_mm_mask_cmpneq_epi16_mask() { let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(1); + let b = _mm_set1_epi16(-1); let mask = 0b01010101; - let r = _mm_mask_cmpeq_epu16_mask(mask, a, b); + let r = _mm_mask_cmpneq_epi16_mask(mask, a, b); assert_eq!(r, 0b01010101); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpeq_epu8_mask() { + unsafe fn test_mm512_cmpneq_epi8_mask() { let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); - let m = _mm512_cmpeq_epu8_mask(a, b); + let b = _mm512_set1_epi8(-1); + let m = _mm512_cmpneq_epi8_mask(a, b); assert_eq!( m, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 @@ -14186,11 +15455,11 @@ mod tests { } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpeq_epu8_mask() { + unsafe fn test_mm512_mask_cmpneq_epi8_mask() { let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(1); + let b = _mm512_set1_epi8(-1); let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpeq_epu8_mask(mask, a, b); + let r = _mm512_mask_cmpneq_epi8_mask(mask, a, b); assert_eq!( r, 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 @@ -14198,95 +15467,95 @@ mod tests { } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn 
test_mm256_cmpeq_epu8_mask() { + unsafe fn test_mm256_cmpneq_epi8_mask() { let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(1); - let m = _mm256_cmpeq_epu8_mask(a, b); + let b = _mm256_set1_epi8(-1); + let m = _mm256_cmpneq_epi8_mask(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epu8_mask() { + unsafe fn test_mm256_mask_cmpneq_epi8_mask() { let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(1); + let b = _mm256_set1_epi8(-1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpeq_epu8_mask(mask, a, b); + let r = _mm256_mask_cmpneq_epi8_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpeq_epu8_mask() { + unsafe fn test_mm_cmpneq_epi8_mask() { let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(1); - let m = _mm_cmpeq_epu8_mask(a, b); + let b = _mm_set1_epi8(-1); + let m = _mm_cmpneq_epi8_mask(a, b); assert_eq!(m, 0b11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epu8_mask() { + unsafe fn test_mm_mask_cmpneq_epi8_mask() { let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(1); + let b = _mm_set1_epi8(-1); let mask = 0b01010101_01010101; - let r = _mm_mask_cmpeq_epu8_mask(mask, a, b); + let r = _mm_mask_cmpneq_epi8_mask(mask, a, b); assert_eq!(r, 0b01010101_01010101); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpeq_epi16_mask() { - let a = _mm512_set1_epi16(-1); - let b = _mm512_set1_epi16(-1); - let m = _mm512_cmpeq_epi16_mask(a, b); + unsafe fn test_mm512_cmp_epu16_mask() { + let a = _mm512_set1_epi16(0); + let b = _mm512_set1_epi16(1); + let m = _mm512_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpeq_epi16_mask() { - let a = _mm512_set1_epi16(-1); - let b = 
_mm512_set1_epi16(-1); + unsafe fn test_mm512_mask_cmp_epu16_mask() { + let a = _mm512_set1_epi16(0); + let b = _mm512_set1_epi16(1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpeq_epi16_mask(mask, a, b); + let r = _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpeq_epi16_mask() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(-1); - let m = _mm256_cmpeq_epi16_mask(a, b); + unsafe fn test_mm256_cmp_epu16_mask() { + let a = _mm256_set1_epi16(0); + let b = _mm256_set1_epi16(1); + let m = _mm256_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epi16_mask() { - let a = _mm256_set1_epi16(-1); - let b = _mm256_set1_epi16(-1); + unsafe fn test_mm256_mask_cmp_epu16_mask() { + let a = _mm256_set1_epi16(0); + let b = _mm256_set1_epi16(1); let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpeq_epi16_mask(mask, a, b); + let r = _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpeq_epi16_mask() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(-1); - let m = _mm_cmpeq_epi16_mask(a, b); + unsafe fn test_mm_cmp_epu16_mask() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); + let m = _mm_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epi16_mask() { - let a = _mm_set1_epi16(-1); - let b = _mm_set1_epi16(-1); + unsafe fn test_mm_mask_cmp_epu16_mask() { + let a = _mm_set1_epi16(0); + let b = _mm_set1_epi16(1); let mask = 0b01010101; - let r = _mm_mask_cmpeq_epi16_mask(mask, a, b); + let r = _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101); } 
#[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpeq_epi8_mask() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(-1); - let m = _mm512_cmpeq_epi8_mask(a, b); + unsafe fn test_mm512_cmp_epu8_mask() { + let a = _mm512_set1_epi8(0); + let b = _mm512_set1_epi8(1); + let m = _mm512_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); assert_eq!( m, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 @@ -14294,11 +15563,11 @@ mod tests { } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpeq_epi8_mask() { - let a = _mm512_set1_epi8(-1); - let b = _mm512_set1_epi8(-1); + unsafe fn test_mm512_mask_cmp_epu8_mask() { + let a = _mm512_set1_epi8(0); + let b = _mm512_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpeq_epi8_mask(mask, a, b); + let r = _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!( r, 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 @@ -14306,95 +15575,95 @@ mod tests { } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpeq_epi8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); - let m = _mm256_cmpeq_epi8_mask(a, b); + unsafe fn test_mm256_cmp_epu8_mask() { + let a = _mm256_set1_epi8(0); + let b = _mm256_set1_epi8(1); + let m = _mm256_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpeq_epi8_mask() { - let a = _mm256_set1_epi8(-1); - let b = _mm256_set1_epi8(-1); + unsafe fn test_mm256_mask_cmp_epu8_mask() { + let a = _mm256_set1_epi8(0); + let b = _mm256_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpeq_epi8_mask(mask, a, b); + let r = _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn 
test_mm_cmpeq_epi8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); - let m = _mm_cmpeq_epi8_mask(a, b); + unsafe fn test_mm_cmp_epu8_mask() { + let a = _mm_set1_epi8(0); + let b = _mm_set1_epi8(1); + let m = _mm_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpeq_epi8_mask() { - let a = _mm_set1_epi8(-1); - let b = _mm_set1_epi8(-1); + unsafe fn test_mm_mask_cmp_epu8_mask() { + let a = _mm_set1_epi8(0); + let b = _mm_set1_epi8(1); let mask = 0b01010101_01010101; - let r = _mm_mask_cmpeq_epi8_mask(mask, a, b); + let r = _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpneq_epu16_mask() { - let a = _mm512_set1_epi16(2); + unsafe fn test_mm512_cmp_epi16_mask() { + let a = _mm512_set1_epi16(0); let b = _mm512_set1_epi16(1); - let m = _mm512_cmpneq_epu16_mask(a, b); + let m = _mm512_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpneq_epu16_mask() { - let a = _mm512_set1_epi16(2); + unsafe fn test_mm512_mask_cmp_epi16_mask() { + let a = _mm512_set1_epi16(0); let b = _mm512_set1_epi16(1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpneq_epu16_mask(mask, a, b); + let r = _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpneq_epu16_mask() { - let a = _mm256_set1_epi16(2); + unsafe fn test_mm256_cmp_epi16_mask() { + let a = _mm256_set1_epi16(0); let b = _mm256_set1_epi16(1); - let m = _mm256_cmpneq_epu16_mask(a, b); + let m = _mm256_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn 
test_mm256_mask_cmpneq_epu16_mask() { - let a = _mm256_set1_epi16(2); + unsafe fn test_mm256_mask_cmp_epi16_mask() { + let a = _mm256_set1_epi16(0); let b = _mm256_set1_epi16(1); let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpneq_epu16_mask(mask, a, b); + let r = _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpneq_epu16_mask() { - let a = _mm_set1_epi16(2); + unsafe fn test_mm_cmp_epi16_mask() { + let a = _mm_set1_epi16(0); let b = _mm_set1_epi16(1); - let m = _mm_cmpneq_epu16_mask(a, b); + let m = _mm_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpneq_epu16_mask() { - let a = _mm_set1_epi16(2); + unsafe fn test_mm_mask_cmp_epi16_mask() { + let a = _mm_set1_epi16(0); let b = _mm_set1_epi16(1); let mask = 0b01010101; - let r = _mm_mask_cmpneq_epu16_mask(mask, a, b); + let r = _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101); } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpneq_epu8_mask() { - let a = _mm512_set1_epi8(2); + unsafe fn test_mm512_cmp_epi8_mask() { + let a = _mm512_set1_epi8(0); let b = _mm512_set1_epi8(1); - let m = _mm512_cmpneq_epu8_mask(a, b); + let m = _mm512_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); assert_eq!( m, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 @@ -14402,11 +15671,11 @@ mod tests { } #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmpneq_epu8_mask() { - let a = _mm512_set1_epi8(2); + unsafe fn test_mm512_mask_cmp_epi8_mask() { + let a = _mm512_set1_epi8(0); let b = _mm512_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpneq_epu8_mask(mask, a, b); + let r = _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!( r, 
0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 @@ -14414,361 +15683,527 @@ mod tests { } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpneq_epu8_mask() { - let a = _mm256_set1_epi8(2); + unsafe fn test_mm256_cmp_epi8_mask() { + let a = _mm256_set1_epi8(0); let b = _mm256_set1_epi8(1); - let m = _mm256_cmpneq_epu8_mask(a, b); + let m = _mm256_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpneq_epu8_mask() { - let a = _mm256_set1_epi8(2); + unsafe fn test_mm256_mask_cmp_epi8_mask() { + let a = _mm256_set1_epi8(0); let b = _mm256_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpneq_epu8_mask(mask, a, b); + let r = _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpneq_epu8_mask() { - let a = _mm_set1_epi8(2); + unsafe fn test_mm_cmp_epi8_mask() { + let a = _mm_set1_epi8(0); let b = _mm_set1_epi8(1); - let m = _mm_cmpneq_epu8_mask(a, b); + let m = _mm_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpneq_epu8_mask() { - let a = _mm_set1_epi8(2); + unsafe fn test_mm_mask_cmp_epi8_mask() { + let a = _mm_set1_epi8(0); let b = _mm_set1_epi8(1); let mask = 0b01010101_01010101; - let r = _mm_mask_cmpneq_epu8_mask(mask, a, b); + let r = _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101); } - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpneq_epi16_mask() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(-1); - let m = _mm512_cmpneq_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); - } - - #[simd_test(enable = "avx512bw")] - unsafe fn 
test_mm512_mask_cmpneq_epi16_mask() { - let a = _mm512_set1_epi16(1); - let b = _mm512_set1_epi16(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpneq_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); - } - #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpneq_epi16_mask() { + unsafe fn test_mm256_reduce_add_epi16() { let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(-1); - let m = _mm256_cmpneq_epi16_mask(a, b); - assert_eq!(m, 0b11111111_11111111); + let e = _mm256_reduce_add_epi16(a); + assert_eq!(16, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpneq_epi16_mask() { + unsafe fn test_mm256_mask_reduce_add_epi16() { let a = _mm256_set1_epi16(1); - let b = _mm256_set1_epi16(-1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmpneq_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); + let e = _mm256_mask_reduce_add_epi16(0b11111111_00000000, a); + assert_eq!(8, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpneq_epi16_mask() { + unsafe fn test_mm_reduce_add_epi16() { let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(-1); - let m = _mm_cmpneq_epi16_mask(a, b); - assert_eq!(m, 0b11111111); + let e = _mm_reduce_add_epi16(a); + assert_eq!(8, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpneq_epi16_mask() { + unsafe fn test_mm_mask_reduce_add_epi16() { let a = _mm_set1_epi16(1); - let b = _mm_set1_epi16(-1); - let mask = 0b01010101; - let r = _mm_mask_cmpneq_epi16_mask(mask, a, b); - assert_eq!(r, 0b01010101); + let e = _mm_mask_reduce_add_epi16(0b11110000, a); + assert_eq!(4, e); } - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmpneq_epi8_mask() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(-1); - let m = _mm512_cmpneq_epi8_mask(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + 
#[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_add_epi8() { + let a = _mm256_set1_epi8(1); + let e = _mm256_reduce_add_epi8(a); + assert_eq!(32, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_add_epi8() { + let a = _mm256_set1_epi8(1); + let e = _mm256_mask_reduce_add_epi8(0b11111111_00000000_11111111_00000000, a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_add_epi8() { + let a = _mm_set1_epi8(1); + let e = _mm_reduce_add_epi8(a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_add_epi8() { + let a = _mm_set1_epi8(1); + let e = _mm_mask_reduce_add_epi8(0b11111111_00000000, a); + assert_eq!(8, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_and_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_reduce_and_epi16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_and_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_mask_reduce_and_epi16(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_and_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_reduce_and_epi16(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_and_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_mask_reduce_and_epi16(0b11110000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_and_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, ); + let e = _mm256_reduce_and_epi8(a); + assert_eq!(0, e); } - #[simd_test(enable = 
"avx512bw")] - unsafe fn test_mm512_mask_cmpneq_epi8_mask() { - let a = _mm512_set1_epi8(1); - let b = _mm512_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmpneq_epi8_mask(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_and_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, ); + let e = _mm256_mask_reduce_and_epi8(0b11111111_00000000_11111111_00000000, a); + assert_eq!(1, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmpneq_epi8_mask() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(-1); - let m = _mm256_cmpneq_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); + unsafe fn test_mm_reduce_and_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm_reduce_and_epi8(a); + assert_eq!(0, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmpneq_epi8_mask() { - let a = _mm256_set1_epi8(1); - let b = _mm256_set1_epi8(-1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmpneq_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); + unsafe fn test_mm_mask_reduce_and_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm_mask_reduce_and_epi8(0b11111111_00000000, a); + assert_eq!(1, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmpneq_epi8_mask() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(-1); - let m = _mm_cmpneq_epi8_mask(a, b); - assert_eq!(m, 0b11111111_11111111); + unsafe fn test_mm256_reduce_mul_epi16() { + let a = _mm256_set_epi16(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1); + let e = _mm256_reduce_mul_epi16(a); + 
assert_eq!(256, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmpneq_epi8_mask() { - let a = _mm_set1_epi8(1); - let b = _mm_set1_epi8(-1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmpneq_epi8_mask(mask, a, b); - assert_eq!(r, 0b01010101_01010101); + unsafe fn test_mm256_mask_reduce_mul_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_mask_reduce_mul_epi16(0b11111111_00000000, a); + assert_eq!(1, e); } - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmp_epu16_mask() { - let a = _mm512_set1_epi16(0); - let b = _mm512_set1_epi16(1); - let m = _mm512_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_mul_epi16() { + let a = _mm_set_epi16(2, 2, 2, 2, 1, 1, 1, 1); + let e = _mm_reduce_mul_epi16(a); + assert_eq!(16, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_mul_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_mask_reduce_mul_epi16(0b11110000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_mul_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, + ); + let e = _mm256_reduce_mul_epi8(a); + assert_eq!(64, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_mul_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, + ); + let e = _mm256_mask_reduce_mul_epi8(0b11111111_00000000_11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_mul_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2); + let e = _mm_reduce_mul_epi8(a); + assert_eq!(8, e); + } 
+ + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_mul_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2); + let e = _mm_mask_reduce_mul_epi8(0b11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_max_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i16 = _mm256_reduce_max_epi16(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_max_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i16 = _mm256_mask_reduce_max_epi16(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_max_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: i16 = _mm_reduce_max_epi16(a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_max_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: i16 = _mm_mask_reduce_max_epi16(0b11110000, a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_max_epi8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: i8 = _mm256_reduce_max_epi8(a); + assert_eq!(31, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_max_epi8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: i8 = _mm256_mask_reduce_max_epi8(0b1111111111111111_0000000000000000, a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_max_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15); + let e: i8 = _mm_reduce_max_epi8(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_max_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i8 = _mm_mask_reduce_max_epi8(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_max_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u16 = _mm256_reduce_max_epu16(a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_max_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u16 = _mm256_mask_reduce_max_epu16(0b11111111_00000000, a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_max_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16 = _mm_reduce_max_epu16(a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_max_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16 = _mm_mask_reduce_max_epu16(0b11110000, a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_max_epu8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: u8 = _mm256_reduce_max_epu8(a); + assert_eq!(31, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_max_epu8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: u8 = _mm256_mask_reduce_max_epu8(0b1111111111111111_0000000000000000, a); + assert_eq!(15, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn 
test_mm_reduce_max_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u8 = _mm_reduce_max_epu8(a); + assert_eq!(15, e); } - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmp_epu16_mask() { - let a = _mm512_set1_epi16(0); - let b = _mm512_set1_epi16(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_max_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u8 = _mm_mask_reduce_max_epu8(0b11111111_00000000, a); + assert_eq!(7, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmp_epu16_mask() { - let a = _mm256_set1_epi16(0); - let b = _mm256_set1_epi16(1); - let m = _mm256_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111); + unsafe fn test_mm256_reduce_min_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i16 = _mm256_reduce_min_epi16(a); + assert_eq!(0, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmp_epu16_mask() { - let a = _mm256_set1_epi16(0); - let b = _mm256_set1_epi16(1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101); + unsafe fn test_mm256_mask_reduce_min_epi16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i16 = _mm256_mask_reduce_min_epi16(0b11111111_00000000, a); + assert_eq!(0, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmp_epu16_mask() { - let a = _mm_set1_epi16(0); - let b = _mm_set1_epi16(1); - let m = _mm_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111); + unsafe fn test_mm_reduce_min_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 
7); + let e: i16 = _mm_reduce_min_epi16(a); + assert_eq!(0, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmp_epu16_mask() { - let a = _mm_set1_epi16(0); - let b = _mm_set1_epi16(1); - let mask = 0b01010101; - let r = _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101); + unsafe fn test_mm_mask_reduce_min_epi16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: i16 = _mm_mask_reduce_min_epi16(0b11110000, a); + assert_eq!(0, e); } - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmp_epu8_mask() { - let a = _mm512_set1_epi8(0); - let b = _mm512_set1_epi8(1); - let m = _mm512_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_min_epi8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, ); + let e: i8 = _mm256_reduce_min_epi8(a); + assert_eq!(0, e); } - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmp_epu8_mask() { - let a = _mm512_set1_epi8(0); - let b = _mm512_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_min_epi8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, ); + let e: i8 = _mm256_mask_reduce_min_epi8(0b1111111111111111_0000000000000000, a); + assert_eq!(0, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmp_epu8_mask() { - let a = _mm256_set1_epi8(0); - let b = _mm256_set1_epi8(1); - let m = 
_mm256_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); + unsafe fn test_mm_reduce_min_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i8 = _mm_reduce_min_epi8(a); + assert_eq!(0, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmp_epu8_mask() { - let a = _mm256_set1_epi8(0); - let b = _mm256_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); + unsafe fn test_mm_mask_reduce_min_epi8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: i8 = _mm_mask_reduce_min_epi8(0b11111111_00000000, a); + assert_eq!(0, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmp_epu8_mask() { - let a = _mm_set1_epi8(0); - let b = _mm_set1_epi8(1); - let m = _mm_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111); + unsafe fn test_mm256_reduce_min_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u16 = _mm256_reduce_min_epu16(a); + assert_eq!(0, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmp_epu8_mask() { - let a = _mm_set1_epi8(0); - let b = _mm_set1_epi8(1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101); + unsafe fn test_mm256_mask_reduce_min_epu16() { + let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u16 = _mm256_mask_reduce_min_epu16(0b11111111_00000000, a); + assert_eq!(0, e); } - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmp_epi16_mask() { - let a = _mm512_set1_epi16(0); - let b = _mm512_set1_epi16(1); - let m = _mm512_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); + 
#[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_min_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16 = _mm_reduce_min_epu16(a); + assert_eq!(0, e); } - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmp_epi16_mask() { - let a = _mm512_set1_epi16(0); - let b = _mm512_set1_epi16(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_min_epu16() { + let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16 = _mm_mask_reduce_min_epu16(0b11110000, a); + assert_eq!(0, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmp_epi16_mask() { - let a = _mm256_set1_epi16(0); - let b = _mm256_set1_epi16(1); - let m = _mm256_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111); + unsafe fn test_mm256_reduce_min_epu8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: u8 = _mm256_reduce_min_epu8(a); + assert_eq!(0, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmp_epi16_mask() { - let a = _mm256_set1_epi16(0); - let b = _mm256_set1_epi16(1); - let mask = 0b01010101_01010101; - let r = _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101); + unsafe fn test_mm256_mask_reduce_min_epu8() { + let a = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + ); + let e: u8 = _mm256_mask_reduce_min_epu8(0b1111111111111111_0000000000000000, a); + assert_eq!(0, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmp_epi16_mask() { - let a = _mm_set1_epi16(0); - let b = _mm_set1_epi16(1); - let m = 
_mm_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111); + unsafe fn test_mm_reduce_min_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u8 = _mm_reduce_min_epu8(a); + assert_eq!(0, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmp_epi16_mask() { - let a = _mm_set1_epi16(0); - let b = _mm_set1_epi16(1); - let mask = 0b01010101; - let r = _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101); + unsafe fn test_mm_mask_reduce_min_epu8() { + let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u8 = _mm_mask_reduce_min_epu8(0b11111111_00000000, a); + assert_eq!(0, e); } - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_cmp_epi8_mask() { - let a = _mm512_set1_epi8(0); - let b = _mm512_set1_epi8(1); - let m = _mm512_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!( - m, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 - ); + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_reduce_or_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_reduce_or_epi16(a); + assert_eq!(3, e); } - #[simd_test(enable = "avx512bw")] - unsafe fn test_mm512_mask_cmp_epi8_mask() { - let a = _mm512_set1_epi8(0); - let b = _mm512_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!( - r, - 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 - ); + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm256_mask_reduce_or_epi16() { + let a = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm256_mask_reduce_or_epi16(0b11111111_00000000, a); + assert_eq!(1, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_cmp_epi8_mask() { - let a = 
_mm256_set1_epi8(0); - let b = _mm256_set1_epi8(1); - let m = _mm256_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111_11111111_11111111); + unsafe fn test_mm_reduce_or_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_reduce_or_epi16(a); + assert_eq!(3, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm256_mask_cmp_epi8_mask() { - let a = _mm256_set1_epi8(0); - let b = _mm256_set1_epi8(1); - let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101_01010101_01010101); + unsafe fn test_mm_mask_reduce_or_epi16() { + let a = _mm_set_epi16(1, 1, 1, 1, 2, 2, 2, 2); + let e = _mm_mask_reduce_or_epi16(0b11110000, a); + assert_eq!(1, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_cmp_epi8_mask() { - let a = _mm_set1_epi8(0); - let b = _mm_set1_epi8(1); - let m = _mm_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); - assert_eq!(m, 0b11111111_11111111); + unsafe fn test_mm256_reduce_or_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + let e = _mm256_reduce_or_epi8(a); + assert_eq!(3, e); } #[simd_test(enable = "avx512bw,avx512vl")] - unsafe fn test_mm_mask_cmp_epi8_mask() { - let a = _mm_set1_epi8(0); - let b = _mm_set1_epi8(1); - let mask = 0b01010101_01010101; - let r = _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); - assert_eq!(r, 0b01010101_01010101); + unsafe fn test_mm256_mask_reduce_or_epi8() { + let a = _mm256_set_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, + ); + let e = _mm256_mask_reduce_or_epi8(0b11111111_00000000_11111111_00000000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_reduce_or_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = 
_mm_reduce_or_epi8(a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512bw,avx512vl")] + unsafe fn test_mm_mask_reduce_or_epi8() { + let a = _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2); + let e = _mm_mask_reduce_or_epi8(0b11111111_00000000, a); + assert_eq!(1, e); } #[simd_test(enable = "avx512bw")] @@ -18679,6 +20114,22 @@ mod tests { assert_eq_m128i(r, e); } + #[simd_test(enable = "avx512bw")] + unsafe fn test_cvtmask32_u32() { + let a: __mmask32 = 0b11001100_00110011_01100110_10011001; + let r = _cvtmask32_u32(a); + let e: u32 = 0b11001100_00110011_01100110_10011001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_cvtu32_mask32() { + let a: u32 = 0b11001100_00110011_01100110_10011001; + let r = _cvtu32_mask32(a); + let e: __mmask32 = 0b11001100_00110011_01100110_10011001; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512bw")] unsafe fn test_kadd_mask32() { let a: __mmask32 = 11; @@ -18820,6 +20271,160 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortest_mask32_u8() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let b: __mmask32 = 0b1011011010110110_1011011010110110; + let mut all_ones: u8 = 0; + let r = _kortest_mask32_u8(a, b, &mut all_ones); + assert_eq!(r, 0); + assert_eq!(all_ones, 1); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortest_mask64_u8() { + let a: __mmask64 = 0b0110100101101001_0110100101101001; + let b: __mmask64 = 0b1011011010110110_1011011010110110; + let mut all_ones: u8 = 0; + let r = _kortest_mask64_u8(a, b, &mut all_ones); + assert_eq!(r, 0); + assert_eq!(all_ones, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortestc_mask32_u8() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let b: __mmask32 = 0b1011011010110110_1011011010110110; + let r = _kortestc_mask32_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortestc_mask64_u8() { + let a: 
__mmask64 = 0b0110100101101001_0110100101101001; + let b: __mmask64 = 0b1011011010110110_1011011010110110; + let r = _kortestc_mask64_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortestz_mask32_u8() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let b: __mmask32 = 0b1011011010110110_1011011010110110; + let r = _kortestz_mask32_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kortestz_mask64_u8() { + let a: __mmask64 = 0b0110100101101001_0110100101101001; + let b: __mmask64 = 0b1011011010110110_1011011010110110; + let r = _kortestz_mask64_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kshiftli_mask32() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let r = _kshiftli_mask32::<3>(a); + let e: __mmask32 = 0b0100101101001011_0100101101001000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kshiftli_mask64() { + let a: __mmask64 = 0b0110100101101001_0110100101101001; + let r = _kshiftli_mask64::<3>(a); + let e: __mmask64 = 0b0110100101101001011_0100101101001000; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kshiftri_mask32() { + let a: __mmask32 = 0b0110100101101001_0110100101101001; + let r = _kshiftri_mask32::<3>(a); + let e: __mmask32 = 0b0000110100101101_0010110100101101; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_kshiftri_mask64() { + let a: __mmask64 = 0b0110100101101001011_0100101101001000; + let r = _kshiftri_mask64::<3>(a); + let e: __mmask64 = 0b0110100101101001_0110100101101001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktest_mask32_u8() { + let a: __mmask32 = 0b0110100100111100_0110100100111100; + let b: __mmask32 = 0b1001011011000011_1001011011000011; + let mut and_not: u8 = 0; + let r = _ktest_mask32_u8(a, b, &mut and_not); + assert_eq!(r, 1); + assert_eq!(and_not, 0); 
+ } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktestc_mask32_u8() { + let a: __mmask32 = 0b0110100100111100_0110100100111100; + let b: __mmask32 = 0b1001011011000011_1001011011000011; + let r = _ktestc_mask32_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktestz_mask32_u8() { + let a: __mmask32 = 0b0110100100111100_0110100100111100; + let b: __mmask32 = 0b1001011011000011_1001011011000011; + let r = _ktestz_mask32_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktest_mask64_u8() { + let a: __mmask64 = 0b0110100100111100_0110100100111100; + let b: __mmask64 = 0b1001011011000011_1001011011000011; + let mut and_not: u8 = 0; + let r = _ktest_mask64_u8(a, b, &mut and_not); + assert_eq!(r, 1); + assert_eq!(and_not, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktestc_mask64_u8() { + let a: __mmask64 = 0b0110100100111100_0110100100111100; + let b: __mmask64 = 0b1001011011000011_1001011011000011; + let r = _ktestc_mask64_u8(a, b); + assert_eq!(r, 0); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_ktestz_mask64_u8() { + let a: __mmask64 = 0b0110100100111100_0110100100111100; + let b: __mmask64 = 0b1001011011000011_1001011011000011; + let r = _ktestz_mask64_u8(a, b); + assert_eq!(r, 1); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_kunpackw() { + let a: u32 = 0x00110011; + let b: u32 = 0x00001011; + let r = _mm512_kunpackw(a, b); + let e: u32 = 0x00111011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_mm512_kunpackd() { + let a: u64 = 0x11001100_00110011; + let b: u64 = 0x00101110_00001011; + let r = _mm512_kunpackd(a, b); + let e: u64 = 0x00110011_00001011; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_cvtepi16_epi8() { let a = _mm512_set1_epi16(2); diff --git a/crates/core_arch/src/x86_64/avx512bw.rs b/crates/core_arch/src/x86_64/avx512bw.rs new file mode 100644 
index 0000000000..798fc4adf6 --- /dev/null +++ b/crates/core_arch/src/x86_64/avx512bw.rs @@ -0,0 +1,45 @@ +use crate::core_arch::x86::*; + +/// Convert 64-bit mask a into an integer value, and store the result in dst. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtmask64_u64) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _cvtmask64_u64(a: __mmask64) -> u64 { + a +} + +/// Convert integer value a into a 64-bit mask, and store the result in k. +/// +/// [Intel's Documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cvtu64_mask64) +#[inline] +#[target_feature(enable = "avx512bw")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _cvtu64_mask64(a: u64) -> __mmask64 { + a +} + +#[cfg(test)] +mod tests { + + use stdarch_test::simd_test; + + use crate::core_arch::{x86::*, x86_64::*}; + + #[simd_test(enable = "avx512bw")] + unsafe fn test_cvtmask64_u64() { + let a: __mmask64 = 0b11001100_00110011_01100110_10011001; + let r = _cvtmask64_u64(a); + let e: u64 = 0b11001100_00110011_01100110_10011001; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512bw")] + unsafe fn test_cvtu64_mask64() { + let a: u64 = 0b11001100_00110011_01100110_10011001; + let r = _cvtu64_mask64(a); + let e: __mmask64 = 0b11001100_00110011_01100110_10011001; + assert_eq!(r, e); + } +} diff --git a/crates/core_arch/src/x86_64/mod.rs b/crates/core_arch/src/x86_64/mod.rs index ff46373d90..fb7bce6871 100644 --- a/crates/core_arch/src/x86_64/mod.rs +++ b/crates/core_arch/src/x86_64/mod.rs @@ -46,6 +46,10 @@ mod avx512f; #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub use self::avx512f::*; +mod avx512bw; +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub use self::avx512bw::*; + mod bswap; #[stable(feature = "simd_x86", since = "1.27.0")] pub use 
self::bswap::*;