diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index 4593ab19d..3558462d6 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -6,6 +6,64 @@ #include "../types/xsimd_neon_register.hpp" #include "../types/xsimd_utils.hpp" +// Wrap each intrinsic in an inline function inside namespace wrap so that it can be passed as a function pointer +// - OP: intrinsic name prefix without the type suffix, e.g., vorrq (the macros append _u8, _s8, ..., _f32) +// - RT: type trait supplying the return type of the binary wrappers (the uses below pass detail::identity_return_type or detail::comp_return_type) +#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap { \ + inline RT OP##_u8 (uint8x16_t a, uint8x16_t b) { return ::OP##_u8 (a, b); } \ + inline RT OP##_s8 (int8x16_t a, int8x16_t b) { return ::OP##_s8 (a, b); } \ + inline RT OP##_u16(uint16x8_t a, uint16x8_t b) { return ::OP##_u16(a, b); } \ + inline RT OP##_s16(int16x8_t a, int16x8_t b) { return ::OP##_s16(a, b); } \ + inline RT OP##_u32(uint32x4_t a, uint32x4_t b) { return ::OP##_u32(a, b); } \ + inline RT OP##_s32(int32x4_t a, int32x4_t b) { return ::OP##_s32(a, b); } \ + } + +#define WRAP_BINARY_INT(OP, RT) \ + WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap { \ + inline RT OP##_u64(uint64x2_t a, uint64x2_t b) { return ::OP##_u64(a, b); } \ + inline RT OP##_s64(int64x2_t a, int64x2_t b) { return ::OP##_s64(a, b); } \ + } + +#define WRAP_BINARY_FLOAT(OP, RT) \ + namespace wrap { \ + inline RT OP##_f32(float32x4_t a, float32x4_t b) { return ::OP##_f32(a, b); } \ + } + +#define WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap { \ + inline uint8x16_t OP##_u8 (uint8x16_t a) { return ::OP##_u8 (a); } \ + inline int8x16_t OP##_s8 (int8x16_t a) { return ::OP##_s8 (a); } \ + inline uint16x8_t OP##_u16(uint16x8_t a) { return ::OP##_u16(a); } \ + inline int16x8_t OP##_s16(int16x8_t a) { return ::OP##_s16(a); } \ + inline uint32x4_t OP##_u32(uint32x4_t a) { return ::OP##_u32(a); } \ + inline int32x4_t OP##_s32(int32x4_t a) { return ::OP##_s32(a); } \ + } + +#define WRAP_UNARY_INT(OP) \ + WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap { \ + inline 
uint64x2_t OP##_u64(uint64x2_t a) { return ::OP##_u64(a); } \ + inline int64x2_t OP##_s64(int64x2_t a) { return ::OP##_s64(a); } \ + } + +#define WRAP_UNARY_FLOAT(OP) \ + namespace wrap { \ + inline float32x4_t OP##_f32(float32x4_t a) { return ::OP##_f32(a); } \ + } + +// Identity casts between identical register types: each returns its argument unchanged, so that WRAP_CAST further down can wrap ::vreinterpretq_u8_u8, ::vreinterpretq_s8_s8, ... like any other source/destination pair. NOTE(review): these are declared in the global namespace under intrinsic-style names; confirm no supported toolchain header already provides them +inline uint8x16_t vreinterpretq_u8_u8 (uint8x16_t arg) { return arg; } +inline int8x16_t vreinterpretq_s8_s8 (int8x16_t arg) { return arg; } +inline uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) { return arg; } +inline int16x8_t vreinterpretq_s16_s16(int16x8_t arg) { return arg; } +inline uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) { return arg; } +inline int32x4_t vreinterpretq_s32_s32(int32x4_t arg) { return arg; } +inline uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) { return arg; } +inline int64x2_t vreinterpretq_s64_s64(int64x2_t arg) { return arg; } +inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) { return arg; } + namespace xsimd { namespace kernel @@ -551,15 +609,18 @@ namespace xsimd * add * *******/ + WRAP_BINARY_INT(vaddq, detail::identity_return_type) + WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type) + template = 0> batch add(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; constexpr detail::neon_dispatcher::binary dispatcher = { - std::make_tuple(vaddq_u8, vaddq_s8, vaddq_u16, vaddq_s16, - vaddq_u32, vaddq_s32, vaddq_u64, vaddq_s64, - vaddq_f32) + std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16, + wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64, + wrap::vaddq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } @@ -568,15 +629,17 @@ namespace xsimd * sadd * ********/ + WRAP_BINARY_INT(vqaddq, detail::identity_return_type) + template = 0> batch sadd(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; constexpr 
detail::neon_dispatcher::binary dispatcher = { - std::make_tuple(vqaddq_u8, vqaddq_s8, vqaddq_u16, vqaddq_s16, - vqaddq_u32, vqaddq_s32, vqaddq_u64, vqaddq_s64, - vaddq_f32) + std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16, + wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64, + wrap::vaddq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } @@ -585,15 +648,18 @@ namespace xsimd * sub * *******/ + WRAP_BINARY_INT(vsubq, detail::identity_return_type) + WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type) + template = 0> batch sub(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; constexpr detail::neon_dispatcher::binary dispatcher = { - std::make_tuple(vsubq_u8, vsubq_s8, vsubq_u16, vsubq_s16, - vsubq_u32, vsubq_s32, vsubq_u64, vsubq_s64, - vsubq_f32) + std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16, + wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64, + wrap::vsubq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } @@ -602,15 +668,17 @@ namespace xsimd * ssub * ********/ + WRAP_BINARY_INT(vqsubq, detail::identity_return_type) + template = 0> batch ssub(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; constexpr detail::neon_dispatcher::binary dispatcher = { - std::make_tuple(vqsubq_u8, vqsubq_s8, vqsubq_u16, vqsubq_s16, - vqsubq_u32, vqsubq_s32, vqsubq_u64, vqsubq_s64, - vsubq_f32) + std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16, + wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64, + wrap::vsubq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } @@ -620,13 +688,17 @@ namespace xsimd * mul * *******/ + WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type) + WRAP_BINARY_FLOAT(vmulq, 
detail::identity_return_type) + template = 0> batch mul(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; constexpr detail::excluding_int64_dispatcher::binary dispatcher = { - std::make_tuple(vmulq_u8, vmulq_s8, vmulq_u16, vmulq_s16, vmulq_u32, vmulq_s32, vmulq_f32) + std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16, + wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } @@ -670,13 +742,17 @@ namespace xsimd * eq * ******/ + WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type) + WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type) + template = 0> batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; constexpr detail::excluding_int64_comp_dispatcher::binary dispatcher = { - std::make_tuple(vceqq_u8, vceqq_s8, vceqq_u16, vceqq_s16, vceqq_u32, vceqq_s32, vceqq_f32) + std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16, + wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } @@ -688,7 +764,7 @@ namespace xsimd using dispatcher_type = detail::neon_comp_dispatcher_impl::binary; constexpr dispatcher_type dispatcher = { - std::make_tuple(vceqq_u8, vceqq_u16, vceqq_u32) + std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } @@ -709,13 +785,17 @@ namespace xsimd * lt * ******/ + WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type) + WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type) + template = 0> batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; constexpr detail::excluding_int64_comp_dispatcher::binary dispatcher = { - std::make_tuple(vcltq_u8, vcltq_s8, vcltq_u16, vcltq_s16, vcltq_u32, 
vcltq_s32, vcltq_f32) + std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16, + wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } @@ -730,13 +810,17 @@ namespace xsimd * le * ******/ + WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type) + WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type) + template = 0> batch_bool le(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; constexpr detail::excluding_int64_comp_dispatcher::binary dispatcher = { - std::make_tuple(vcleq_u8, vcleq_s8, vcleq_u16, vcleq_s16, vcleq_u32, vcleq_s32, vcleq_f32) + std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16, + wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } @@ -751,13 +835,17 @@ namespace xsimd * gt * ******/ + WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type) + WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type) + template = 0> batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; constexpr detail::excluding_int64_comp_dispatcher::binary dispatcher = { - std::make_tuple(vcgtq_u8, vcgtq_s8, vcgtq_u16, vcgtq_s16, vcgtq_u32, vcgtq_s32, vcgtq_f32) + std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16, + wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } @@ -772,13 +860,17 @@ namespace xsimd * ge * ******/ + WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type) + WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type) + template = 0> batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; constexpr detail::excluding_int64_comp_dispatcher::binary dispatcher = { - std::make_tuple(vcgeq_u8, 
vcgeq_s8, vcgeq_u16, vcgeq_s16, vcgeq_u32, vcgeq_s32, vcgeq_f32) + std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16, + wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } @@ -793,6 +885,8 @@ namespace xsimd * bitwise_and * ***************/ + WRAP_BINARY_INT(vandq, detail::identity_return_type) + namespace detail { inline float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) @@ -806,8 +900,8 @@ namespace xsimd { constexpr neon_dispatcher::binary dispatcher = { - std::make_tuple(vandq_u8, vandq_s8, vandq_u16, vandq_s16, - vandq_u32, vandq_s32, vandq_u64, vandq_s64, + std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16, + wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64, bitwise_and_f32) }; return dispatcher.apply(lhs, rhs); @@ -832,6 +926,8 @@ namespace xsimd * bitwise_or * **************/ + WRAP_BINARY_INT(vorrq, detail::identity_return_type) + namespace detail { inline float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) @@ -845,8 +941,8 @@ namespace xsimd { constexpr neon_dispatcher::binary dispatcher = { - std::make_tuple(vorrq_u8, vorrq_s8, vorrq_u16, vorrq_s16, - vorrq_u32, vorrq_s32, vorrq_u64, vorrq_s64, + std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16, + wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64, bitwise_or_f32) }; return dispatcher.apply(lhs, rhs); @@ -871,6 +967,8 @@ namespace xsimd * bitwise_xor * ***************/ + WRAP_BINARY_INT(veorq, detail::identity_return_type) + namespace detail { inline float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) @@ -884,8 +982,8 @@ namespace xsimd { constexpr neon_dispatcher::binary dispatcher = { - std::make_tuple(veorq_u8, veorq_s8, veorq_u16, veorq_s16, - veorq_u32, veorq_s32, veorq_u64, veorq_s64, + std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16, + 
wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64, bitwise_xor_f32) }; return dispatcher.apply(lhs, rhs); @@ -920,6 +1018,8 @@ namespace xsimd * bitwise_not * ***************/ + WRAP_UNARY_INT_EXCLUDING_64(vmvnq) + namespace detail { inline int64x2_t bitwise_not_s64(int64x2_t arg) @@ -942,8 +1042,8 @@ namespace xsimd { constexpr neon_dispatcher::unary dispatcher = { - std::make_tuple(vmvnq_u8, vmvnq_s8, vmvnq_u16, vmvnq_s16, - vmvnq_u32, vmvnq_s32, + std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16, + wrap::vmvnq_u32, wrap::vmvnq_s32, bitwise_not_u64, bitwise_not_s64, bitwise_not_f32) }; @@ -969,6 +1069,8 @@ namespace xsimd * bitwise_andnot * ******************/ + WRAP_BINARY_INT(vbicq, detail::identity_return_type) + namespace detail { inline float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) @@ -981,8 +1083,8 @@ namespace xsimd { constexpr detail::neon_dispatcher::binary dispatcher = { - std::make_tuple(vbicq_u8, vbicq_s8, vbicq_u16, vbicq_s16, - vbicq_u32, vbicq_s32, vbicq_u64, vbicq_s64, + std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16, + wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64, bitwise_andnot_f32) }; return dispatcher.apply(lhs, rhs); @@ -1007,14 +1109,17 @@ namespace xsimd * min * *******/ + WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type) + WRAP_BINARY_FLOAT(vminq, detail::identity_return_type) + template = 0> batch min(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; constexpr detail::excluding_int64_dispatcher::binary dispatcher = { - std::make_tuple(vminq_u8, vminq_s8, vminq_u16, vminq_s16, - vminq_u32, vminq_s32, vminq_f32) + std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16, + wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } @@ -1029,14 +1134,17 @@ namespace 
xsimd * max * *******/ + WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type) + WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type) + template = 0> batch max(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; constexpr detail::excluding_int64_dispatcher::binary dispatcher = { - std::make_tuple(vmaxq_u8, vmaxq_s8, vmaxq_u16, vmaxq_s16, - vmaxq_u32, vmaxq_s32, vmaxq_f32) + std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16, + wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } @@ -1051,6 +1159,13 @@ namespace xsimd * abs * *******/ + namespace wrap { + inline int8x16_t vabsq_s8 (int8x16_t a) { return ::vabsq_s8 (a); } + inline int16x8_t vabsq_s16(int16x8_t a) { return ::vabsq_s16(a); } + inline int32x4_t vabsq_s32(int32x4_t a) { return ::vabsq_s32(a); } + } + WRAP_UNARY_FLOAT(vabsq) + namespace detail { inline uint8x16_t abs_u8(uint8x16_t arg) @@ -1075,8 +1190,8 @@ namespace xsimd using register_type = typename batch::register_type; constexpr detail::excluding_int64_dispatcher::unary dispatcher = { - std::make_tuple(detail::abs_u8, vabsq_s8, detail::abs_u16, vabsq_s16, - detail::abs_u32, vabsq_s32, vabsq_f32) + std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16, + detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32) }; return dispatcher.apply(register_type(arg)); } @@ -1219,6 +1334,18 @@ namespace xsimd * select * **********/ + namespace wrap { + inline uint8x16_t vbslq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) { return ::vbslq_u8 (a, b, c); } + inline int8x16_t vbslq_s8 (uint8x16_t a, int8x16_t b, int8x16_t c) { return ::vbslq_s8 (a, b, c); } + inline uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return ::vbslq_u16(a, b, c); } + inline int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) { return ::vbslq_s16(a, b, c); } + inline uint32x4_t 
vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return ::vbslq_u32(a, b, c); } + inline int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) { return ::vbslq_s32(a, b, c); } + inline uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) { return ::vbslq_u64(a, b, c); } + inline int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) { return ::vbslq_s64(a, b, c); } + inline float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) { return ::vbslq_f32(a, b, c); } + } + namespace detail { template @@ -1250,9 +1377,9 @@ namespace xsimd using register_type = typename batch::register_type; constexpr detail::neon_select_dispatcher dispatcher = { - std::make_tuple(vbslq_u8, vbslq_s8, vbslq_u16, vbslq_s16, - vbslq_u32, vbslq_s32, vbslq_u64, vbslq_s64, - vbslq_f32) + std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8, wrap::vbslq_u16, wrap::vbslq_s16, + wrap::vbslq_u32, wrap::vbslq_s32, wrap::vbslq_u64, wrap::vbslq_s64, + wrap::vbslq_f32) }; return dispatcher.apply(bool_register_type(cond), register_type(a), register_type(b)); } @@ -1987,6 +2114,31 @@ namespace xsimd * bitwise_cast * ****************/ + #define WRAP_CAST(SUFFIX, TYPE) \ + namespace wrap { \ + inline TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) { return ::vreinterpretq_##SUFFIX##_u8 (a); } \ + inline TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) { return ::vreinterpretq_##SUFFIX##_s8 (a); } \ + inline TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) { return ::vreinterpretq_##SUFFIX##_u16(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) { return ::vreinterpretq_##SUFFIX##_s16(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) { return ::vreinterpretq_##SUFFIX##_u32(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) { return ::vreinterpretq_##SUFFIX##_s32(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) { return ::vreinterpretq_##SUFFIX##_u64(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) { return 
::vreinterpretq_##SUFFIX##_s64(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) { return ::vreinterpretq_##SUFFIX##_f32(a); } \ + } + + WRAP_CAST(u8, uint8x16_t) + WRAP_CAST(s8, int8x16_t) + WRAP_CAST(u16, uint16x8_t) + WRAP_CAST(s16, int16x8_t) + WRAP_CAST(u32, uint32x4_t) + WRAP_CAST(s32, int32x4_t) + WRAP_CAST(u64, uint64x2_t) + WRAP_CAST(s64, int64x2_t) + WRAP_CAST(f32, float32x4_t) + + #undef WRAP_CAST + namespace detail { template @@ -2039,16 +2191,6 @@ namespace xsimd uint32x4_t, int32x4_t, uint64x2_t, int64x2_t, float32x4_t>; - - inline uint8x16_t identity_u8(uint8x16_t arg) { return arg; } - inline int8x16_t identity_s8(int8x16_t arg) { return arg; } - inline uint16x8_t identity_u16(uint16x8_t arg) { return arg; } - inline int16x8_t identity_s16(int16x8_t arg) { return arg; } - inline uint32x4_t identity_u32(uint32x4_t arg) { return arg; } - inline int32x4_t identity_s32(int32x4_t arg) { return arg; } - inline uint64x2_t identity_u64(uint64x2_t arg) { return arg; } - inline int64x2_t identity_s64(int64x2_t arg) { return arg; } - inline float32x4_t identity_f32(float32x4_t arg) { return arg; } } template @@ -2056,33 +2198,33 @@ namespace xsimd { constexpr detail::neon_bitwise_caster caster = { std::make_tuple( - detail::make_bitwise_caster_impl(detail::identity_u8, vreinterpretq_u8_s8, vreinterpretq_u8_u16, vreinterpretq_u8_s16, - vreinterpretq_u8_u32, vreinterpretq_u8_s32, vreinterpretq_u8_u64, vreinterpretq_u8_s64, - vreinterpretq_u8_f32), - detail::make_bitwise_caster_impl(vreinterpretq_s8_u8, detail::identity_s8, vreinterpretq_s8_u16, vreinterpretq_s8_s16, - vreinterpretq_s8_u32, vreinterpretq_s8_s32, vreinterpretq_s8_u64, vreinterpretq_s8_s64, - vreinterpretq_s8_f32), - detail::make_bitwise_caster_impl(vreinterpretq_u16_u8, vreinterpretq_u16_s8, detail::identity_u16, vreinterpretq_u16_s16, - vreinterpretq_u16_u32, vreinterpretq_u16_s32, vreinterpretq_u16_u64, vreinterpretq_u16_s64, - vreinterpretq_u16_f32), - 
detail::make_bitwise_caster_impl(vreinterpretq_s16_u8, vreinterpretq_s16_s8, vreinterpretq_s16_u16, detail::identity_s16, - vreinterpretq_s16_u32, vreinterpretq_s16_s32, vreinterpretq_s16_u64, vreinterpretq_s16_s64, - vreinterpretq_s16_f32), - detail::make_bitwise_caster_impl(vreinterpretq_u32_u8, vreinterpretq_u32_s8, vreinterpretq_u32_u16, vreinterpretq_u32_s16, - detail::identity_u32, vreinterpretq_u32_s32, vreinterpretq_u32_u64, vreinterpretq_u32_s64, - vreinterpretq_u32_f32), - detail::make_bitwise_caster_impl(vreinterpretq_s32_u8, vreinterpretq_s32_s8, vreinterpretq_s32_u16, vreinterpretq_s32_s16, - vreinterpretq_s32_u32, detail::identity_s32, vreinterpretq_s32_u64, vreinterpretq_s32_s64, - vreinterpretq_s32_f32), - detail::make_bitwise_caster_impl(vreinterpretq_u64_u8, vreinterpretq_u64_s8, vreinterpretq_u64_u16, vreinterpretq_u64_s16, - vreinterpretq_u64_u32, vreinterpretq_u64_s32, detail::identity_u64, vreinterpretq_u64_s64, - vreinterpretq_u64_f32), - detail::make_bitwise_caster_impl(vreinterpretq_s64_u8, vreinterpretq_s64_s8, vreinterpretq_s64_u16, vreinterpretq_s64_s16, - vreinterpretq_s64_u32, vreinterpretq_s64_s32, vreinterpretq_s64_u64, detail::identity_s64, - vreinterpretq_s64_f32), - detail::make_bitwise_caster_impl(vreinterpretq_f32_u8, vreinterpretq_f32_s8, vreinterpretq_f32_u16, vreinterpretq_f32_s16, - vreinterpretq_f32_u32, vreinterpretq_f32_s32, vreinterpretq_f32_u64, vreinterpretq_f32_s64, - detail::identity_f32)) + detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16, + wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64, + wrap::vreinterpretq_u8_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16, + wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, 
wrap::vreinterpretq_s8_s64, + wrap::vreinterpretq_s8_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16, + wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64, + wrap::vreinterpretq_u16_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16, + wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64, + wrap::vreinterpretq_s16_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16, + wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64, + wrap::vreinterpretq_u32_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16, + wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64, + wrap::vreinterpretq_s32_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16, + wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64, + wrap::vreinterpretq_u64_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16, + wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64, + wrap::vreinterpretq_s64_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, 
wrap::vreinterpretq_f32_s16, + wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64, + wrap::vreinterpretq_f32_f32)) }; using src_register_type = typename batch::register_type; using dst_register_type = typename batch::register_type; @@ -2152,4 +2294,11 @@ namespace xsimd } } +#undef WRAP_BINARY_INT_EXCLUDING_64 +#undef WRAP_BINARY_INT +#undef WRAP_BINARY_FLOAT +#undef WRAP_UNARY_INT_EXCLUDING_64 +#undef WRAP_UNARY_INT +#undef WRAP_UNARY_FLOAT + #endif diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp index 0a22db16a..942f6ec9e 100644 --- a/include/xsimd/arch/xsimd_neon64.hpp +++ b/include/xsimd/arch/xsimd_neon64.hpp @@ -762,6 +762,24 @@ namespace xsimd * bitwise_cast * ****************/ + #define WRAP_CAST(SUFFIX, TYPE) \ + namespace wrap { \ + inline float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) { return ::vreinterpretq_f64_##SUFFIX(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) { return ::vreinterpretq_##SUFFIX##_f64(a); } \ + } + + WRAP_CAST(u8, uint8x16_t) + WRAP_CAST(s8, int8x16_t) + WRAP_CAST(u16, uint16x8_t) + WRAP_CAST(s16, int16x8_t) + WRAP_CAST(u32, uint32x4_t) + WRAP_CAST(s32, int32x4_t) + WRAP_CAST(u64, uint64x2_t) + WRAP_CAST(s64, int64x2_t) + WRAP_CAST(f32, float32x4_t) + + #undef WRAP_CAST + template batch bitwise_cast(batch const& arg, batch const&, requires_arch) { @@ -772,9 +790,9 @@ namespace xsimd uint64x2_t, int64x2_t, float32x4_t>; constexpr caster_type caster = { - std::make_tuple(vreinterpretq_f64_u8, vreinterpretq_f64_s8, vreinterpretq_f64_u16, vreinterpretq_f64_s16, - vreinterpretq_f64_u32, vreinterpretq_f64_s32, vreinterpretq_f64_u64, vreinterpretq_f64_s64, - vreinterpretq_f64_f32) + std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16, + wrap::vreinterpretq_f64_u32, wrap::vreinterpretq_f64_s32, wrap::vreinterpretq_f64_u64, 
wrap::vreinterpretq_f64_s64, + wrap::vreinterpretq_f64_f32) }; using register_type = typename batch::register_type; return caster.apply(register_type(arg)); @@ -808,9 +826,9 @@ namespace xsimd uint64x2_t, int64x2_t, float32x4_t>; constexpr caster_type caster = { - std::make_tuple(vreinterpretq_u8_f64, vreinterpretq_s8_f64, vreinterpretq_u16_f64, vreinterpretq_s16_f64, - vreinterpretq_u32_f64, vreinterpretq_s32_f64, vreinterpretq_u64_f64, vreinterpretq_s64_f64, - vreinterpretq_f32_f64) + std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64, + wrap::vreinterpretq_u32_f64, wrap::vreinterpretq_s32_f64, wrap::vreinterpretq_u64_f64, wrap::vreinterpretq_s64_f64, + wrap::vreinterpretq_f32_f64) }; using src_register_type = typename batch::register_type; using dst_register_type = typename batch::register_type;