diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 341fbde732e2..9e13e3b446aa 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -108,7 +108,8 @@ if use_highway 'src/highway/hwy/abort.cc', 'src/highway/hwy/per_target.cc', 'src/highway/hwy/targets.cc', - 'src/highway/hwy/print.cc' + 'src/highway/hwy/print.cc', + 'src/highway/hwy/aligned_allocator.cc' ], cpp_args: '-DTOOLCHAIN_MISS_ASM_HWCAP_H', include_directories: ['src/highway'], @@ -989,12 +990,12 @@ foreach gen_mtargets : [ ], [ 'loops_unary_complex.dispatch.h', - src_file.process('src/umath/loops_unary_complex.dispatch.c.src'), + 'src/umath/loops_unary_complex.dispatch.cpp', [ AVX512F, [AVX2, FMA3], SSE2, ASIMD, NEON, VSX3, VSX2, - VXE, VX, + VXE, VX, RVV ] ], [ diff --git a/numpy/_core/src/umath/loops_unary_complex.dispatch.c.src b/numpy/_core/src/umath/loops_unary_complex.dispatch.c.src index 052ad464c7a8..c5699200742d 100644 --- a/numpy/_core/src/umath/loops_unary_complex.dispatch.c.src +++ b/numpy/_core/src/umath/loops_unary_complex.dispatch.c.src @@ -90,7 +90,6 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute) npy_intp len = dimensions[0]; npy_intp ssrc = steps[0] / sizeof(@ftype@); npy_intp sdst = steps[1] / sizeof(@ftype@); - if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) && npyv_loadable_stride_@sfx@(ssrc) && npyv_storable_stride_@sfx@(sdst) && steps[0] % sizeof(@ftype@) == 0 diff --git a/numpy/_core/src/umath/loops_unary_complex.dispatch.cpp b/numpy/_core/src/umath/loops_unary_complex.dispatch.cpp new file mode 100644 index 000000000000..e9762e9ddcba --- /dev/null +++ b/numpy/_core/src/umath/loops_unary_complex.dispatch.cpp @@ -0,0 +1,176 @@ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#define PY_SSIZE_T_CLEAN +#include +#include +#include "numpy/ndarraytypes.h" +#include "numpy/npy_common.h" +#include "numpy/npy_math.h" +#include "numpy/utils.h" +#include "fast_loop_macros.h" +#include "loops_utils.h" +#include +#include +#include + +namespace hn = hwy::HWY_NAMESPACE; + +inline float Chypot(float x, float y){ + return npy_hypotf(x, y); +} + +inline double Chypot(double x, double y){ + return npy_hypot(x, y); +} + +template +HWY_ATTR void SuperCabsolute(char** args, + npy_intp const* dimensions, + npy_intp const* steps) { + npy_intp len = dimensions[0]; + npy_intp ssrc = steps[0] / sizeof(T); + npy_intp sdst = steps[1] / sizeof(T); + if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) && + steps[0] % sizeof(T) == 0 && steps[1] % sizeof(T) == 0) { + const T* src = (T*)args[0]; + T* dst = (T*)args[1]; + const hn::ScalableTag d; + const int vstep = hn::Lanes(d); + const int wstep = vstep * 2; + // const int hstep = vstep / 2; + + using TI = hwy::MakeSigned; + const hn::Rebind> di; + auto indices = hwy::AllocateAligned(vstep); + auto load_index = hn::Mul(hn::Iota(di, 0), hn::Set(di, ssrc)); + auto store_index = hn::Mul(hn::Iota(di, 0), hn::Set(di, sdst)); + + using vec_f = hn::Vec; + + // TODO: Optimize HWYComplexabsolute this by + // NPY_FINLINE npyv_@sfx@ + // simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im) + // { + // const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@); + // const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@); + + // re = npyv_abs_@sfx@(re); + // im = npyv_abs_@sfx@(im); + // /* + // * If real or imag = INF, then convert it to inf + j*inf + // * Handles: inf + j*nan, nan + j*inf + // */ + // npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf); + // npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf); + // im = npyv_select_@sfx@(re_infmask, inf, im); + // re = npyv_select_@sfx@(im_infmask, inf, re); + // /* + // * If real or imag = NAN, then convert it to nan + j*nan + // * Handles: x + j*nan, nan + j*x + // */ + // npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re); + // npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im); + // im = npyv_select_@sfx@(re_nnanmask, im, nan); + // re = npyv_select_@sfx@(im_nnanmask, re, nan); + + // npyv_@sfx@ larger = npyv_max_@sfx@(re, im); + // npyv_@sfx@ smaller = npyv_min_@sfx@(im, re); + // /* + // * Calculate div_mask to prevent 0./0. and inf/inf operations in div + // */ + // npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@()); + // npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf); + // npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask)); + + // npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger); + // npyv_@sfx@ hypot = npyv_sqrt_@sfx@( + // npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@) + // )); + // return npyv_mul_@sfx@(hypot, larger); + // } + auto HWYComplexabsolute = [d](vec_f re, vec_f im) { + const auto inf = hn::Set(d, std::is_floating_point::value ? NPY_INFINITYF : NPY_INFINITY); + const auto nan = hn::Set(d, std::is_floating_point::value ? NPY_NANF : NPY_NAN); + re = hn::Abs(re); + im = hn::Abs(im); + + /* + * If real or imag = INF, then convert it to inf + j*inf + * Handles: inf + j*nan, nan + j*inf + */ + auto re_infmask = hn::IsInf(re); + auto im_infmask = hn::IsInf(im); + im = hn::IfThenElse(re_infmask, inf, im); + re = hn::IfThenElse(im_infmask, inf, re); + /* + * If real or imag = NAN, then convert it to nan + j*nan + * Handles: x + j*nan, nan + j*x + */ + auto re_nanmask = hn::IsNaN(re); + auto im_nanmask = hn::IsNaN(im); + im = hn::IfThenElse(re_nanmask, nan, im); + re = hn::IfThenElse(im_nanmask, nan, re); + + auto larger = hn::Max(re, im); + auto smaller = hn::Min(re, im); + /* + * Calculate div_mask to prevent 0./0. and inf/inf operations in div + */ + auto zeromask = hn::Eq(larger, hn::Zero(d)); + auto infmask = hn::IsInf(larger); + auto divmask = hn::Not(hn::Or(infmask, zeromask)); + auto one = hn::Set(d, 1); + auto div = hn::Div(smaller, hn::IfThenElse(divmask, larger, one)); + auto ratio = hn::IfThenElseZero(divmask, div); + + auto hypot = hn::Sqrt(hn::MulAdd(ratio, ratio, hn::Set(d, 1.0))); + auto result = hn::Mul(hypot, larger); + return result; + }; + + if (ssrc == 2 && sdst == 1) { + for (; len >= vstep; len -= vstep, src += wstep, dst += vstep) { + vec_f re, im; + hn::LoadInterleaved2(d, src, re, im); + auto r = HWYComplexabsolute(re, im); + hn::StoreU(r, d, dst); + } + } else { + for (; len >= vstep; + len -= vstep, src += ssrc * vstep, dst += sdst * vstep) { + auto re = hn::GatherIndex(d, src, load_index); + auto im = hn::GatherIndex(d, src + 1, load_index); + auto r = HWYComplexabsolute(re, im); + hn::ScatterIndex(r, d, dst, store_index); + } + } + for (; len > 0; len -= vstep, src += ssrc * vstep, dst += sdst * vstep) { + auto re = hn::GatherIndexN(d, src, load_index, len); + auto im = hn::GatherIndexN(d, src + 1, load_index, len); + auto r = HWYComplexabsolute(re, im); + hn::ScatterIndexN(r, d, dst, store_index, len); + } + } else { + UNARY_LOOP { + const T re = ((T*)ip1)[0]; + const T im = ((T*)ip1)[1]; + *((T*)op1) = Chypot(re, im); + } + } +} + +extern "C" { +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + SuperCabsolute(args, dimensions, steps); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + SuperCabsolute(args, dimensions, steps); +} +} \ No newline at end of file