From 5ba8c482dbfb8ee4e54ef6c9a47b0d00e38919e5 Mon Sep 17 00:00:00 2001 From: Alex Lindsay Date: Fri, 5 Dec 2025 21:49:25 -0700 Subject: [PATCH 1/5] Add bitwise_or API to Communicator --- src/parallel/include/timpi/communicator.h | 9 +++++++++ .../include/timpi/parallel_implementation.h | 14 ++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/parallel/include/timpi/communicator.h b/src/parallel/include/timpi/communicator.h index fa77314..e5147bf 100644 --- a/src/parallel/include/timpi/communicator.h +++ b/src/parallel/include/timpi/communicator.h @@ -464,6 +464,15 @@ class Communicator void maxloc(std::vector & r, std::vector & max_id) const; + /** + * Take a local variable and replace it with the bitwise_or of its values + * on all processors + */ + template + inline + void bitwise_or(T & r) const; + + /** * Take a local variable and replace it with the sum of it's values * on all processors. Containers are replaced element-wise. diff --git a/src/parallel/include/timpi/parallel_implementation.h b/src/parallel/include/timpi/parallel_implementation.h index 26bdc53..5ec622b 100644 --- a/src/parallel/include/timpi/parallel_implementation.h +++ b/src/parallel/include/timpi/parallel_implementation.h @@ -2643,6 +2643,20 @@ inline void Communicator::maxloc(std::vector & r, } } +template +inline void Communicator::bitwise_or(T & timpi_mpi_var(r)) const +{ + if (this->size() > 1) + { + TIMPI_LOG_SCOPE("bitwise_or(scalar)", "Parallel"); + + timpi_call_mpi + (TIMPI_ALLREDUCE(MPI_IN_PLACE, &r, 1, + StandardType(&r), OpFunction::bitwise_or(), + this->get())); + } +} + template inline void Communicator::sum(const T & r, From 72d27f455b2ec4383f139d21f2f02ae141764913 Mon Sep 17 00:00:00 2001 From: Alex Lindsay Date: Fri, 5 Dec 2025 22:10:56 -0700 Subject: [PATCH 2/5] Add other OpFunction Communicator implementations --- src/parallel/include/timpi/communicator.h | 47 +++++++++++++++++++ .../include/timpi/parallel_implementation.h | 36 ++++++++------ 2 files changed, 69 
insertions(+), 14 deletions(-) diff --git a/src/parallel/include/timpi/communicator.h b/src/parallel/include/timpi/communicator.h index e5147bf..27e7696 100644 --- a/src/parallel/include/timpi/communicator.h +++ b/src/parallel/include/timpi/communicator.h @@ -464,6 +464,38 @@ class Communicator void maxloc(std::vector & r, std::vector & max_id) const; + /** + * Take a local variable and replace it with the product of its values + * on all processors + */ + template + inline + void product(T & r) const; + + /** + * Take a local variable and replace it with the logical_and of its values + * on all processors + */ + template + inline + void logical_and(T & r) const; + + /** + * Take a local variable and replace it with the bitwise_and of its values + * on all processors + */ + template + inline + void bitwise_and(T & r) const; + + /** + * Take a local variable and replace it with the logical_or of its values + * on all processors + */ + template + inline + void logical_or(T & r) const; + /** * Take a local variable and replace it with the bitwise_or of its values * on all processors @@ -472,6 +504,21 @@ class Communicator inline void bitwise_or(T & r) const; + /** + * Take a local variable and replace it with the logical_xor of its values + * on all processors + */ + template + inline + void logical_xor(T & r) const; + + /** + * Take a local variable and replace it with the bitwise_xor of its values + * on all processors + */ + template + inline + void bitwise_xor(T & r) const; /** * Take a local variable and replace it with the sum of it's values diff --git a/src/parallel/include/timpi/parallel_implementation.h b/src/parallel/include/timpi/parallel_implementation.h index 5ec622b..d12b6e4 100644 --- a/src/parallel/include/timpi/parallel_implementation.h +++ b/src/parallel/include/timpi/parallel_implementation.h @@ -2643,20 +2643,28 @@ inline void Communicator::maxloc(std::vector & r, } } -template -inline void Communicator::bitwise_or(T & timpi_mpi_var(r)) const -{ - 
if (this->size() > 1) - { - TIMPI_LOG_SCOPE("bitwise_or(scalar)", "Parallel"); - - timpi_call_mpi - (TIMPI_ALLREDUCE(MPI_IN_PLACE, &r, 1, - StandardType(&r), OpFunction::bitwise_or(), - this->get())); - } -} - +#define TIMPI_DEFINE_COMMUNICATOR_OP(OPNAME) \ +template \ +inline void Communicator::OPNAME(T & timpi_mpi_var(r)) const \ +{ \ + if (this->size() > 1) \ + { \ + TIMPI_LOG_SCOPE(#OPNAME "(scalar)", "Parallel"); \ + \ + timpi_call_mpi \ + (TIMPI_ALLREDUCE(MPI_IN_PLACE, &r, 1, \ + StandardType(&r), OpFunction::OPNAME(), \ + this->get())); \ + } \ +} + +TIMPI_DEFINE_COMMUNICATOR_OP(product) +TIMPI_DEFINE_COMMUNICATOR_OP(logical_and) +TIMPI_DEFINE_COMMUNICATOR_OP(bitwise_and) +TIMPI_DEFINE_COMMUNICATOR_OP(logical_or) +TIMPI_DEFINE_COMMUNICATOR_OP(bitwise_or) +TIMPI_DEFINE_COMMUNICATOR_OP(logical_xor) +TIMPI_DEFINE_COMMUNICATOR_OP(bitwise_xor) template inline void Communicator::sum(const T & r, From 09dc9cd819e5b6426ef7198af223ba3bcbd2cae4 Mon Sep 17 00:00:00 2001 From: Alex Lindsay Date: Sat, 6 Dec 2025 11:29:39 -0700 Subject: [PATCH 3/5] Absorb sum,max,min into macro --- .../include/timpi/parallel_implementation.h | 49 ++----------------- 1 file changed, 3 insertions(+), 46 deletions(-) diff --git a/src/parallel/include/timpi/parallel_implementation.h b/src/parallel/include/timpi/parallel_implementation.h index d12b6e4..3fe89d5 100644 --- a/src/parallel/include/timpi/parallel_implementation.h +++ b/src/parallel/include/timpi/parallel_implementation.h @@ -2198,22 +2198,6 @@ inline void Communicator::min(const T & r, -template -inline void Communicator::min(T & timpi_mpi_var(r)) const -{ - if (this->size() > 1) - { - TIMPI_LOG_SCOPE("min(scalar)", "Parallel"); - - timpi_call_mpi - (TIMPI_ALLREDUCE(MPI_IN_PLACE, &r, 1, - StandardType(&r), OpFunction::min(), - this->get())); - } -} - - - template inline void Communicator::min(std::vector & r) const { @@ -2374,20 +2358,6 @@ inline void Communicator::max(const T & r, } -template -inline void Communicator::max(T & 
timpi_mpi_var(r)) const -{ - if (this->size() > 1) - { - TIMPI_LOG_SCOPE("max(scalar)", "Parallel"); - - timpi_call_mpi - (TIMPI_ALLREDUCE (MPI_IN_PLACE, &r, 1, StandardType(&r), - OpFunction::max(), this->get())); - } -} - - template inline void Communicator::max(std::vector & r) const { @@ -2658,6 +2628,9 @@ inline void Communicator::OPNAME(T & timpi_mpi_var(r)) const \ } \ } +TIMPI_DEFINE_COMMUNICATOR_OP(sum) +TIMPI_DEFINE_COMMUNICATOR_OP(max) +TIMPI_DEFINE_COMMUNICATOR_OP(min) TIMPI_DEFINE_COMMUNICATOR_OP(product) TIMPI_DEFINE_COMMUNICATOR_OP(logical_and) TIMPI_DEFINE_COMMUNICATOR_OP(bitwise_and) @@ -2690,22 +2663,6 @@ inline void Communicator::sum(const T & r, } -template -inline void Communicator::sum(T & timpi_mpi_var(r)) const -{ - if (this->size() > 1) - { - TIMPI_LOG_SCOPE("sum()", "Parallel"); - - timpi_call_mpi - (TIMPI_ALLREDUCE(MPI_IN_PLACE, &r, 1, - StandardType(&r), - OpFunction::sum(), - this->get())); - } -} - - template inline void Communicator::sum(std::vector & r) const { From c7ff6e61be7df4084d01d1685a1dc9a2ce5c1695 Mon Sep 17 00:00:00 2001 From: Alex Lindsay Date: Sat, 6 Dec 2025 11:54:00 -0700 Subject: [PATCH 4/5] Also implement vector and scalar non-blocking And absorb existing sum,max,min implementations of those into macro --- src/parallel/include/timpi/communicator.h | 70 ++++++- .../parallel_communicator_specializations | 28 +++ .../include/timpi/parallel_implementation.h | 188 ++++-------------- 3 files changed, 135 insertions(+), 151 deletions(-) diff --git a/src/parallel/include/timpi/communicator.h b/src/parallel/include/timpi/communicator.h index 27e7696..9857fac 100644 --- a/src/parallel/include/timpi/communicator.h +++ b/src/parallel/include/timpi/communicator.h @@ -466,60 +466,116 @@ class Communicator /** * Take a local variable and replace it with the product of its values - * on all processors + * on all processors. Containers are replaced element-wise. 
*/ template inline void product(T & r) const; + /** + * Non-blocking product of the local value \p r into \p o + * with the request \p req. + */ + template + inline + void product(const T & r, T & o, Request & req) const; + /** * Take a local variable and replace it with the logical_and of its values - * on all processors + * on all processors. Containers are replaced element-wise. */ template inline void logical_and(T & r) const; + /** + * Non-blocking logical_and of the local value \p r into \p o + * with the request \p req. + */ + template + inline + void logical_and(const T & r, T & o, Request & req) const; + /** * Take a local variable and replace it with the bitwise_and of its values - * on all processors + * on all processors. Containers are replaced element-wise. */ template inline void bitwise_and(T & r) const; + /** + * Non-blocking bitwise_and of the local value \p r into \p o + * with the request \p req. + */ + template + inline + void bitwise_and(const T & r, T & o, Request & req) const; + /** * Take a local variable and replace it with the logical_or of its values - * on all processors + * on all processors. Containers are replaced element-wise. */ template inline void logical_or(T & r) const; + /** + * Non-blocking logical_or of the local value \p r into \p o + * with the request \p req. + */ + template + inline + void logical_or(const T & r, T & o, Request & req) const; + /** * Take a local variable and replace it with the bitwise_or of its values - * on all processors + * on all processors. Containers are replaced element-wise. */ template inline void bitwise_or(T & r) const; + /** + * Non-blocking bitwise_or of the local value \p r into \p o + * with the request \p req. + */ + template + inline + void bitwise_or(const T & r, T & o, Request & req) const; + /** * Take a local variable and replace it with the logical_xor of its values - * on all processors + * on all processors. Containers are replaced element-wise. 
*/ template inline void logical_xor(T & r) const; + /** + * Non-blocking logical_xor of the local value \p r into \p o + * with the request \p req. + */ + template + inline + void logical_xor(const T & r, T & o, Request & req) const; + /** * Take a local variable and replace it with the bitwise_xor of its values - * on all processors + * on all processors. Containers are replaced element-wise. */ template inline void bitwise_xor(T & r) const; + /** + * Non-blocking bitwise_xor of the local value \p r into \p o + * with the request \p req. + */ + template + inline + void bitwise_xor(const T & r, T & o, Request & req) const; + /** * Take a local variable and replace it with the sum of it's values * on all processors. Containers are replaced element-wise. diff --git a/src/parallel/include/timpi/parallel_communicator_specializations b/src/parallel/include/timpi/parallel_communicator_specializations index fd59c43..56b3485 100644 --- a/src/parallel/include/timpi/parallel_communicator_specializations +++ b/src/parallel/include/timpi/parallel_communicator_specializations @@ -98,6 +98,34 @@ inline void sum(std::unordered_map &r) const; + template + inline + void product(std::vector &r) const; + + template + inline + void logical_and(std::vector &r) const; + + template + inline + void bitwise_and(std::vector &r) const; + + template + inline + void logical_or(std::vector &r) const; + + template + inline + void bitwise_or(std::vector &r) const; + + template + inline + void logical_xor(std::vector &r) const; + + template + inline + void bitwise_xor(std::vector &r) const; + template inline void set_union(std::set &data, diff --git a/src/parallel/include/timpi/parallel_implementation.h b/src/parallel/include/timpi/parallel_implementation.h index 3fe89d5..947aad2 100644 --- a/src/parallel/include/timpi/parallel_implementation.h +++ b/src/parallel/include/timpi/parallel_implementation.h @@ -2174,48 +2174,6 @@ inline bool Communicator::semiverify(const std::vector * r) const - 
-template -inline void Communicator::min(const T & r, - T & o, - Request & req) const -{ - if (this->size() > 1) - { - TIMPI_LOG_SCOPE("min()", "Parallel"); - - timpi_call_mpi - (TIMPI_IALLREDUCE(&r, &o, 1, StandardType(&r), - OpFunction::min(), this->get(), - req.get())); - } - else - { - o = r; - req = Request::null_request; - } -} - - - -template -inline void Communicator::min(std::vector & r) const -{ - if (this->size() > 1 && !r.empty()) - { - TIMPI_LOG_SCOPE("min(vector)", "Parallel"); - - timpi_assert(this->verify(r.size())); - - timpi_call_mpi - (TIMPI_ALLREDUCE - (MPI_IN_PLACE, r.data(), cast_int(r.size()), - StandardType(r.data()), OpFunction::min(), - this->get())); - } -} - - template inline void Communicator::min(std::vector & r) const { @@ -2335,46 +2293,7 @@ inline void Communicator::minloc(std::vector & r, } } - -template -inline void Communicator::max(const T & r, - T & o, - Request & req) const -{ - if (this->size() > 1) - { - TIMPI_LOG_SCOPE("max()", "Parallel"); - - timpi_call_mpi - (TIMPI_IALLREDUCE(&r, &o, 1, StandardType(&r), - OpFunction::max(), this->get(), - req.get())); - } - else - { - o = r; - req = Request::null_request; - } -} - - -template -inline void Communicator::max(std::vector & r) const -{ - if (this->size() > 1 && !r.empty()) - { - TIMPI_LOG_SCOPE("max(vector)", "Parallel"); - - timpi_assert(this->verify(r.size())); - - timpi_call_mpi - (TIMPI_ALLREDUCE (MPI_IN_PLACE, r.data(), - cast_int(r.size()), - StandardType(r.data()), - OpFunction::max(), this->get())); - } -} - + template inline void Communicator::max(std::vector & r) const @@ -2613,73 +2532,54 @@ inline void Communicator::maxloc(std::vector & r, } } -#define TIMPI_DEFINE_COMMUNICATOR_OP(OPNAME) \ -template \ -inline void Communicator::OPNAME(T & timpi_mpi_var(r)) const \ -{ \ - if (this->size() > 1) \ - { \ +#define TIMPI_DEFINE_COMMUNICATOR_OPS(OPNAME) \ + template \ + inline void Communicator::OPNAME(T &timpi_mpi_var(r)) const { \ + if (this->size() > 1) { \ 
TIMPI_LOG_SCOPE(#OPNAME "(scalar)", "Parallel"); \ \ - timpi_call_mpi \ - (TIMPI_ALLREDUCE(MPI_IN_PLACE, &r, 1, \ - StandardType(&r), OpFunction::OPNAME(), \ - this->get())); \ + timpi_call_mpi(TIMPI_ALLREDUCE(MPI_IN_PLACE, &r, 1, StandardType(&r), \ + OpFunction::OPNAME(), this->get())); \ } \ -} - -TIMPI_DEFINE_COMMUNICATOR_OP(sum) -TIMPI_DEFINE_COMMUNICATOR_OP(max) -TIMPI_DEFINE_COMMUNICATOR_OP(min) -TIMPI_DEFINE_COMMUNICATOR_OP(product) -TIMPI_DEFINE_COMMUNICATOR_OP(logical_and) -TIMPI_DEFINE_COMMUNICATOR_OP(bitwise_and) -TIMPI_DEFINE_COMMUNICATOR_OP(logical_or) -TIMPI_DEFINE_COMMUNICATOR_OP(bitwise_or) -TIMPI_DEFINE_COMMUNICATOR_OP(logical_xor) -TIMPI_DEFINE_COMMUNICATOR_OP(bitwise_xor) - -template -inline void Communicator::sum(const T & r, - T & o, - Request & req) const -{ -#ifdef TIMPI_HAVE_MPI - if (this->size() > 1) - { - TIMPI_LOG_SCOPE("sum()", "Parallel"); - - timpi_call_mpi - (TIMPI_IALLREDUCE(&r, &o, 1, StandardType(&r), - OpFunction::sum(), this->get(), - req.get())); - } - else -#endif - { - o = r; - req = Request::null_request; - } -} - - -template -inline void Communicator::sum(std::vector & r) const -{ - if (this->size() > 1 && !r.empty()) - { - TIMPI_LOG_SCOPE("sum()", "Parallel"); + } \ + \ + template \ + inline void Communicator::OPNAME(std::vector &r) const { \ + if (this->size() > 1 && !r.empty()) { \ + TIMPI_LOG_SCOPE(#OPNAME "(vector)", "Parallel"); \ + \ + timpi_assert(this->verify(r.size())); \ + \ + timpi_call_mpi(TIMPI_ALLREDUCE( \ + MPI_IN_PLACE, r.data(), cast_int(r.size()), \ + StandardType(r.data()), OpFunction::OPNAME(), this->get())); \ + } \ + } \ + template \ + inline void Communicator::OPNAME(const T &r, T &o, Request &req) const { \ + if (this->size() > 1) { \ + TIMPI_LOG_SCOPE(#OPNAME "(scalar, nonblocking)", "Parallel"); \ + \ + timpi_call_mpi(TIMPI_IALLREDUCE(&r, &o, 1, StandardType(&r), \ + OpFunction::OPNAME(), this->get(), \ + req.get())); \ + } else { \ + o = r; \ + req = Request::null_request; \ + } \ + } - 
timpi_assert(this->verify(r.size())); +TIMPI_DEFINE_COMMUNICATOR_OPS(sum) +TIMPI_DEFINE_COMMUNICATOR_OPS(max) +TIMPI_DEFINE_COMMUNICATOR_OPS(min) +TIMPI_DEFINE_COMMUNICATOR_OPS(product) +TIMPI_DEFINE_COMMUNICATOR_OPS(logical_and) +TIMPI_DEFINE_COMMUNICATOR_OPS(bitwise_and) +TIMPI_DEFINE_COMMUNICATOR_OPS(logical_or) +TIMPI_DEFINE_COMMUNICATOR_OPS(bitwise_or) +TIMPI_DEFINE_COMMUNICATOR_OPS(logical_xor) +TIMPI_DEFINE_COMMUNICATOR_OPS(bitwise_xor) - timpi_call_mpi - (TIMPI_ALLREDUCE(MPI_IN_PLACE, r.data(), - cast_int(r.size()), - StandardType(r.data()), - OpFunction::sum(), - this->get())); - } -} // We still do function overloading for complex sums - in a perfect From 780c643acdaa82228fc60e8c7c9bda17afe73183 Mon Sep 17 00:00:00 2001 From: Alex Lindsay Date: Sat, 6 Dec 2025 11:55:49 -0700 Subject: [PATCH 5/5] Add blocking label to blocking all reduces --- src/parallel/include/timpi/parallel_implementation.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/parallel/include/timpi/parallel_implementation.h b/src/parallel/include/timpi/parallel_implementation.h index 947aad2..1e28a09 100644 --- a/src/parallel/include/timpi/parallel_implementation.h +++ b/src/parallel/include/timpi/parallel_implementation.h @@ -2293,7 +2293,7 @@ inline void Communicator::minloc(std::vector & r, } } - + template inline void Communicator::max(std::vector & r) const @@ -2532,11 +2532,11 @@ inline void Communicator::maxloc(std::vector & r, } } -#define TIMPI_DEFINE_COMMUNICATOR_OPS(OPNAME) \ +#define TIMPI_DEFINE_COMMUNICATOR_OPS(OPNAME) \ template \ inline void Communicator::OPNAME(T &timpi_mpi_var(r)) const { \ if (this->size() > 1) { \ - TIMPI_LOG_SCOPE(#OPNAME "(scalar)", "Parallel"); \ + TIMPI_LOG_SCOPE(#OPNAME "(scalar, blocking)", "Parallel"); \ \ timpi_call_mpi(TIMPI_ALLREDUCE(MPI_IN_PLACE, &r, 1, StandardType(&r), \ OpFunction::OPNAME(), this->get())); \ @@ -2546,7 +2546,7 @@ inline void Communicator::maxloc(std::vector & r, template \ inline void 
Communicator::OPNAME(std::vector &r) const { \ if (this->size() > 1 && !r.empty()) { \ - TIMPI_LOG_SCOPE(#OPNAME "(vector)", "Parallel"); \ + TIMPI_LOG_SCOPE(#OPNAME "(vector, blocking)", "Parallel"); \ \ timpi_assert(this->verify(r.size())); \ \