From 84993aa622070aae7b7bd112789fb8af57131e13 Mon Sep 17 00:00:00 2001
From: ledwards2225 <l.edwards.d@gmail.com>
Date: Thu, 22 Jun 2023 19:08:26 +0000
Subject: [PATCH 1/7] basic multithreading working

---
 .../composer/standard_honk_composer.test.cpp  |   3 +-
 .../honk/sumcheck/sumcheck_round.hpp          | 134 ++++++++++++++++--
 .../honk/sumcheck/sumcheck_round.test.cpp     |  35 +++++
 3 files changed, 156 insertions(+), 16 deletions(-)
diff --git a/cpp/src/barretenberg/honk/composer/standard_honk_composer.test.cpp b/cpp/src/barretenberg/honk/composer/standard_honk_composer.test.cpp
index 04ada2ef9b..76c632124f 100644
--- a/cpp/src/barretenberg/honk/composer/standard_honk_composer.test.cpp
+++ b/cpp/src/barretenberg/honk/composer/standard_honk_composer.test.cpp
@@ -391,7 +391,8 @@ TEST_F(StandardHonkComposerTests, SumcheckEvaluations)
         uint32_t b_idx = circuit_constructor.add_variable(b);
         uint32_t c_idx = circuit_constructor.add_variable(c);
         uint32_t d_idx = circuit_constructor.add_variable(d);
-        for (size_t i = 0; i < 16; i++) {
+        size_t num_iterations = 1 << 16;
+        for (size_t i = 0; i < num_iterations; i++) {
             circuit_constructor.create_add_gate(
                 { a_idx, b_idx, c_idx, fr::one(), fr::one(), fr::neg_one(), fr::zero() });
             circuit_constructor.create_add_gate(
diff --git a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
index a90d422e51..82733a0ed9 100644
--- a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
+++ b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
@@ -2,7 +2,9 @@
 #include "barretenberg/common/log.hpp"
 #include <array>
 #include <algorithm>
+#include <omp.h>
 #include <tuple>
+#include "barretenberg/common/thread.hpp"
 #include "polynomials/barycentric_data.hpp"
 #include "polynomials/univariate.hpp"
 #include "polynomials/pow.hpp"
@@ -120,7 +122,7 @@ template <typename Flavor> class SumcheckRound {
      * In practice, multivariates is one of ProverPolynomials or FoldedPolynomials.
      *
      */
-    void extend_edges(auto& multivariates, size_t edge_idx)
+    void extend_edges(auto& extended_edges, auto& multivariates, size_t edge_idx)
     {
         size_t univariate_idx = 0; // TODO(#391) zip
         for (auto& poly : multivariates) {
@@ -140,20 +142,82 @@ template <typename Flavor> class SumcheckRound {
                                                            const PowUnivariate<FF>& pow_univariate,
                                                            const FF alpha)
     {
-        // For each edge_idx = 2i, we need to multiply the whole contribution by zeta^{2^{2i}}
-        // This means that each univariate for each relation needs an extra multiplication.
-        FF pow_challenge = pow_univariate.partial_evaluation_constant;
-        for (size_t edge_idx = 0; edge_idx < round_size; edge_idx += 2) {
-            extend_edges(polynomials, edge_idx);
-
-            // Compute the i-th edge's univariate contribution,
-            // scale it by the pow polynomial's constant and zeta power "c_l ⋅ ζ_{l+1}ⁱ"
-            // and add it to the accumulators for Sˡ(Xₗ)
-            accumulate_relation_univariates<>(relation_parameters, pow_challenge);
-            // Update the pow polynomial's contribution c_l ⋅ ζ_{l+1}ⁱ for the next edge.
-            pow_challenge *= pow_univariate.zeta_pow_sqr;
+        // Precompute the vector of required powers of zeta
+        std::vector<FF> pow_challenges;
+        pow_challenges.resize(round_size >> 1);
+        pow_challenges[0] = pow_univariate.partial_evaluation_constant;
+        for (size_t i = 1; i < (round_size >> 1); ++i) {
+            pow_challenges[i] = pow_challenges[i - 1] * pow_univariate.zeta_pow_sqr;
         }
 
+        // TODO(luke): should we always use multiple threads but simply reduce the number of threads to ensure
+        // sufficient iterations per thread?
+        size_t num_threads = get_num_cpus();
+        size_t iterations_per_thread = round_size / num_threads;
+        size_t min_iterations_per_thread = 1 << 6;
+
+        // use multithreading only if operations per thread meets minimum threshold
+        bool use_multithreading = (iterations_per_thread >= min_iterations_per_thread);
+
+        if (use_multithreading) {
+            info("round size = ", round_size);
+
+            std::vector<RelationUnivariates> partial_univariate_accumulators;
+            partial_univariate_accumulators.resize(num_threads);
+            for (auto& accum : partial_univariate_accumulators) {
+                zero_univariates(accum);
+            }
+
+            std::vector<ExtendedEdges<MAX_RELATION_LENGTH>> partial_extended_edges;
+            partial_extended_edges.resize(num_threads);
+
+            omp_set_num_threads(int(num_threads));
+
+            // std::array<size_t, 4> idxs = { 3, 0, 2, 1 };
+            // for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx)
+            parallel_for(num_threads, [&](size_t thread_idx) {
+                // info("omp_get_num_threads() = ", omp_get_num_threads());
+                size_t start = thread_idx * iterations_per_thread;
+                size_t end = (thread_idx + 1) * iterations_per_thread;
+
+                // For each edge_idx = 2i, we need to multiply the whole contribution by zeta^{2^{2i}}
+                // This means that each univariate for each relation needs an extra multiplication.
+                for (size_t edge_idx = start; edge_idx < end; edge_idx += 2) {
+                    extend_edges(partial_extended_edges[thread_idx], polynomials, edge_idx);
+
+                    // Update the pow polynomial's contribution c_l ⋅ ζ_{l+1}ⁱ for the next edge.
+                    FF pow_challenge = pow_challenges[edge_idx >> 1];
+
+                    // Compute the i-th edge's univariate contribution,
+                    // scale it by the pow polynomial's constant and zeta power "c_l ⋅ ζ_{l+1}ⁱ"
+                    // and add it to the accumulators for Sˡ(Xₗ)
+                    accumulate_relation_univariates<>(partial_univariate_accumulators[thread_idx],
+                                                      partial_extended_edges[thread_idx],
+                                                      relation_parameters,
+                                                      pow_challenge);
+                }
+            }); // parallel_for
+
+            // Accumulate the partial univariate accumulators from each thread into the full accumulators
+            for (auto& partial_accum : partial_univariate_accumulators) {
+                add_nested_tuples(univariate_accumulators, partial_accum);
+            }
+        } else {
+            // For each edge_idx = 2i, we need to multiply the whole contribution by zeta^{2^{2i}}
+            // This means that each univariate for each relation needs an extra multiplication.
+            for (size_t edge_idx = 0; edge_idx < round_size; edge_idx += 2) {
+                extend_edges(extended_edges, polynomials, edge_idx);
+
+                // Update the pow polynomial's contribution c_l ⋅ ζ_{l+1}ⁱ for the next edge.
+                FF pow_challenge = pow_challenges[edge_idx >> 1];
+
+                // Compute the i-th edge's univariate contribution,
+                // scale it by the pow polynomial's constant and zeta power "c_l ⋅ ζ_{l+1}ⁱ"
+                // and add it to the accumulators for Sˡ(Xₗ)
+                accumulate_relation_univariates<>(
+                    univariate_accumulators, extended_edges, relation_parameters, pow_challenge);
+            }
+        }
         return batch_over_relations(alpha);
     }
 
@@ -238,14 +302,18 @@ template <typename Flavor> class SumcheckRound {
      * appropriate scaling factors, produces S_l.
      */
     template <size_t relation_idx = 0>
-    void accumulate_relation_univariates(const RelationParameters<FF>& relation_parameters, const FF& scaling_factor)
+    void accumulate_relation_univariates(RelationUnivariates& univariate_accumulators,
+                                         const auto& extended_edges,
+                                         const RelationParameters<FF>& relation_parameters,
+                                         const FF& scaling_factor)
     {
         std::get<relation_idx>(relations).add_edge_contribution(
             std::get<relation_idx>(univariate_accumulators), extended_edges, relation_parameters, scaling_factor);
 
         // Repeat for the next relation.
         if constexpr (relation_idx + 1 < NUM_RELATIONS) {
-            accumulate_relation_univariates<relation_idx + 1>(relation_parameters, scaling_factor);
+            accumulate_relation_univariates<relation_idx + 1>(
+                univariate_accumulators, extended_edges, relation_parameters, scaling_factor);
         }
     }
 
@@ -401,5 +469,41 @@ template <typename Flavor> class SumcheckRound {
             apply_to_tuple_of_arrays<Operation, idx + 1>(operation, tuple);
         }
     }
+
+    /**
+     * @brief Componentwise addition of two tuples
+     * @details Used for adding tuples of Univariates but in general works for any object for which += is
+     * defined. The result is stored in the first tuple.
+     *
+     * @tparam T Type of the elements contained in the tuples
+     * @param tuple_1 First summand. Result stored in this tuple
+     * @param tuple_2 Second summand
+     */
+    template <typename... T>
+    static constexpr void add_tuples(std::tuple<T...>& tuple_1, const std::tuple<T...>& tuple_2)
+    {
+        [&]<std::size_t... I>(std::index_sequence<I...>) { ((std::get<I>(tuple_1) += std::get<I>(tuple_2)), ...); }
+        (std::make_index_sequence<sizeof...(T)>{});
+    }
+
+    /**
+     * @brief Componentwise addition of nested tuples (tuples of tuples)
+     * @details Used for summing tuples of tuples of Univariates. Needed for Sumcheck multithreading. Each thread
+     * accumulates relation contributions across a portion of the hypercube and then the results are accumulated into a
+     * single nested tuple.
+     *
+     * @tparam Tuple
+     * @tparam Index Index into outer tuple
+     * @param tuple_1 First nested tuple summand. Result stored here
+     * @param tuple_2 Second summand
+     */
+    template <typename Tuple, std::size_t Index = 0>
+    static constexpr void add_nested_tuples(Tuple& tuple_1, const Tuple& tuple_2)
+    {
+        if constexpr (Index < std::tuple_size<Tuple>::value) {
+            add_tuples(std::get<Index>(tuple_1), std::get<Index>(tuple_2));
+            add_nested_tuples<Tuple, Index + 1>(tuple_1, tuple_2);
+        }
+    }
 };
 } // namespace proof_system::honk::sumcheck
diff --git a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.test.cpp b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.test.cpp
index c5e7260be4..cdc1db9219 100644
--- a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.test.cpp
+++ b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.test.cpp
@@ -361,4 +361,39 @@ TEST(SumcheckRound, TuplesOfEvaluationArrays)
     EXPECT_EQ(std::get<1>(tuple_of_arrays)[1], 0);
 }
 
+/**
+ * @brief Test utility functions for adding two tuples of tuples of Univariates
+ *
+ */
+TEST(SumcheckRound, AddTuplesOfTuplesOfUnivariates)
+{
+    using Flavor = proof_system::honk::flavor::Standard;
+    using FF = typename Flavor::FF;
+
+    // Define some arbitrary univariates
+    Univariate<FF, 2> univariate_1({ 1, 2 });
+    Univariate<FF, 2> univariate_2({ 2, 4 });
+    Univariate<FF, 3> univariate_3({ 3, 4, 5 });
+
+    Univariate<FF, 2> univariate_4({ 3, 6 });
+    Univariate<FF, 2> univariate_5({ 8, 1 });
+    Univariate<FF, 3> univariate_6({ 3, 7, 1 });
+
+    Univariate<FF, 2> expected_sum_1 = univariate_1 + univariate_4;
+    Univariate<FF, 2> expected_sum_2 = univariate_2 + univariate_5;
+    Univariate<FF, 3> expected_sum_3 = univariate_3 + univariate_6;
+
+    // Construct two tuples of tuples
+    auto tuple_of_tuples_1 =
+        std::make_tuple(std::make_tuple(univariate_1), std::make_tuple(univariate_2, univariate_3));
+    auto tuple_of_tuples_2 =
+        std::make_tuple(std::make_tuple(univariate_4), std::make_tuple(univariate_5, univariate_6));
+
+    SumcheckRound<Flavor>::add_nested_tuples(tuple_of_tuples_1, tuple_of_tuples_2);
+
+    EXPECT_EQ(std::get<0>(std::get<0>(tuple_of_tuples_1)), expected_sum_1);
+    EXPECT_EQ(std::get<0>(std::get<1>(tuple_of_tuples_1)), expected_sum_2);
+    EXPECT_EQ(std::get<1>(std::get<1>(tuple_of_tuples_1)), expected_sum_3);
+}
+
 } // namespace test_sumcheck_round

From 5f3c23e11a8f738491e1d6b8b1da5f0336184fc1 Mon Sep 17 00:00:00 2001
From: ledwards2225 <l.edwards.d@gmail.com>
Date: Thu, 22 Jun 2023 21:27:42 +0000
Subject: [PATCH 2/7] cleanup

---
 .../compare_honk_branch_vs_baseline.sh        |  1 +
 .../honk/proof_system/ultra_prover.cpp        | 21 +++++
 .../honk/sumcheck/sumcheck_round.hpp          | 91 +++++++------------
 3 files changed, 57 insertions(+), 56 deletions(-)

diff --git a/cpp/src/barretenberg/benchmark/honk_bench/compare_honk_branch_vs_baseline.sh b/cpp/src/barretenberg/benchmark/honk_bench/compare_honk_branch_vs_baseline.sh
index 71bf3ee5b9..612514c414 100755
--- a/cpp/src/barretenberg/benchmark/honk_bench/compare_honk_branch_vs_baseline.sh
+++ b/cpp/src/barretenberg/benchmark/honk_bench/compare_honk_branch_vs_baseline.sh
@@ -35,6 +35,7 @@ bin/$BENCH_TARGET --benchmark_format=json > $BRANCH_RESULTS
 # Checkout baseline branch, run benchmarks, save results in json format
 echo -e "\nConfiguring and building $BENCH_TARGET in $BASELINE_BRANCH branch..\n"
 git checkout master > /dev/null
+cd $BASE_DIR
 rm -rf $BUILD_DIR
 cmake --preset bench > /dev/null && cmake --build --preset bench --target $BENCH_TARGET
 cd build-bench
diff --git a/cpp/src/barretenberg/honk/proof_system/ultra_prover.cpp b/cpp/src/barretenberg/honk/proof_system/ultra_prover.cpp
index ba61525755..2f90b42736 100644
--- a/cpp/src/barretenberg/honk/proof_system/ultra_prover.cpp
+++ b/cpp/src/barretenberg/honk/proof_system/ultra_prover.cpp
@@ -1,6 +1,7 @@
 #include "ultra_prover.hpp"
 #include <algorithm>
 #include <cstddef>
+#include "barretenberg/common/timer.hpp"
 #include "barretenberg/honk/proof_system/prover_library.hpp"
 #include "barretenberg/honk/sumcheck/sumcheck.hpp"
 #include <array>
@@ -292,47 +293,67 @@ template <UltraFlavor Flavor> plonk::proof& UltraProver_<Flavor>::export_proof()
 
 template <UltraFlavor Flavor> plonk::proof& UltraProver_<Flavor>::construct_proof()
 {
+    Timer timer;
     // Add circuit size public input size and public inputs to transcript.
     execute_preamble_round();
+    info("execute_preamble_round: ", timer.seconds(), "s");
 
+    Timer timer2;
     // Compute first three wire commitments
     execute_wire_commitments_round();
     queue.process_queue();
+    info("execute_wire_commitments_round: ", timer2.seconds(), "s");
 
+    Timer timer3;
     // Compute sorted list accumulator and commitment
     execute_sorted_list_accumulator_round();
     queue.process_queue();
+    info("execute_sorted_list_accumulator_round: ", timer3.seconds(), "s");
 
+    Timer timer4;
     // Fiat-Shamir: beta & gamma
     // Compute grand product(s) and commitments.
     execute_grand_product_computation_round();
     queue.process_queue();
+    info("execute_grand_product_computation_round: ", timer4.seconds(), "s");
 
+    Timer timer5;
     // Fiat-Shamir: alpha
     // Run sumcheck subprotocol.
     execute_relation_check_rounds();
+    info("execute_relation_check_rounds: ", timer5.seconds(), "s");
 
+    Timer timer6;
     // Fiat-Shamir: rho
     // Compute Fold polynomials and their commitments.
     execute_univariatization_round();
     queue.process_queue();
+    info("execute_univariatization_round: ", timer6.seconds(), "s");
 
+    Timer timer7;
     // Fiat-Shamir: r
     // Compute Fold evaluations
     execute_pcs_evaluation_round();
+    info("execute_pcs_evaluation_round: ", timer7.seconds(), "s");
 
+    Timer timer8;
     // Fiat-Shamir: nu
     // Compute Shplonk batched quotient commitment Q
     execute_shplonk_batched_quotient_round();
     queue.process_queue();
+    info("execute_shplonk_batched_quotient_round: ", timer8.seconds(), "s");
 
+    Timer timer9;
     // Fiat-Shamir: z
     // Compute partial evaluation Q_z
     execute_shplonk_partial_evaluation_round();
+    info("execute_shplonk_partial_evaluation_round: ", timer9.seconds(), "s");
 
+    Timer timer10;
     // Fiat-Shamir: z
     // Compute PCS opening proof (either KZG quotient commitment or IPA opening proof)
     execute_final_pcs_round();
+    info("execute_final_pcs_round: ", timer10.seconds(), "s");
 
     return export_proof();
 }
diff --git a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
index 82733a0ed9..338a314282 100644
--- a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
+++ b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
@@ -80,8 +80,6 @@ template <typename Flavor> class SumcheckRound {
     RelationUnivariates univariate_accumulators;
     RelationEvaluations relation_evaluations;
 
-    ExtendedEdges<MAX_RELATION_LENGTH> extended_edges;
-
     // TODO(#224)(Cody): this should go away
     BarycentricData<FF, 2, MAX_RELATION_LENGTH> barycentric_2_to_max = BarycentricData<FF, 2, MAX_RELATION_LENGTH>();
 
@@ -150,63 +148,36 @@ template <typename Flavor> class SumcheckRound {
             pow_challenges[i] = pow_challenges[i - 1] * pow_univariate.zeta_pow_sqr;
         }
 
-        // TODO(luke): should we always use multiple threads but simply reduce the number of threads to ensure
-        // sufficient iterations per thread?
-        size_t num_threads = get_num_cpus();
-        size_t iterations_per_thread = round_size / num_threads;
-        size_t min_iterations_per_thread = 1 << 6;
-
-        // use multithreading only if operations per thread meets minimum threshold
-        bool use_multithreading = (iterations_per_thread >= min_iterations_per_thread);
+        // Determine number of threads for multithreading.
+        // Note: Multithreading is "on" for every round but we reduce the number of threads from the max available based
+        // on a specified minimum number of iterations per thread. This eventually leads to the use of a single thread.
+        size_t max_num_threads = get_num_cpus();   // number of available threads
+        size_t min_iterations_per_thread = 1 << 6; // min number of iterations for which we'll spin up a unique thread
+        size_t desired_num_threads = round_size / min_iterations_per_thread;
+        size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
+        num_threads = num_threads > 0 ? num_threads : 1;                     // ensure num threads is >= 1
+        size_t iterations_per_thread = round_size / num_threads;             // actual iterations per thread
+
+        // Constuct univariate accumulator containers; one per thread
+        std::vector<RelationUnivariates> thread_univariate_accumulators;
+        thread_univariate_accumulators.resize(num_threads);
+        for (auto& accum : thread_univariate_accumulators) {
+            zero_univariates(accum);
+        }
 
-        if (use_multithreading) {
-            info("round size = ", round_size);
+        // Construct extended edge containers; one per thread
+        std::vector<ExtendedEdges<MAX_RELATION_LENGTH>> extended_edges;
+        extended_edges.resize(num_threads);
 
-            std::vector<RelationUnivariates> partial_univariate_accumulators;
-            partial_univariate_accumulators.resize(num_threads);
-            for (auto& accum : partial_univariate_accumulators) {
-                zero_univariates(accum);
-            }
+        // Accumulate the contribution from each sub-relation across each edge of the hyper-cube
+        parallel_for(num_threads, [&](size_t thread_idx) {
+            size_t start = thread_idx * iterations_per_thread;
+            size_t end = (thread_idx + 1) * iterations_per_thread;
 
-            std::vector<ExtendedEdges<MAX_RELATION_LENGTH>> partial_extended_edges;
-            partial_extended_edges.resize(num_threads);
-
-            omp_set_num_threads(int(num_threads));
-
-            // std::array<size_t, 4> idxs = { 3, 0, 2, 1 };
-            // for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx)
-            parallel_for(num_threads, [&](size_t thread_idx) {
-                // info("omp_get_num_threads() = ", omp_get_num_threads());
-                size_t start = thread_idx * iterations_per_thread;
-                size_t end = (thread_idx + 1) * iterations_per_thread;
-
-                // For each edge_idx = 2i, we need to multiply the whole contribution by zeta^{2^{2i}}
-                // This means that each univariate for each relation needs an extra multiplication.
-                for (size_t edge_idx = start; edge_idx < end; edge_idx += 2) {
-                    extend_edges(partial_extended_edges[thread_idx], polynomials, edge_idx);
-
-                    // Update the pow polynomial's contribution c_l ⋅ ζ_{l+1}ⁱ for the next edge.
-                    FF pow_challenge = pow_challenges[edge_idx >> 1];
-
-                    // Compute the i-th edge's univariate contribution,
-                    // scale it by the pow polynomial's constant and zeta power "c_l ⋅ ζ_{l+1}ⁱ"
-                    // and add it to the accumulators for Sˡ(Xₗ)
-                    accumulate_relation_univariates<>(partial_univariate_accumulators[thread_idx],
-                                                      partial_extended_edges[thread_idx],
-                                                      relation_parameters,
-                                                      pow_challenge);
-                }
-            }); // parallel_for
-
-            // Accumulate the partial univariate accumulators from each thread into the full accumulators
-            for (auto& partial_accum : partial_univariate_accumulators) {
-                add_nested_tuples(univariate_accumulators, partial_accum);
-            }
-        } else {
             // For each edge_idx = 2i, we need to multiply the whole contribution by zeta^{2^{2i}}
             // This means that each univariate for each relation needs an extra multiplication.
-            for (size_t edge_idx = 0; edge_idx < round_size; edge_idx += 2) {
-                extend_edges(extended_edges, polynomials, edge_idx);
+            for (size_t edge_idx = start; edge_idx < end; edge_idx += 2) {
+                extend_edges(extended_edges[thread_idx], polynomials, edge_idx);
 
                 // Update the pow polynomial's contribution c_l ⋅ ζ_{l+1}ⁱ for the next edge.
                 FF pow_challenge = pow_challenges[edge_idx >> 1];
@@ -214,10 +185,18 @@ template <typename Flavor> class SumcheckRound {
                 // Compute the i-th edge's univariate contribution,
                 // scale it by the pow polynomial's constant and zeta power "c_l ⋅ ζ_{l+1}ⁱ"
                 // and add it to the accumulators for Sˡ(Xₗ)
-                accumulate_relation_univariates<>(
-                    univariate_accumulators, extended_edges, relation_parameters, pow_challenge);
+                accumulate_relation_univariates<>(thread_univariate_accumulators[thread_idx],
+                                                  extended_edges[thread_idx],
+                                                  relation_parameters,
+                                                  pow_challenge);
             }
+        }); // parallel_for
+
+        // Accumulate the per-thread univariate accumulators into a single accumulator
+        for (auto& accumulators : thread_univariate_accumulators) {
+            add_nested_tuples(univariate_accumulators, accumulators);
         }
+        // Batch the univariate contributions from each sub-relation to obtain the round univariate
         return batch_over_relations(alpha);
     }
 

From 61691e5bd1dc7081278e7a6f306ec5dac8c178dc Mon Sep 17 00:00:00 2001
From: ledwards2225 <l.edwards.d@gmail.com>
Date: Thu, 22 Jun 2023 21:45:19 +0000
Subject: [PATCH 3/7] remove omp header

---
 cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
index 338a314282..994f5680aa 100644
--- a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
+++ b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
@@ -2,7 +2,6 @@
 #include "barretenberg/common/log.hpp"
 #include <array>
 #include <algorithm>
-#include <omp.h>
 #include <tuple>
 #include "barretenberg/common/thread.hpp"
 #include "polynomials/barycentric_data.hpp"

From 336cabc11533e063abaa56679489ed2348caa765 Mon Sep 17 00:00:00 2001
From: ledwards2225 <l.edwards.d@gmail.com>
Date: Thu, 22 Jun 2023 22:53:42 +0000
Subject: [PATCH 4/7] fix bug in sumcheck round test

---
 cpp/src/barretenberg/honk/sumcheck/sumcheck_round.test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.test.cpp b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.test.cpp
index cdc1db9219..30fbbe543e 100644
--- a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.test.cpp
+++ b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.test.cpp
@@ -36,7 +36,7 @@ static Univariate<FF, max_relation_length> compute_round_univariate(
     const RelationParameters<FF>& relation_parameters,
     const FF alpha)
 {
-    size_t round_size = 1;
+    size_t round_size = 2;
     // Improvement(Cody): This is ugly? Maye supply some/all of this data through "flavor" class?
     auto round = SumcheckRound<Flavor>(round_size);
 

From 5b76ea49264c4c130d57909b16fa9a5a98608793 Mon Sep 17 00:00:00 2001
From: ledwards2225 <l.edwards.d@gmail.com>
Date: Fri, 23 Jun 2023 15:01:35 +0000
Subject: [PATCH 5/7] remove debug code

---
 .../honk/proof_system/ultra_prover.cpp        | 21 -------------------
 .../honk/sumcheck/sumcheck_round.hpp          |  7 ++++---
 2 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/cpp/src/barretenberg/honk/proof_system/ultra_prover.cpp b/cpp/src/barretenberg/honk/proof_system/ultra_prover.cpp
index 2f90b42736..ba61525755 100644
--- a/cpp/src/barretenberg/honk/proof_system/ultra_prover.cpp
+++ b/cpp/src/barretenberg/honk/proof_system/ultra_prover.cpp
@@ -1,7 +1,6 @@
 #include "ultra_prover.hpp"
 #include <algorithm>
 #include <cstddef>
-#include "barretenberg/common/timer.hpp"
 #include "barretenberg/honk/proof_system/prover_library.hpp"
 #include "barretenberg/honk/sumcheck/sumcheck.hpp"
 #include <array>
@@ -293,67 +292,47 @@ template <UltraFlavor Flavor> plonk::proof& UltraProver_<Flavor>::export_proof()
 
 template <UltraFlavor Flavor> plonk::proof& UltraProver_<Flavor>::construct_proof()
 {
-    Timer timer;
     // Add circuit size public input size and public inputs to transcript.
     execute_preamble_round();
-    info("execute_preamble_round: ", timer.seconds(), "s");
 
-    Timer timer2;
     // Compute first three wire commitments
     execute_wire_commitments_round();
     queue.process_queue();
-    info("execute_wire_commitments_round: ", timer2.seconds(), "s");
 
-    Timer timer3;
     // Compute sorted list accumulator and commitment
     execute_sorted_list_accumulator_round();
     queue.process_queue();
-    info("execute_sorted_list_accumulator_round: ", timer3.seconds(), "s");
 
-    Timer timer4;
     // Fiat-Shamir: beta & gamma
     // Compute grand product(s) and commitments.
     execute_grand_product_computation_round();
     queue.process_queue();
-    info("execute_grand_product_computation_round: ", timer4.seconds(), "s");
 
-    Timer timer5;
     // Fiat-Shamir: alpha
     // Run sumcheck subprotocol.
     execute_relation_check_rounds();
-    info("execute_relation_check_rounds: ", timer5.seconds(), "s");
 
-    Timer timer6;
     // Fiat-Shamir: rho
     // Compute Fold polynomials and their commitments.
     execute_univariatization_round();
     queue.process_queue();
-    info("execute_univariatization_round: ", timer6.seconds(), "s");
 
-    Timer timer7;
     // Fiat-Shamir: r
     // Compute Fold evaluations
     execute_pcs_evaluation_round();
-    info("execute_pcs_evaluation_round: ", timer7.seconds(), "s");
 
-    Timer timer8;
     // Fiat-Shamir: nu
     // Compute Shplonk batched quotient commitment Q
     execute_shplonk_batched_quotient_round();
     queue.process_queue();
-    info("execute_shplonk_batched_quotient_round: ", timer8.seconds(), "s");
 
-    Timer timer9;
     // Fiat-Shamir: z
     // Compute partial evaluation Q_z
     execute_shplonk_partial_evaluation_round();
-    info("execute_shplonk_partial_evaluation_round: ", timer9.seconds(), "s");
 
-    Timer timer10;
     // Fiat-Shamir: z
     // Compute PCS opening proof (either KZG quotient commitment or IPA opening proof)
     execute_final_pcs_round();
-    info("execute_final_pcs_round: ", timer10.seconds(), "s");
 
     return export_proof();
 }
diff --git a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
index 994f5680aa..c8c4b0c7f0 100644
--- a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
+++ b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
@@ -150,7 +150,8 @@ template <typename Flavor> class SumcheckRound {
         // Determine number of threads for multithreading.
         // Note: Multithreading is "on" for every round but we reduce the number of threads from the max available based
         // on a specified minimum number of iterations per thread. This eventually leads to the use of a single thread.
-        size_t max_num_threads = get_num_cpus();   // number of available threads
+        // For now we use a power of 2 number of threads simply to ensure the round size is evenly divided.
+        size_t max_num_threads = get_num_cpus_pow2(); // number of available threads (power of 2)
         size_t min_iterations_per_thread = 1 << 6; // min number of iterations for which we'll spin up a unique thread
         size_t desired_num_threads = round_size / min_iterations_per_thread;
         size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified
@@ -189,9 +190,9 @@ template <typename Flavor> class SumcheckRound {
                                                   relation_parameters,
                                                   pow_challenge);
             }
-        }); // parallel_for
+        });
 
-        // Accumulate the per-thread univariate accumulators into a single accumulator
+        // Accumulate the per-thread univariate accumulators into a single set of accumulators
         for (auto& accumulators : thread_univariate_accumulators) {
             add_nested_tuples(univariate_accumulators, accumulators);
         }

From 3fa8623fb76d2c477ffad74ebd03596ac0fc0265 Mon Sep 17 00:00:00 2001
From: ledwards2225 <l.edwards.d@gmail.com>
Date: Fri, 23 Jun 2023 15:17:15 +0000
Subject: [PATCH 6/7] some cleanup

---
 .../honk/composer/standard_honk_composer.test.cpp             | 3 +--
 cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp         | 4 ++++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/cpp/src/barretenberg/honk/composer/standard_honk_composer.test.cpp b/cpp/src/barretenberg/honk/composer/standard_honk_composer.test.cpp
index 76c632124f..04ada2ef9b 100644
--- a/cpp/src/barretenberg/honk/composer/standard_honk_composer.test.cpp
+++ b/cpp/src/barretenberg/honk/composer/standard_honk_composer.test.cpp
@@ -391,8 +391,7 @@ TEST_F(StandardHonkComposerTests, SumcheckEvaluations)
         uint32_t b_idx = circuit_constructor.add_variable(b);
         uint32_t c_idx = circuit_constructor.add_variable(c);
         uint32_t d_idx = circuit_constructor.add_variable(d);
-        size_t num_iterations = 1 << 16;
-        for (size_t i = 0; i < num_iterations; i++) {
+        for (size_t i = 0; i < 16; i++) {
             circuit_constructor.create_add_gate(
                 { a_idx, b_idx, c_idx, fr::one(), fr::one(), fr::neg_one(), fr::zero() });
             circuit_constructor.create_add_gate(
diff --git a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
index c8c4b0c7f0..16ede729a1 100644
--- a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
+++ b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
@@ -140,6 +140,7 @@ template <typename Flavor> class SumcheckRound {
                                                            const FF alpha)
     {
         // Precompute the vector of required powers of zeta
+        // TODO(luke): Parallelize this
         std::vector<FF> pow_challenges;
         pow_challenges.resize(round_size >> 1);
         pow_challenges[0] = pow_univariate.partial_evaluation_constant;
@@ -320,6 +321,9 @@ template <typename Flavor> class SumcheckRound {
     }
 
   public:
+    // TODO(luke): Potentially make RelationUnivariates (tuple of tuples of Univariates) a class and make these utility
+    // functions class methods. Alternatively, move all of these tuple utilities (and the ones living elsewhere) to
+    // their own module.
     /**
      * Utility methods for tuple of tuples of Univariates
      */

From a8588fde1aa0391146a89b1f94dbf57cf4f0c180 Mon Sep 17 00:00:00 2001
From: ledwards2225 <l.edwards.d@gmail.com>
Date: Fri, 23 Jun 2023 19:15:39 +0000
Subject: [PATCH 7/7] resize

---
 cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
index 16ede729a1..6c81dc30bc 100644
--- a/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
+++ b/cpp/src/barretenberg/honk/sumcheck/sumcheck_round.hpp
@@ -141,8 +141,7 @@ template <typename Flavor> class SumcheckRound {
     {
         // Precompute the vector of required powers of zeta
         // TODO(luke): Parallelize this
-        std::vector<FF> pow_challenges;
-        pow_challenges.resize(round_size >> 1);
+        std::vector<FF> pow_challenges(round_size >> 1);
         pow_challenges[0] = pow_univariate.partial_evaluation_constant;
         for (size_t i = 1; i < (round_size >> 1); ++i) {
             pow_challenges[i] = pow_challenges[i - 1] * pow_univariate.zeta_pow_sqr;
@@ -160,8 +159,7 @@ template <typename Flavor> class SumcheckRound {
         size_t iterations_per_thread = round_size / num_threads;             // actual iterations per thread
 
         // Constuct univariate accumulator containers; one per thread
-        std::vector<RelationUnivariates> thread_univariate_accumulators;
-        thread_univariate_accumulators.resize(num_threads);
+        std::vector<RelationUnivariates> thread_univariate_accumulators(num_threads);
         for (auto& accum : thread_univariate_accumulators) {
             zero_univariates(accum);
         }