diff --git a/cpp/src/barretenberg/crypto/pedersen_commitment/c_bind.cpp b/cpp/src/barretenberg/crypto/pedersen_commitment/c_bind.cpp
index 1b58491c3c..f503481ce7 100644
--- a/cpp/src/barretenberg/crypto/pedersen_commitment/c_bind.cpp
+++ b/cpp/src/barretenberg/crypto/pedersen_commitment/c_bind.cpp
@@ -30,7 +30,6 @@ WASM_EXPORT void pedersen_plookup_compress_fields(uint8_t const* left, uint8_t c
     barretenberg::fr::serialize_to_buffer(r, result);
 }
 
-
 WASM_EXPORT void pedersen__compress(uint8_t const* inputs_buffer, uint8_t* output)
 {
     std::vector<grumpkin::fq> to_compress;
diff --git a/cpp/src/barretenberg/crypto/pedersen_commitment/convert_buffer_to_field.hpp b/cpp/src/barretenberg/crypto/pedersen_commitment/convert_buffer_to_field.hpp
index 10adacf4ce..9b657280ad 100644
--- a/cpp/src/barretenberg/crypto/pedersen_commitment/convert_buffer_to_field.hpp
+++ b/cpp/src/barretenberg/crypto/pedersen_commitment/convert_buffer_to_field.hpp
@@ -5,6 +5,16 @@
 namespace crypto {
 namespace pedersen_commitment {
 
+/**
+ * @brief Converts an input uint8_t buffer into a vector of field elements. Used to hash the Transcript in a
+ * SNARK-friendly manner for recursive circuits.
+ *
+ * `input` is an unstructured byte array that we want to convert into field elements
+ * prior to hashing. We do this by splitting the buffer into 31-byte chunks.
+ *
+ * @param input the byte buffer to convert
+ * @return std::vector<grumpkin::fq>
+ */
 inline std::vector<grumpkin::fq> convert_buffer_to_field(const std::vector<uint8_t>& input)
 {
     const size_t num_bytes = input.size();
diff --git a/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.cpp b/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.cpp
index 639dfa2440..8e3aa99ef1 100644
--- a/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.cpp
+++ b/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.cpp
@@ -105,16 +105,16 @@ grumpkin::fq compress_native(const std::vector<std::pair<grumpkin::fq, generator
 /**
  * Given an arbitrary length of bytes, convert them to fields and compress the result using the default generators.
  */
-grumpkin::fq compress_native_buffer_to_field(const std::vector<uint8_t>& input)
+grumpkin::fq compress_native_buffer_to_field(const std::vector<uint8_t>& input, const size_t hash_index)
 {
     const auto elements = convert_buffer_to_field(input);
-    grumpkin::fq result_fq = compress_native(elements);
+    grumpkin::fq result_fq = compress_native(elements, hash_index);
     return result_fq;
 }
 
-grumpkin::fq compress_native(const std::vector<uint8_t>& input)
+grumpkin::fq compress_native(const std::vector<uint8_t>& input, const size_t hash_index)
 {
-    return compress_native_buffer_to_field(input);
+    return compress_native_buffer_to_field(input, hash_index);
 }
 
 } // namespace pedersen_commitment
diff --git a/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.hpp b/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.hpp
index d7275aa6ac..0600e13b52 100644
--- a/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.hpp
+++ b/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen.hpp
@@ -22,7 +22,7 @@ template <size_t T> grumpkin::fq compress_native(const std::array<grumpkin::fq,
     return commit_native(converted).x;
 }
 
-grumpkin::fq compress_native(const std::vector<uint8_t>& input);
+grumpkin::fq compress_native(const std::vector<uint8_t>& input, const size_t hash_index = 0);
 
 grumpkin::fq compress_native(const std::vector<std::pair<grumpkin::fq, generators::generator_index_t>>& input_pairs);
 
diff --git a/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.cpp b/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.cpp
index e2333acfa6..5e6288e8df 100644
--- a/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.cpp
+++ b/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.cpp
@@ -28,11 +28,12 @@ grumpkin::g1::element merkle_damgard_compress(const std::vector<grumpkin::fq>& i
     const size_t num_inputs = inputs.size();
 
     grumpkin::fq result = (pedersen_iv_table[iv]).x;
-    for (size_t i = 0; i < num_inputs; i++) {
+    result = hash_pair(result, num_inputs);
+    for (size_t i = 0; i < num_inputs - 1; i++) {
         result = hash_pair(result, inputs[i]);
     }
 
-    return (hash_single(result, false) + hash_single(grumpkin::fq(num_inputs), true));
+    return (hash_single(result, false) + hash_single(inputs[num_inputs - 1], true));
 }
 
 grumpkin::g1::element merkle_damgard_compress(const std::vector<grumpkin::fq>& inputs, const std::vector<size_t>& ivs)
@@ -46,7 +47,8 @@ grumpkin::g1::element merkle_damgard_compress(const std::vector<grumpkin::fq>& i
     const size_t num_inputs = inputs.size();
 
     grumpkin::fq result = (pedersen_iv_table[0]).x;
-    for (size_t i = 0; i < 2 * num_inputs; i++) {
+    result = hash_pair(result, num_inputs);
+    for (size_t i = 0; i < 2 * num_inputs - 1; i++) {
         if ((i & 1) == 0) {
             grumpkin::fq iv_result = (pedersen_iv_table[ivs[i >> 1]]).x;
             result = hash_pair(result, iv_result);
@@ -54,8 +56,7 @@ grumpkin::g1::element merkle_damgard_compress(const std::vector<grumpkin::fq>& i
             result = hash_pair(result, inputs[i >> 1]);
         }
     }
-
-    return (hash_single(result, false) + hash_single(grumpkin::fq(num_inputs), true));
+    return (hash_single(result, false) + hash_single(inputs[num_inputs - 1], true));
 }
 
 grumpkin::g1::element merkle_damgard_tree_compress(const std::vector<grumpkin::fq>& inputs,
@@ -111,16 +112,16 @@ grumpkin::fq compress_native(const std::vector<grumpkin::fq>& inputs, const std:
     return commit_native(inputs, hash_indices).x;
 }
 
-grumpkin::fq compress_native_buffer_to_field(const std::vector<uint8_t>& input)
+grumpkin::fq compress_native_buffer_to_field(const std::vector<uint8_t>& input, const size_t hash_index)
 {
     const auto elements = convert_buffer_to_field(input);
-    grumpkin::fq result_fq = compress_native(elements);
+    grumpkin::fq result_fq = compress_native(elements, hash_index);
     return result_fq;
 }
 
-std::vector<uint8_t> compress_native(const std::vector<uint8_t>& input)
+std::vector<uint8_t> compress_native(const std::vector<uint8_t>& input, const size_t hash_index)
 {
-    const auto result_fq = compress_native_buffer_to_field(input);
+    const auto result_fq = compress_native_buffer_to_field(input, hash_index);
     uint256_t result_u256(result_fq);
     const size_t num_bytes = input.size();
 
diff --git a/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.hpp b/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.hpp
index 0f99b13fbb..b77fac9688 100644
--- a/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.hpp
+++ b/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.hpp
@@ -13,9 +13,9 @@ grumpkin::g1::element merkle_damgard_tree_compress(const std::vector<grumpkin::f
 
 grumpkin::fq compress_native(const std::vector<grumpkin::fq>& inputs, const size_t hash_index = 0);
 grumpkin::fq compress_native(const std::vector<grumpkin::fq>& inputs, const std::vector<size_t>& hash_indices);
-std::vector<uint8_t> compress_native(const std::vector<uint8_t>& input);
+std::vector<uint8_t> compress_native(const std::vector<uint8_t>& input, const size_t hash_index = 0);
 
-grumpkin::fq compress_native_buffer_to_field(const std::vector<uint8_t>& input);
+grumpkin::fq compress_native_buffer_to_field(const std::vector<uint8_t>& input, const size_t hash_index = 0);
 
 template <size_t T> grumpkin::fq compress_native(const std::array<grumpkin::fq, T>& inputs)
 {
diff --git a/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.test.cpp b/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.test.cpp
index 82d0b4f7ed..a06f5cea58 100644
--- a/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.test.cpp
+++ b/cpp/src/barretenberg/crypto/pedersen_commitment/pedersen_lookup.test.cpp
@@ -157,7 +157,9 @@ TEST(pedersen_lookup, merkle_damgard_compress)
 
     const auto result = crypto::pedersen_commitment::lookup::merkle_damgard_compress(inputs, iv);
 
-    fq intermediate = (grumpkin::g1::affine_one * fr(iv + 1)).x;
+    auto iv_hash = compute_expected((grumpkin::g1::affine_one * fr(iv + 1)).x, 0);
+    auto length = compute_expected(fq(m), (crypto::pedersen_hash::lookup::NUM_PEDERSEN_TABLES / 2));
+    fq intermediate = affine_element(iv_hash + length).x;
     for (size_t i = 0; i < m; i++) {
         intermediate =
             affine_element(compute_expected(intermediate, 0) +
@@ -165,10 +167,7 @@ TEST(pedersen_lookup, merkle_damgard_compress)
                 .x;
     }
 
-    EXPECT_EQ(affine_element(result).x,
-              affine_element(compute_expected(intermediate, 0) +
-                             compute_expected(fq(m), (crypto::pedersen_hash::lookup::NUM_PEDERSEN_TABLES / 2)))
-                  .x);
+    EXPECT_EQ(affine_element(result).x, intermediate);
 }
 
 TEST(pedersen_lookup, merkle_damgard_compress_multiple_iv)
@@ -188,7 +187,11 @@ TEST(pedersen_lookup, merkle_damgard_compress_multiple_iv)
     const auto result = crypto::pedersen_commitment::lookup::merkle_damgard_compress(inputs, ivs);
 
     const size_t initial_iv = 0;
-    fq intermediate = (grumpkin::g1::affine_one * fr(initial_iv + 1)).x;
+    auto iv_hash = compute_expected((grumpkin::g1::affine_one * fr(initial_iv + 1)).x, 0);
+
+    auto length = compute_expected(fq(m), (crypto::pedersen_hash::lookup::NUM_PEDERSEN_TABLES / 2));
+    fq intermediate = affine_element(iv_hash + length).x;
+
     for (size_t i = 0; i < 2 * m; i++) {
         if ((i & 1) == 0) {
             const auto iv = (grumpkin::g1::affine_one * fr(ivs[i >> 1] + 1)).x;
@@ -204,10 +207,7 @@ TEST(pedersen_lookup, merkle_damgard_compress_multiple_iv)
         }
     }
 
-    EXPECT_EQ(affine_element(result).x,
-              affine_element(compute_expected(intermediate, 0) +
-                             compute_expected(fq(m), (crypto::pedersen_hash::lookup::NUM_PEDERSEN_TABLES / 2)))
-                  .x);
+    EXPECT_EQ(affine_element(result).x, intermediate);
 }
 
 TEST(pedersen_lookup, merkle_damgard_tree_compress)
diff --git a/cpp/src/barretenberg/honk/composer/ultra_honk_composer.hpp b/cpp/src/barretenberg/honk/composer/ultra_honk_composer.hpp
index e17087aa88..1073ec7825 100644
--- a/cpp/src/barretenberg/honk/composer/ultra_honk_composer.hpp
+++ b/cpp/src/barretenberg/honk/composer/ultra_honk_composer.hpp
@@ -371,11 +371,12 @@ class UltraHonkComposer {
     };
     // std::array<uint32_t, 2> decompose_non_native_field_double_width_limb(
     //     const uint32_t limb_idx, const size_t num_limb_bits = (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS));
-    std::array<uint32_t, 2> queue_non_native_field_multiplication(
+    std::array<uint32_t, 2> evaluate_non_native_field_multiplication(
         const UltraCircuitConstructor::non_native_field_witnesses& input,
         const bool range_constrain_quotient_and_remainder = true)
     {
-        return circuit_constructor.queue_non_native_field_multiplication(input, range_constrain_quotient_and_remainder);
+        return circuit_constructor.evaluate_non_native_field_multiplication(input,
+                                                                            range_constrain_quotient_and_remainder);
     };
     // std::array<uint32_t, 2> evaluate_partial_non_native_field_multiplication(const non_native_field_witnesses&
     // input); typedef std::pair<uint32_t, barretenberg::fr> scaled_witness; typedef std::tuple<scaled_witness,
diff --git a/cpp/src/barretenberg/honk/composer/ultra_honk_composer.test.cpp b/cpp/src/barretenberg/honk/composer/ultra_honk_composer.test.cpp
index 3653be64d1..09b3373e7f 100644
--- a/cpp/src/barretenberg/honk/composer/ultra_honk_composer.test.cpp
+++ b/cpp/src/barretenberg/honk/composer/ultra_honk_composer.test.cpp
@@ -649,6 +649,7 @@ TEST(UltraHonkComposer, non_native_field_multiplication)
 
     fq a = fq::random_element();
     fq b = fq::random_element();
+
     uint256_t modulus = fq::modulus;
 
     uint1024_t a_big = uint512_t(uint256_t(a));
@@ -692,7 +693,7 @@ TEST(UltraHonkComposer, non_native_field_multiplication)
     proof_system::UltraCircuitConstructor::non_native_field_witnesses inputs{
         a_indices, b_indices, q_indices, r_indices, modulus_limbs, fr(uint256_t(modulus)),
     };
-    const auto [lo_1_idx, hi_1_idx] = composer.queue_non_native_field_multiplication(inputs);
+    const auto [lo_1_idx, hi_1_idx] = composer.evaluate_non_native_field_multiplication(inputs);
     composer.range_constrain_two_limbs(lo_1_idx, hi_1_idx, 70, 70);
 
     prove_and_verify(composer, /*expected_result=*/true);
diff --git a/cpp/src/barretenberg/honk/proof_system/prover.hpp b/cpp/src/barretenberg/honk/proof_system/prover.hpp
index 7c7d1d0bb8..477ad349c5 100644
--- a/cpp/src/barretenberg/honk/proof_system/prover.hpp
+++ b/cpp/src/barretenberg/honk/proof_system/prover.hpp
@@ -15,7 +15,8 @@ namespace proof_system::honk {
 
 // We won't compile this class with honk::flavor::Ultra, but we will like want to compile it (at least for testing)
 // with a flavor that uses the curve Grumpkin, or a flavor that does/does not have zk, etc.
-template <typename T> concept StandardFlavor = IsAnyOf<T, honk::flavor::Standard>;
+template <typename T>
+concept StandardFlavor = IsAnyOf<T, honk::flavor::Standard>;
 
 template <StandardFlavor Flavor> class StandardProver_ {
 
diff --git a/cpp/src/barretenberg/honk/proof_system/ultra_prover.hpp b/cpp/src/barretenberg/honk/proof_system/ultra_prover.hpp
index bcf665711c..9bbe7314f2 100644
--- a/cpp/src/barretenberg/honk/proof_system/ultra_prover.hpp
+++ b/cpp/src/barretenberg/honk/proof_system/ultra_prover.hpp
@@ -13,7 +13,8 @@ namespace proof_system::honk {
 
 // We won't compile this class with honk::flavor::Standard, but we will like want to compile it (at least for testing)
 // with a flavor that uses the curve Grumpkin, or a flavor that does/does not have zk, etc.
-template <typename T> concept UltraFlavor = IsAnyOf<T, honk::flavor::Ultra>;
+template <typename T>
+concept UltraFlavor = IsAnyOf<T, honk::flavor::Ultra>;
 template <UltraFlavor Flavor> class UltraProver_ {
 
     using FF = typename Flavor::FF;
diff --git a/cpp/src/barretenberg/join_split_example/proofs/join_split/join_split.test.cpp b/cpp/src/barretenberg/join_split_example/proofs/join_split/join_split.test.cpp
index aa7e516609..2b81096c42 100644
--- a/cpp/src/barretenberg/join_split_example/proofs/join_split/join_split.test.cpp
+++ b/cpp/src/barretenberg/join_split_example/proofs/join_split/join_split.test.cpp
@@ -806,11 +806,11 @@ TEST_F(join_split_tests, test_0_input_notes_and_detect_circuit_change)
 
     // The below part detects any changes in the join-split circuit
 
-    constexpr uint32_t CIRCUIT_GATE_COUNT = 185573;
+    constexpr uint32_t CIRCUIT_GATE_COUNT = 183834;
     constexpr uint32_t GATES_NEXT_POWER_OF_TWO = 524288;
-    const uint256_t VK_HASH("13eb88883e80efb9bf306af2962cd1a49e9fa1b0bfb2d4b563b95217a17bcc74");
+    const uint256_t VK_HASH("5c2e0fe914dbbf23d6bac6ae4db9a7e43d98c0b9d71c9200208dbce24a815c6e");
 
     auto number_of_gates_js = result.number_of_gates;
     auto vk_hash_js = get_verification_key()->sha256_hash();
 
     if (!CIRCUIT_CHANGE_EXPECTED) {
diff --git a/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.hpp b/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.hpp
index 53c8ff8f90..703e037e64 100644
--- a/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.hpp
+++ b/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.hpp
@@ -380,13 +380,14 @@ class UltraPlonkComposer {
     };
     // std::array<uint32_t, 2> decompose_non_native_field_double_width_limb(
     //     const uint32_t limb_idx, const size_t num_limb_bits = (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS));
-    std::array<uint32_t, 2> queue_non_native_field_multiplication(
+    std::array<uint32_t, 2> evaluate_non_native_field_multiplication(
         const UltraCircuitConstructor::non_native_field_witnesses& input,
         const bool range_constrain_quotient_and_remainder = true)
     {
-        return circuit_constructor.queue_non_native_field_multiplication(input, range_constrain_quotient_and_remainder);
+        return circuit_constructor.evaluate_non_native_field_multiplication(input,
+                                                                            range_constrain_quotient_and_remainder);
     };
-    // std::array<uint32_t, 2> evaluate_partial_non_native_field_multiplication(const non_native_field_witnesses&
+    // std::array<uint32_t, 2> queue_partial_non_native_field_multiplication(const non_native_field_witnesses&
     // input); typedef std::pair<uint32_t, barretenberg::fr> scaled_witness; typedef std::tuple<scaled_witness,
     // scaled_witness, barretenberg::fr> add_simple; std::array<uint32_t, 5> evaluate_non_native_field_subtraction(
     //     add_simple limb0,
diff --git a/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.test.cpp b/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.test.cpp
index a5f6e09822..fe95dfd298 100644
--- a/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.test.cpp
+++ b/cpp/src/barretenberg/plonk/composer/splitting_tmp/ultra_plonk_composer.test.cpp
@@ -781,7 +781,7 @@ TEST(ultra_plonk_composer_splitting_tmp, non_native_field_multiplication)
     UltraCircuitConstructor::non_native_field_witnesses inputs{
         a_indices, b_indices, q_indices, r_indices, modulus_limbs, fr(uint256_t(modulus)),
     };
-    const auto [lo_1_idx, hi_1_idx] = composer.queue_non_native_field_multiplication(inputs);
+    const auto [lo_1_idx, hi_1_idx] = composer.evaluate_non_native_field_multiplication(inputs);
     composer.range_constrain_two_limbs(lo_1_idx, hi_1_idx, 70, 70);
 
     auto prover = composer.create_prover();
diff --git a/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp b/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp
index 1f3e9fb3ed..ce0f4eac7b 100644
--- a/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp
+++ b/cpp/src/barretenberg/plonk/composer/ultra_composer.cpp
@@ -1138,7 +1138,30 @@ std::vector<uint32_t> UltraComposer::decompose_into_default_range(const uint32_t
         const auto limb_idx = add_variable(sublimbs[i]);
         sublimb_indices.emplace_back(limb_idx);
         if ((i == sublimbs.size() - 1) && has_remainder_bits) {
-            create_new_range_constraint(limb_idx, last_limb_range);
+            if ((target_range_bitnum - last_limb_size) < DEFAULT_PLOOKUP_RANGE_CUTOFF_BITNUM) {
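+                // We don't want to make a new range table for the last limb.
+                // X = limb, L = last limb range, K = sublimb mask (NOTE(review): presumably L < K - confirm).
+                // We want X <= L,
+                // i.e. L - X >= 0 and L - X <= K.
+                // With D and X both range-constrained to K, this is equivalent to constraining
+                // D = L - X,
+                // which the add gate below enforces as D + X - L = 0.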
+                barretenberg::fr diff = uint256_t(last_limb_range) - get_variable(limb_idx);
+                uint32_t diff_idx = add_variable(diff);
+                create_add_gate({
+                    .a = limb_idx,
+                    .b = zero_idx,
+                    .c = diff_idx,
+                    .a_scaling = 1,
+                    .b_scaling = 0,
+                    .c_scaling = 1,
+                    .const_scaling = -barretenberg::fr(last_limb_range),
+                });
+                create_new_range_constraint(diff_idx, sublimb_mask);
+                create_new_range_constraint(limb_idx, sublimb_mask);
+            } else {
+                create_new_range_constraint(limb_idx, last_limb_range);
+            }
         } else {
             create_new_range_constraint(limb_idx, sublimb_mask);
         }
@@ -1860,22 +1883,18 @@ std::array<uint32_t, 2> UltraComposer::decompose_non_native_field_double_width_l
 }
 
 /**
- * @brief Queue up non-native field multiplication data.
+ * @brief Evaluate a non-native field multiplication, adding its constraints to the circuit immediately.
  *
- * @details The data queued represents a non-native field multiplication identity a * b = q * p + r,
+ * @details The data represents a non-native field multiplication identity a * b = q * p + r,
  * where a, b, q, r are all emulated non-native field elements that are each split across 4 distinct witness variables.
  *
- * Without this queue some functions, such as proof_system::plonk::stdlib::element::double_montgomery_ladder, would
- * duplicate non-native field operations, which can be quite expensive. We queue up these operations, and remove
- * duplicates in the circuit finishing stage of the proving key computation.
- *
  * The non-native field modulus, p, is a circuit constant
  *
  * The return value are the witness indices of the two remainder limbs `lo_1, hi_2`
  *
  * N.B.: This method does NOT evaluate the prime field component of non-native field multiplications.
  **/
-std::array<uint32_t, 2> UltraComposer::queue_non_native_field_multiplication(
+std::array<uint32_t, 2> UltraComposer::evaluate_non_native_field_multiplication(
     const non_native_field_witnesses& input, const bool range_constrain_quotient_and_remainder)
 {
 
@@ -1903,10 +1922,11 @@ std::array<uint32_t, 2> UltraComposer::queue_non_native_field_multiplication(
         get_variable(input.r[2]),
         get_variable(input.r[3]),
     };
-
     constexpr barretenberg::fr LIMB_SHIFT = uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS;
     constexpr barretenberg::fr LIMB_SHIFT_2 = uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
     constexpr barretenberg::fr LIMB_SHIFT_3 = uint256_t(1) << (3 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
+    constexpr barretenberg::fr LIMB_RSHIFT =
+        barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
     constexpr barretenberg::fr LIMB_RSHIFT_2 =
         barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS));
 
@@ -1955,68 +1975,114 @@ std::array<uint32_t, 2> UltraComposer::queue_non_native_field_multiplication(
         range_constrain_two_limbs(input.q[2], input.q[3]);
     }
 
-    // Add witnesses into the multiplication cache
-    // (when finalising the circuit, we will remove duplicates; several dups produced by biggroup.hpp methods)
-    cached_non_native_field_multiplication cache_entry{
-        .a = input.a,
-        .b = input.b,
-        .q = input.q,
-        .r = input.r,
-        .cross_terms = { lo_0_idx, lo_1_idx, hi_0_idx, hi_1_idx, hi_2_idx, hi_3_idx },
-        .neg_modulus = input.neg_modulus,
-    };
-    cached_non_native_field_multiplications.emplace_back(cache_entry);
+    // product gate 1
+    // (lo_0 + q_0(p_0 + p_1*2^b) + q_1(p_0*2^b) - (r_1)2^b)2^-2b - lo_1 = 0
+    create_big_add_gate({ input.q[0],
+                          input.q[1],
+                          input.r[1],
+                          lo_1_idx,
+                          input.neg_modulus[0] + input.neg_modulus[1] * LIMB_SHIFT,
+                          input.neg_modulus[0] * LIMB_SHIFT,
+                          -LIMB_SHIFT,
+                          -LIMB_SHIFT.sqr(),
+                          0 },
+                        true);
+
+    w_l.emplace_back(input.a[1]);
+    w_r.emplace_back(input.b[1]);
+    w_o.emplace_back(input.r[0]);
+    w_4.emplace_back(lo_0_idx);
+    apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_1);
+    ++num_gates;
+    w_l.emplace_back(input.a[0]);
+    w_r.emplace_back(input.b[0]);
+    w_o.emplace_back(input.a[3]);
+    w_4.emplace_back(input.b[3]);
+    apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_2);
+    ++num_gates;
+    w_l.emplace_back(input.a[2]);
+    w_r.emplace_back(input.b[2]);
+    w_o.emplace_back(input.r[3]);
+    w_4.emplace_back(hi_0_idx);
+    apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_3);
+    ++num_gates;
+    w_l.emplace_back(input.a[1]);
+    w_r.emplace_back(input.b[1]);
+    w_o.emplace_back(input.r[2]);
+    w_4.emplace_back(hi_1_idx);
+    apply_aux_selectors(AUX_SELECTORS::NONE);
+    ++num_gates;
+
+    /**
+     * product gate 6
+     *
+     * hi_2 - hi_1 - lo_1 - q[2](p[1].2^b + p[0]) - q[3](p[0].2^b) = 0
+     *
+     **/
+    create_big_add_gate(
+        {
+            input.q[2],
+            input.q[3],
+            lo_1_idx,
+            hi_1_idx,
+            -input.neg_modulus[1] * LIMB_SHIFT - input.neg_modulus[0],
+            -input.neg_modulus[0] * LIMB_SHIFT,
+            -1,
+            -1,
+            0,
+        },
+        true);
+
+    /**
+     * product gate 7
+     *
+     * hi_3 - (hi_2 - q[0](p[3].2^b + p[2]) - q[1](p[2].2^b + p[1])).2^-2b
+     **/
+    create_big_add_gate({
+        hi_3_idx,
+        input.q[0],
+        input.q[1],
+        hi_2_idx,
+        -1,
+        input.neg_modulus[3] * LIMB_RSHIFT + input.neg_modulus[2] * LIMB_RSHIFT_2,
+        input.neg_modulus[2] * LIMB_RSHIFT + input.neg_modulus[1] * LIMB_RSHIFT_2,
+        LIMB_RSHIFT_2,
+        0,
+    });
 
     return std::array<uint32_t, 2>{ lo_1_idx, hi_3_idx };
 }
 
 /**
  * @brief Called in `compute_proving_key` when finalizing circuit.
- * Iterates over the cached_non_native_field_multiplication objects,
+ * Iterates over the cached_partial_non_native_field_multiplication objects,
  * removes duplicates, and instantiates the remainder as constraints`
  */
 void UltraComposer::process_non_native_field_multiplications()
 {
-    std::sort(cached_non_native_field_multiplications.begin(), cached_non_native_field_multiplications.end());
-
-    auto last =
-        std::unique(cached_non_native_field_multiplications.begin(), cached_non_native_field_multiplications.end());
+    for (size_t i = 0; i < cached_partial_non_native_field_multiplications.size(); ++i) {
+        auto& c = cached_partial_non_native_field_multiplications[i];
+        for (size_t j = 0; j < 5; ++j) {
+            c.a[j] = real_variable_index[c.a[j]];
+            c.b[j] = real_variable_index[c.b[j]];
+        }
+    }
+    std::sort(cached_partial_non_native_field_multiplications.begin(),
+              cached_partial_non_native_field_multiplications.end());
 
-    auto it = cached_non_native_field_multiplications.begin();
+    auto last = std::unique(cached_partial_non_native_field_multiplications.begin(),
+                            cached_partial_non_native_field_multiplications.end());
 
-    constexpr barretenberg::fr LIMB_SHIFT = uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS;
-    constexpr barretenberg::fr LIMB_RSHIFT =
-        barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
-    constexpr barretenberg::fr LIMB_RSHIFT_2 =
-        barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS));
+    auto it = cached_partial_non_native_field_multiplications.begin();
 
     // iterate over the cached items and create constraints
     while (it != last) {
         const auto input = *it;
-        const uint32_t lo_0_idx = input.cross_terms.lo_0_idx;
-        const uint32_t lo_1_idx = input.cross_terms.lo_1_idx;
-        const uint32_t hi_0_idx = input.cross_terms.hi_0_idx;
-        const uint32_t hi_1_idx = input.cross_terms.hi_1_idx;
-        const uint32_t hi_2_idx = input.cross_terms.hi_2_idx;
-        const uint32_t hi_3_idx = input.cross_terms.hi_3_idx;
-
-        // product gate 1
-        // (lo_0 + q_0(p_0 + p_1*2^b) + q_1(p_0*2^b) - (r_1)2^b)2^-2b - lo_1 = 0
-        create_big_add_gate({ input.q[0],
-                              input.q[1],
-                              input.r[1],
-                              lo_1_idx,
-                              input.neg_modulus[0] + input.neg_modulus[1] * LIMB_SHIFT,
-                              input.neg_modulus[0] * LIMB_SHIFT,
-                              -LIMB_SHIFT,
-                              -LIMB_SHIFT.sqr(),
-                              0 },
-                            true);
 
         w_l.emplace_back(input.a[1]);
         w_r.emplace_back(input.b[1]);
-        w_o.emplace_back(input.r[0]);
-        w_4.emplace_back(lo_0_idx);
+        w_o.emplace_back(zero_idx);
+        w_4.emplace_back(input.lo_0);
         apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_1);
         ++num_gates;
         w_l.emplace_back(input.a[0]);
@@ -2027,65 +2093,34 @@ void UltraComposer::process_non_native_field_multiplications()
         ++num_gates;
         w_l.emplace_back(input.a[2]);
         w_r.emplace_back(input.b[2]);
-        w_o.emplace_back(input.r[3]);
-        w_4.emplace_back(hi_0_idx);
+        w_o.emplace_back(zero_idx);
+        w_4.emplace_back(input.hi_0);
         apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_3);
         ++num_gates;
         w_l.emplace_back(input.a[1]);
         w_r.emplace_back(input.b[1]);
-        w_o.emplace_back(input.r[2]);
-        w_4.emplace_back(hi_1_idx);
+        w_o.emplace_back(zero_idx);
+        w_4.emplace_back(input.hi_1);
         apply_aux_selectors(AUX_SELECTORS::NONE);
         ++num_gates;
-
-        /**
-         * product gate 6
-         *
-         * hi_2 - hi_1 - lo_1 - q[2](p[1].2^b + p[0]) - q[3](p[0].2^b) = 0
-         *
-         **/
-        create_big_add_gate(
-            {
-                input.q[2],
-                input.q[3],
-                lo_1_idx,
-                hi_1_idx,
-                -input.neg_modulus[1] * LIMB_SHIFT - input.neg_modulus[0],
-                -input.neg_modulus[0] * LIMB_SHIFT,
-                -1,
-                -1,
-                0,
-            },
-            true);
-
-        /**
-         * product gate 7
-         *
-         * hi_3 - (hi_2 - q[0](p[3].2^b + p[2]) - q[1](p[2].2^b + p[1])).2^-2b
-         **/
-        create_big_add_gate({
-            hi_3_idx,
-            input.q[0],
-            input.q[1],
-            hi_2_idx,
-            -1,
-            input.neg_modulus[3] * LIMB_RSHIFT + input.neg_modulus[2] * LIMB_RSHIFT_2,
-            input.neg_modulus[2] * LIMB_RSHIFT + input.neg_modulus[1] * LIMB_RSHIFT_2,
-            LIMB_RSHIFT_2,
-            0,
-        });
         ++it;
     }
 }
 
 /**
- * Compute the limb-multiplication part of a non native field mul
+ * @brief Queue the limb-multiplication part of a non native field mul
  *
  * i.e. compute the low 204 and high 204 bit components of `a * b` where `a, b` are nnf elements composed of 4
  * limbs with size DEFAULT_NON_NATIVE_FIELD_LIMB_BITS
  *
+ * @details The data queued represents part of a non-native field multiplication identity a * b = q * p + r,
+ * where a, b, q, r are all emulated non-native field elements that are each split across 4 distinct witness variables.
+ *
+ * Without this queue some functions, such as proof_system::plonk::stdlib::element::double_montgomery_ladder, would
+ * duplicate non-native field operations, which can be quite expensive. We queue up these operations, and remove
+ * duplicates in the circuit finishing stage of the proving key computation.
  **/
-std::array<uint32_t, 2> UltraComposer::evaluate_partial_non_native_field_multiplication(
+std::array<uint32_t, 2> UltraComposer::queue_partial_non_native_field_multiplication(
     const non_native_field_witnesses& input)
 {
 
@@ -2113,30 +2148,16 @@ std::array<uint32_t, 2> UltraComposer::evaluate_partial_non_native_field_multipl
     const uint32_t hi_0_idx = add_variable(hi_0);
     const uint32_t hi_1_idx = add_variable(hi_1);
 
-    w_l.emplace_back(input.a[1]);
-    w_r.emplace_back(input.b[1]);
-    w_o.emplace_back(zero_idx);
-    w_4.emplace_back(lo_0_idx);
-    apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_1);
-    ++num_gates;
-    w_l.emplace_back(input.a[0]);
-    w_r.emplace_back(input.b[0]);
-    w_o.emplace_back(input.a[3]);
-    w_4.emplace_back(input.b[3]);
-    apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_2);
-    ++num_gates;
-    w_l.emplace_back(input.a[2]);
-    w_r.emplace_back(input.b[2]);
-    w_o.emplace_back(zero_idx);
-    w_4.emplace_back(hi_0_idx);
-    apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_3);
-    ++num_gates;
-    w_l.emplace_back(input.a[1]);
-    w_r.emplace_back(input.b[1]);
-    w_o.emplace_back(zero_idx);
-    w_4.emplace_back(hi_1_idx);
-    apply_aux_selectors(AUX_SELECTORS::NONE);
-    ++num_gates;
+    // Add witnesses into the multiplication cache
+    // (when finalising the circuit, we will remove duplicates; several dups produced by biggroup.hpp methods)
+    cached_partial_non_native_field_multiplication cache_entry{
+        .a = input.a,
+        .b = input.b,
+        .lo_0 = lo_0_idx,
+        .hi_0 = hi_0_idx,
+        .hi_1 = hi_1_idx,
+    };
+    cached_partial_non_native_field_multiplications.emplace_back(cache_entry);
     return std::array<uint32_t, 2>{ lo_0_idx, hi_1_idx };
 }
 
diff --git a/cpp/src/barretenberg/plonk/composer/ultra_composer.hpp b/cpp/src/barretenberg/plonk/composer/ultra_composer.hpp
index 6f22b40bb0..eb0e5b0e92 100644
--- a/cpp/src/barretenberg/plonk/composer/ultra_composer.hpp
+++ b/cpp/src/barretenberg/plonk/composer/ultra_composer.hpp
@@ -29,6 +29,11 @@ class UltraComposer : public ComposerBase {
     // large ranges such as 2^64. For such ranges the element will be decomposed into smaller
     // chuncks according to the parameter below
     static constexpr size_t DEFAULT_PLOOKUP_RANGE_BITNUM = 14;
+    // (DEFAULT_PLOOKUP_RANGE_BITNUM - DEFAULT_PLOOKUP_RANGE_CUTOFF_BITNUM) = maximum size of range table that
+    // `decompose_into_default_range` will create in addition to the DEFAULT_PLOOKUP_RANGE_BITNUM table e.g. we don't
+    // want to create a range table of size (DEFAULT_PLOOKUP_RANGE_BITNUM - 1) if it contains very few entries; each
+    // table has an O(1 << bitnum) constraint cost to create
+    static constexpr size_t DEFAULT_PLOOKUP_RANGE_CUTOFF_BITNUM = 4;
     static constexpr size_t DEFAULT_PLOOKUP_RANGE_STEP_SIZE = 3;
     static constexpr size_t DEFAULT_PLOOKUP_RANGE_SIZE = (1 << DEFAULT_PLOOKUP_RANGE_BITNUM) - 1;
     static constexpr size_t DEFAULT_NON_NATIVE_FIELD_LIMB_BITS = 68;
@@ -36,7 +41,7 @@ class UltraComposer : public ComposerBase {
     static constexpr size_t NUMBER_OF_GATES_PER_RAM_ACCESS = 2;
     static constexpr size_t NUMBER_OF_ARITHMETIC_GATES_PER_RAM_ARRAY = 1;
     // number of gates created per non-native field operation in process_non_native_field_multiplications
-    static constexpr size_t GATES_PER_NON_NATIVE_FIELD_MULTIPLICATION_ARITHMETIC = 7;
+    static constexpr size_t GATES_PER_PARTIAL_NON_NATIVE_FIELD_MULTIPLICATION_ARITHMETIC = 4;
     struct non_native_field_witnesses {
         // first 4 array elements = limbs
         // 5th element = prime basis limb
@@ -58,30 +63,27 @@ class UltraComposer : public ComposerBase {
     };
 
     /**
-     * @brief Used to store instructions to create non_native_field_multiplication gates.
+     * @brief Used to store instructions to create partial_non_native_field_multiplication gates.
      *        We want to cache these (and remove duplicates) as the stdlib code can end up multiplying the same inputs
      * repeatedly.
      */
-    struct cached_non_native_field_multiplication {
+    struct cached_partial_non_native_field_multiplication {
         std::array<uint32_t, 5> a;
         std::array<uint32_t, 5> b;
-        std::array<uint32_t, 5> q;
-        std::array<uint32_t, 5> r;
-        non_native_field_multiplication_cross_terms cross_terms;
-        std::array<barretenberg::fr, 5> neg_modulus;
+        uint32_t lo_0; // witness index (assigned from lo_0_idx), not a field value
+        uint32_t hi_0; // witness index (assigned from hi_0_idx), not a field value
+        uint32_t hi_1; // witness index (assigned from hi_1_idx), not a field value
 
-        bool operator==(const cached_non_native_field_multiplication& other) const
+        bool operator==(const cached_partial_non_native_field_multiplication& other) const
         {
             bool valid = true;
             for (size_t i = 0; i < 5; ++i) {
                 valid = valid && (a[i] == other.a[i]);
                 valid = valid && (b[i] == other.b[i]);
-                valid = valid && (q[i] == other.q[i]);
-                valid = valid && (r[i] == other.r[i]);
             }
             return valid;
         }
-        bool operator<(const cached_non_native_field_multiplication& other) const
+        bool operator<(const cached_partial_non_native_field_multiplication& other) const
         {
             if (a < other.a) {
                 return true;
@@ -90,22 +92,13 @@ class UltraComposer : public ComposerBase {
                 if (b < other.b) {
                     return true;
                 }
-                if (b == other.b) {
-                    if (q < other.q) {
-                        return true;
-                    }
-                    if (q == other.q) {
-                        if (r < other.r) {
-                            return true;
-                        }
-                    }
-                }
             }
             return false;
         }
     };
 
-    std::vector<cached_non_native_field_multiplication> cached_non_native_field_multiplications;
+    std::vector<cached_partial_non_native_field_multiplication> cached_partial_non_native_field_multiplications;
+
     void process_non_native_field_multiplications();
 
     enum AUX_SELECTORS {
@@ -392,13 +385,23 @@ class UltraComposer : public ComposerBase {
                 rangecount += ram_range_sizes[i];
             }
         }
-        std::vector<cached_non_native_field_multiplication> nnf_copy(cached_non_native_field_multiplications);
+
+        std::vector<cached_partial_non_native_field_multiplication> pnnf_copy(
+            cached_partial_non_native_field_multiplications);
+        for (size_t i = 0; i < pnnf_copy.size(); ++i) {
+            auto& c = pnnf_copy[i];
+            for (size_t j = 0; j < 5; ++j) {
+                c.a[j] = real_variable_index[c.a[j]];
+                c.b[j] = real_variable_index[c.b[j]];
+            }
+        }
         // update nnfcount
-        std::sort(nnf_copy.begin(), nnf_copy.end());
+        std::sort(pnnf_copy.begin(), pnnf_copy.end());
+        auto plast = std::unique(pnnf_copy.begin(), pnnf_copy.end());
 
-        auto last = std::unique(nnf_copy.begin(), nnf_copy.end());
-        const size_t num_nnf_ops = static_cast<size_t>(std::distance(nnf_copy.begin(), last));
-        nnfcount = num_nnf_ops * GATES_PER_NON_NATIVE_FIELD_MULTIPLICATION_ARITHMETIC;
+        // gate cost = number of unique (a, b) entries left after sort + unique, times the per-entry gate count
+        nnfcount = static_cast<size_t>(std::distance(pnnf_copy.begin(), plast)) *
+                   GATES_PER_PARTIAL_NON_NATIVE_FIELD_MULTIPLICATION_ARITHMETIC;
     }
 
     /**
@@ -545,9 +548,9 @@ class UltraComposer : public ComposerBase {
                                    const size_t hi_limb_bits = DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
     std::array<uint32_t, 2> decompose_non_native_field_double_width_limb(
         const uint32_t limb_idx, const size_t num_limb_bits = (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS));
-    std::array<uint32_t, 2> queue_non_native_field_multiplication(
+    std::array<uint32_t, 2> evaluate_non_native_field_multiplication(
         const non_native_field_witnesses& input, const bool range_constrain_quotient_and_remainder = true);
-    std::array<uint32_t, 2> evaluate_partial_non_native_field_multiplication(const non_native_field_witnesses& input);
+    std::array<uint32_t, 2> queue_partial_non_native_field_multiplication(const non_native_field_witnesses& input);
     typedef std::pair<uint32_t, barretenberg::fr> scaled_witness;
     typedef std::tuple<scaled_witness, scaled_witness, barretenberg::fr> add_simple;
     std::array<uint32_t, 5> evaluate_non_native_field_subtraction(
diff --git a/cpp/src/barretenberg/plonk/composer/ultra_composer.test.cpp b/cpp/src/barretenberg/plonk/composer/ultra_composer.test.cpp
index 0ec324aa47..0ac6de40dc 100644
--- a/cpp/src/barretenberg/plonk/composer/ultra_composer.test.cpp
+++ b/cpp/src/barretenberg/plonk/composer/ultra_composer.test.cpp
@@ -649,7 +649,7 @@ TYPED_TEST(ultra_composer, non_native_field_multiplication)
     UltraComposer::non_native_field_witnesses inputs{
         a_indices, b_indices, q_indices, r_indices, modulus_limbs, fr(uint256_t(modulus)),
     };
-    const auto [lo_1_idx, hi_1_idx] = composer.queue_non_native_field_multiplication(inputs);
+    const auto [lo_1_idx, hi_1_idx] = composer.evaluate_non_native_field_multiplication(inputs);
     composer.range_constrain_two_limbs(lo_1_idx, hi_1_idx, 70, 70);
 
     TestFixture::prove_and_verify(composer, /*expected_result=*/true);
diff --git a/cpp/src/barretenberg/plonk/proof_system/verification_key/verification_key.cpp b/cpp/src/barretenberg/plonk/proof_system/verification_key/verification_key.cpp
index f6cf92bd73..e66d3b56b8 100644
--- a/cpp/src/barretenberg/plonk/proof_system/verification_key/verification_key.cpp
+++ b/cpp/src/barretenberg/plonk/proof_system/verification_key/verification_key.cpp
@@ -51,43 +51,34 @@ barretenberg::fr compress_native_evaluation_domain(barretenberg::evaluation_doma
  */
 barretenberg::fr verification_key_data::compress_native(const size_t hash_index)
 {
-    barretenberg::evaluation_domain domain = evaluation_domain(circuit_size);
-    barretenberg::fr compressed_domain =
-        compress_native_evaluation_domain(domain, proof_system::ComposerType(composer_type));
-
-    constexpr size_t num_limb_bits = plonk::NUM_LIMB_BITS_IN_FIELD_SIMULATION;
-
-    const auto split_bigfield_limbs = [](const uint256_t& element) {
-        std::vector<barretenberg::fr> limbs;
-        limbs.push_back(element.slice(0, num_limb_bits));
-        limbs.push_back(element.slice(num_limb_bits, num_limb_bits * 2));
-        limbs.push_back(element.slice(num_limb_bits * 2, num_limb_bits * 3));
-        limbs.push_back(element.slice(num_limb_bits * 3, num_limb_bits * 4));
-        return limbs;
-    };
-
-    std::vector<barretenberg::fr> preimage_data;
-    preimage_data.emplace_back(composer_type);
-    preimage_data.emplace_back(compressed_domain);
-    preimage_data.emplace_back(num_public_inputs);
+    barretenberg::evaluation_domain eval_domain = evaluation_domain(circuit_size);
+
+    std::vector<uint8_t> preimage_data;
+
+    preimage_data.push_back(static_cast<uint8_t>(proof_system::ComposerType(composer_type)));
+
+    const uint256_t domain = eval_domain.domain;
+    const uint256_t generator = eval_domain.generator;
+    const uint256_t public_inputs = num_public_inputs;
+
+    ASSERT(domain < (uint256_t(1) << 32));
+    ASSERT(generator < (uint256_t(1) << 16));
+    ASSERT(public_inputs < (uint256_t(1) << 32));
+
+    write(preimage_data, static_cast<uint16_t>(uint256_t(generator)));
+    write(preimage_data, static_cast<uint32_t>(uint256_t(domain)));
+    write(preimage_data, static_cast<uint32_t>(public_inputs));
     for (const auto& [tag, selector] : commitments) {
-        const auto x_limbs = split_bigfield_limbs(selector.x);
-        const auto y_limbs = split_bigfield_limbs(selector.y);
-
-        preimage_data.push_back(x_limbs[0]);
-        preimage_data.push_back(x_limbs[1]);
-        preimage_data.push_back(x_limbs[2]);
-        preimage_data.push_back(x_limbs[3]);
-
-        preimage_data.push_back(y_limbs[0]);
-        preimage_data.push_back(y_limbs[1]);
-        preimage_data.push_back(y_limbs[2]);
-        preimage_data.push_back(y_limbs[3]);
+        write(preimage_data, selector.y);
+        write(preimage_data, selector.x);
     }
 
+    write(preimage_data, eval_domain.root);
+
     barretenberg::fr compressed_key;
-    if (proof_system::ComposerType(composer_type) == proof_system::ComposerType::PLOOKUP) {
-        compressed_key = crypto::pedersen_commitment::lookup::compress_native(preimage_data, hash_index);
+    if (proof_system::ComposerType(composer_type) == ComposerType::PLOOKUP) {
+        compressed_key = from_buffer<barretenberg::fr>(
+            crypto::pedersen_commitment::lookup::compress_native(preimage_data, hash_index));
     } else {
         compressed_key = crypto::pedersen_commitment::compress_native(preimage_data, hash_index);
     }
diff --git a/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.cpp b/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.cpp
index 569ad0ca7f..52812f354c 100644
--- a/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.cpp
+++ b/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.cpp
@@ -1355,7 +1355,7 @@ std::array<uint32_t, 2> UltraCircuitConstructor::decompose_non_native_field_doub
  * @details The data queued represents a non-native field multiplication identity a * b = q * p + r,
  * where a, b, q, r are all emulated non-native field elements that are each split across 4 distinct witness variables.
  *
- * Without this queue some functions, such as proof_system::plonk::stdlib::element::double_montgomery_ladder, would
+ * Without this queue some functions, such as proof_system::plonk::stdlib::element::multiple_montgomery_ladder, would
  * duplicate non-native field operations, which can be quite expensive. We queue up these operations, and remove
  * duplicates in the circuit finishing stage of the proving key computation.
  *
@@ -1365,7 +1365,7 @@ std::array<uint32_t, 2> UltraCircuitConstructor::decompose_non_native_field_doub
  *
  * N.B.: This method does NOT evaluate the prime field component of non-native field multiplications.
  **/
-std::array<uint32_t, 2> UltraCircuitConstructor::queue_non_native_field_multiplication(
+std::array<uint32_t, 2> UltraCircuitConstructor::evaluate_non_native_field_multiplication(
     const non_native_field_witnesses& input, const bool range_constrain_quotient_and_remainder)
 {
 
@@ -1393,10 +1393,11 @@ std::array<uint32_t, 2> UltraCircuitConstructor::queue_non_native_field_multipli
         get_variable(input.r[2]),
         get_variable(input.r[3]),
     };
-
     constexpr barretenberg::fr LIMB_SHIFT = uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS;
     constexpr barretenberg::fr LIMB_SHIFT_2 = uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
     constexpr barretenberg::fr LIMB_SHIFT_3 = uint256_t(1) << (3 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
+    constexpr barretenberg::fr LIMB_RSHIFT =
+        barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
     constexpr barretenberg::fr LIMB_RSHIFT_2 =
         barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS));
 
@@ -1444,17 +1445,81 @@ std::array<uint32_t, 2> UltraCircuitConstructor::queue_non_native_field_multipli
         range_constrain_two_limbs(input.q[0], input.q[1]);
         range_constrain_two_limbs(input.q[2], input.q[3]);
     }
-    // Add witnesses into the multiplication cache
-    // (when finalising the circuit, we will remove duplicates; several dups produced by biggroup.hpp methods)
-    cached_non_native_field_multiplication cache_entry{
-        .a = input.a,
-        .b = input.b,
-        .q = input.q,
-        .r = input.r,
-        .cross_terms = { lo_0_idx, lo_1_idx, hi_0_idx, hi_1_idx, hi_2_idx, hi_3_idx },
-        .neg_modulus = input.neg_modulus,
-    };
-    cached_non_native_field_multiplications.emplace_back(cache_entry);
+
+    // product gate 1
+    // (lo_0 + q_0(p_0 + p_1*2^b) + q_1(p_0*2^b) - (r_1)2^b)2^-2b - lo_1 = 0
+    create_big_add_gate({ input.q[0],
+                          input.q[1],
+                          input.r[1],
+                          lo_1_idx,
+                          input.neg_modulus[0] + input.neg_modulus[1] * LIMB_SHIFT,
+                          input.neg_modulus[0] * LIMB_SHIFT,
+                          -LIMB_SHIFT,
+                          -LIMB_SHIFT.sqr(),
+                          0 },
+                        true);
+
+    w_l.emplace_back(input.a[1]);
+    w_r.emplace_back(input.b[1]);
+    w_o.emplace_back(input.r[0]);
+    w_4.emplace_back(lo_0_idx);
+    apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_1);
+    ++num_gates;
+    w_l.emplace_back(input.a[0]);
+    w_r.emplace_back(input.b[0]);
+    w_o.emplace_back(input.a[3]);
+    w_4.emplace_back(input.b[3]);
+    apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_2);
+    ++num_gates;
+    w_l.emplace_back(input.a[2]);
+    w_r.emplace_back(input.b[2]);
+    w_o.emplace_back(input.r[3]);
+    w_4.emplace_back(hi_0_idx);
+    apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_3);
+    ++num_gates;
+    w_l.emplace_back(input.a[1]);
+    w_r.emplace_back(input.b[1]);
+    w_o.emplace_back(input.r[2]);
+    w_4.emplace_back(hi_1_idx);
+    apply_aux_selectors(AUX_SELECTORS::NONE);
+    ++num_gates;
+
+    /**
+     * product gate 6
+     *
+     * hi_2 - hi_1 - lo_1 - q[2](p[1].2^b + p[0]) - q[3](p[0].2^b) = 0
+     *
+     **/
+    create_big_add_gate(
+        {
+            input.q[2],
+            input.q[3],
+            lo_1_idx,
+            hi_1_idx,
+            -input.neg_modulus[1] * LIMB_SHIFT - input.neg_modulus[0],
+            -input.neg_modulus[0] * LIMB_SHIFT,
+            -1,
+            -1,
+            0,
+        },
+        true);
+
+    /**
+     * product gate 7
+     *
+     * hi_3 - (hi_2 - q[0](p[3].2^b + p[2]) - q[1](p[2].2^b + p[1])).2^-2b
+     **/
+    create_big_add_gate({
+        hi_3_idx,
+        input.q[0],
+        input.q[1],
+        hi_2_idx,
+        -1,
+        input.neg_modulus[3] * LIMB_RSHIFT + input.neg_modulus[2] * LIMB_RSHIFT_2,
+        input.neg_modulus[2] * LIMB_RSHIFT + input.neg_modulus[1] * LIMB_RSHIFT_2,
+        LIMB_RSHIFT_2,
+        0,
+    });
 
     return std::array<uint32_t, 2>{ lo_1_idx, hi_3_idx };
 }
@@ -1466,46 +1531,29 @@ std::array<uint32_t, 2> UltraCircuitConstructor::queue_non_native_field_multipli
  */
 void UltraCircuitConstructor::process_non_native_field_multiplications()
 {
-    std::sort(cached_non_native_field_multiplications.begin(), cached_non_native_field_multiplications.end());
-
-    auto last =
-        std::unique(cached_non_native_field_multiplications.begin(), cached_non_native_field_multiplications.end());
+    for (size_t i = 0; i < cached_partial_non_native_field_multiplications.size(); ++i) {
+        auto& c = cached_partial_non_native_field_multiplications[i];
+        for (size_t j = 0; j < 5; ++j) {
+            c.a[j] = real_variable_index[c.a[j]];
+            c.b[j] = real_variable_index[c.b[j]];
+        }
+    }
+    std::sort(cached_partial_non_native_field_multiplications.begin(),
+              cached_partial_non_native_field_multiplications.end());
 
-    auto it = cached_non_native_field_multiplications.begin();
+    auto last = std::unique(cached_partial_non_native_field_multiplications.begin(),
+                            cached_partial_non_native_field_multiplications.end());
 
-    constexpr barretenberg::fr LIMB_SHIFT = uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS;
-    constexpr barretenberg::fr LIMB_RSHIFT =
-        barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
-    constexpr barretenberg::fr LIMB_RSHIFT_2 =
-        barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS));
+    auto it = cached_partial_non_native_field_multiplications.begin();
 
     // iterate over the cached items and create constraints
     while (it != last) {
         const auto input = *it;
-        const uint32_t lo_0_idx = input.cross_terms.lo_0_idx;
-        const uint32_t lo_1_idx = input.cross_terms.lo_1_idx;
-        const uint32_t hi_0_idx = input.cross_terms.hi_0_idx;
-        const uint32_t hi_1_idx = input.cross_terms.hi_1_idx;
-        const uint32_t hi_2_idx = input.cross_terms.hi_2_idx;
-        const uint32_t hi_3_idx = input.cross_terms.hi_3_idx;
-
-        // product gate 1
-        // (lo_0 + q_0(p_0 + p_1*2^b) + q_1(p_0*2^b) - (r_1)2^b)2^-2b - lo_1 = 0
-        create_big_add_gate({ input.q[0],
-                              input.q[1],
-                              input.r[1],
-                              lo_1_idx,
-                              input.neg_modulus[0] + input.neg_modulus[1] * LIMB_SHIFT,
-                              input.neg_modulus[0] * LIMB_SHIFT,
-                              -LIMB_SHIFT,
-                              -LIMB_SHIFT.sqr(),
-                              0 },
-                            true);
 
         w_l.emplace_back(input.a[1]);
         w_r.emplace_back(input.b[1]);
-        w_o.emplace_back(input.r[0]);
-        w_4.emplace_back(lo_0_idx);
+        w_o.emplace_back(zero_idx);
+        w_4.emplace_back(input.lo_0);
         apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_1);
         ++num_gates;
         w_l.emplace_back(input.a[0]);
@@ -1516,53 +1564,16 @@ void UltraCircuitConstructor::process_non_native_field_multiplications()
         ++num_gates;
         w_l.emplace_back(input.a[2]);
         w_r.emplace_back(input.b[2]);
-        w_o.emplace_back(input.r[3]);
-        w_4.emplace_back(hi_0_idx);
+        w_o.emplace_back(zero_idx);
+        w_4.emplace_back(input.hi_0);
         apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_3);
         ++num_gates;
         w_l.emplace_back(input.a[1]);
         w_r.emplace_back(input.b[1]);
-        w_o.emplace_back(input.r[2]);
-        w_4.emplace_back(hi_1_idx);
+        w_o.emplace_back(zero_idx);
+        w_4.emplace_back(input.hi_1);
         apply_aux_selectors(AUX_SELECTORS::NONE);
         ++num_gates;
-
-        /**
-         * product gate 6
-         *
-         * hi_2 - hi_1 - lo_1 - q[2](p[1].2^b + p[0]) - q[3](p[0].2^b) = 0
-         *
-         **/
-        create_big_add_gate(
-            {
-                input.q[2],
-                input.q[3],
-                lo_1_idx,
-                hi_1_idx,
-                -input.neg_modulus[1] * LIMB_SHIFT - input.neg_modulus[0],
-                -input.neg_modulus[0] * LIMB_SHIFT,
-                -1,
-                -1,
-                0,
-            },
-            true);
-
-        /**
-         * product gate 7
-         *
-         * hi_3 - (hi_2 - q[0](p[3].2^b + p[2]) - q[1](p[2].2^b + p[1])).2^-2b
-         **/
-        create_big_add_gate({
-            hi_3_idx,
-            input.q[0],
-            input.q[1],
-            hi_2_idx,
-            -1,
-            input.neg_modulus[3] * LIMB_RSHIFT + input.neg_modulus[2] * LIMB_RSHIFT_2,
-            input.neg_modulus[2] * LIMB_RSHIFT + input.neg_modulus[1] * LIMB_RSHIFT_2,
-            LIMB_RSHIFT_2,
-            0,
-        });
         ++it;
     }
 }
@@ -1574,7 +1585,7 @@ void UltraCircuitConstructor::process_non_native_field_multiplications()
  * limbs with size DEFAULT_NON_NATIVE_FIELD_LIMB_BITS
  *
  **/
-std::array<uint32_t, 2> UltraCircuitConstructor::evaluate_partial_non_native_field_multiplication(
+std::array<uint32_t, 2> UltraCircuitConstructor::queue_partial_non_native_field_multiplication(
     const non_native_field_witnesses& input)
 {
 
@@ -1602,30 +1613,16 @@ std::array<uint32_t, 2> UltraCircuitConstructor::evaluate_partial_non_native_fie
     const uint32_t hi_0_idx = add_variable(hi_0);
     const uint32_t hi_1_idx = add_variable(hi_1);
 
-    w_l.emplace_back(input.a[1]);
-    w_r.emplace_back(input.b[1]);
-    w_o.emplace_back(zero_idx);
-    w_4.emplace_back(lo_0_idx);
-    apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_1);
-    ++num_gates;
-    w_l.emplace_back(input.a[0]);
-    w_r.emplace_back(input.b[0]);
-    w_o.emplace_back(input.a[3]);
-    w_4.emplace_back(input.b[3]);
-    apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_2);
-    ++num_gates;
-    w_l.emplace_back(input.a[2]);
-    w_r.emplace_back(input.b[2]);
-    w_o.emplace_back(zero_idx);
-    w_4.emplace_back(hi_0_idx);
-    apply_aux_selectors(AUX_SELECTORS::NON_NATIVE_FIELD_3);
-    ++num_gates;
-    w_l.emplace_back(input.a[1]);
-    w_r.emplace_back(input.b[1]);
-    w_o.emplace_back(zero_idx);
-    w_4.emplace_back(hi_1_idx);
-    apply_aux_selectors(AUX_SELECTORS::NONE);
-    ++num_gates;
+    // Add witnesses into the multiplication cache
+    // (when finalising the circuit, we will remove duplicates; several dups produced by biggroup.hpp methods)
+    cached_partial_non_native_field_multiplication cache_entry{
+        .a = input.a,
+        .b = input.b,
+        .lo_0 = lo_0_idx,
+        .hi_0 = hi_0_idx,
+        .hi_1 = hi_1_idx,
+    };
+    cached_partial_non_native_field_multiplications.emplace_back(cache_entry);
     return std::array<uint32_t, 2>{ lo_0_idx, hi_1_idx };
 }
 
diff --git a/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.hpp b/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.hpp
index 84b18b13bc..21bc952393 100644
--- a/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.hpp
+++ b/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.hpp
@@ -167,45 +167,29 @@ class UltraCircuitConstructor : public CircuitConstructorBase<arithmetization::U
         }
     };
 
-    inline std::vector<std::string> ultra_selector_names()
-    {
-        std::vector<std::string> result{ "q_m",     "q_c",    "q_1",        "q_2",   "q_3",       "q_4",
-                                         "q_arith", "q_sort", "q_elliptic", "q_aux", "table_type" };
-        return result;
-    }
-    struct non_native_field_multiplication_cross_terms {
-        uint32_t lo_0_idx;
-        uint32_t lo_1_idx;
-        uint32_t hi_0_idx;
-        uint32_t hi_1_idx;
-        uint32_t hi_2_idx;
-        uint32_t hi_3_idx;
-    };
     /**
-     * @brief Used to store instructions to create non_native_field_multiplication gates.
+     * @brief Used to store instructions to create partial_non_native_field_multiplication gates.
      *        We want to cache these (and remove duplicates) as the stdlib code can end up multiplying the same inputs
      * repeatedly.
      */
-    struct cached_non_native_field_multiplication {
+    struct cached_partial_non_native_field_multiplication {
         std::array<uint32_t, 5> a;
         std::array<uint32_t, 5> b;
-        std::array<uint32_t, 5> q;
-        std::array<uint32_t, 5> r;
-        non_native_field_multiplication_cross_terms cross_terms;
-        std::array<barretenberg::fr, 5> neg_modulus;
+        uint32_t lo_0; // witness index (assigned from lo_0_idx, pushed into w_4), not a field value
+        uint32_t hi_0; // witness index (assigned from hi_0_idx, pushed into w_4), not a field value
+        uint32_t hi_1; // witness index (assigned from hi_1_idx, pushed into w_4), not a field value
 
-        bool operator==(const cached_non_native_field_multiplication& other) const
+        bool operator==(const cached_partial_non_native_field_multiplication& other) const
         {
             bool valid = true;
             for (size_t i = 0; i < 5; ++i) {
                 valid = valid && (a[i] == other.a[i]);
                 valid = valid && (b[i] == other.b[i]);
-                valid = valid && (q[i] == other.q[i]);
-                valid = valid && (r[i] == other.r[i]);
             }
             return valid;
         }
-        bool operator<(const cached_non_native_field_multiplication& other) const
+
+        bool operator<(const cached_partial_non_native_field_multiplication& other) const
         {
             if (a < other.a) {
                 return true;
@@ -214,27 +198,33 @@ class UltraCircuitConstructor : public CircuitConstructorBase<arithmetization::U
                 if (b < other.b) {
                     return true;
                 }
-                if (b == other.b) {
-                    if (q < other.q) {
-                        return true;
-                    }
-                    if (q == other.q) {
-                        if (r < other.r) {
-                            return true;
-                        }
-                    }
-                }
             }
             return false;
         }
     };
+
+    inline std::vector<std::string> ultra_selector_names()
+    {
+        std::vector<std::string> result{ "q_m",     "q_c",    "q_1",        "q_2",   "q_3",       "q_4",
+                                         "q_arith", "q_sort", "q_elliptic", "q_aux", "table_type" };
+        return result;
+    }
+    struct non_native_field_multiplication_cross_terms {
+        uint32_t lo_0_idx;
+        uint32_t lo_1_idx;
+        uint32_t hi_0_idx;
+        uint32_t hi_1_idx;
+        uint32_t hi_2_idx;
+        uint32_t hi_3_idx;
+    };
+
     /**
-     * @brief CircuitDataBackup is a structure we use to store all the information about the circuit that is needed to
-     * restore it back to a pre-finalized state
-     * @details In check_circuit method in UltraCircuitConstructor we want to check that the whole circuit works, but
-     * ultra circuits need to have ram, rom and range gates added in the end for the check to be complete as well as the
-     * set permutation check, so we finalize the circuit when we check it. This structure allows us to restore the
-     * circuit to the state before the finalization.
+     * @brief CircuitDataBackup is a structure we use to store all the information about the circuit that is needed
+     * to restore it back to a pre-finalized state
+     * @details In check_circuit method in UltraCircuitConstructor we want to check that the whole circuit works,
+     * but ultra circuits need to have ram, rom and range gates added in the end for the check to be complete as
+     * well as the set permutation check, so we finalize the circuit when we check it. This structure allows us to
+     * restore the circuit to the state before the finalization.
      */
     struct CircuitDataBackup {
         std::vector<uint32_t> public_inputs;
@@ -272,8 +262,8 @@ class UltraCircuitConstructor : public CircuitConstructorBase<arithmetization::U
         std::vector<uint32_t> memory_write_records;
         std::map<uint64_t, RangeList> range_lists;
 
-        std::vector<UltraCircuitConstructor::cached_non_native_field_multiplication>
-            cached_non_native_field_multiplications;
+        std::vector<UltraCircuitConstructor::cached_partial_non_native_field_multiplication>
+            cached_partial_non_native_field_multiplications;
 
         size_t num_gates;
         bool circuit_finalised = false;
@@ -326,14 +316,14 @@ class UltraCircuitConstructor : public CircuitConstructorBase<arithmetization::U
             stored_state.range_lists = circuit_constructor.range_lists;
             stored_state.circuit_finalised = circuit_constructor.circuit_finalised;
             stored_state.num_gates = circuit_constructor.num_gates;
-            stored_state.cached_non_native_field_multiplications =
-                circuit_constructor.cached_non_native_field_multiplications;
+            stored_state.cached_partial_non_native_field_multiplications =
+                circuit_constructor.cached_partial_non_native_field_multiplications;
             return stored_state;
         }
 
         /**
-         * @brief Stores the state of all members of the circuit constructor that are needed to restore the state after
-         * finalizing the circuit.
+         * @brief Stores the state of all members of the circuit constructor that are needed to restore the state
+         * after finalizing the circuit.
          *
          * @param circuit_constructor
          * @return CircuitDataBackup
@@ -363,8 +353,8 @@ class UltraCircuitConstructor : public CircuitConstructorBase<arithmetization::U
             stored_state.range_lists = circuit_constructor->range_lists;
             stored_state.circuit_finalised = circuit_constructor->circuit_finalised;
             stored_state.num_gates = circuit_constructor->num_gates;
-            stored_state.cached_non_native_field_multiplications =
-                circuit_constructor->cached_non_native_field_multiplications;
+            stored_state.cached_partial_non_native_field_multiplications =
+                circuit_constructor->cached_partial_non_native_field_multiplications;
 
             return stored_state;
         }
@@ -398,7 +388,8 @@ class UltraCircuitConstructor : public CircuitConstructorBase<arithmetization::U
             circuit_constructor->range_lists = range_lists;
             circuit_constructor->circuit_finalised = circuit_finalised;
             circuit_constructor->num_gates = num_gates;
-            circuit_constructor->cached_non_native_field_multiplications = cached_non_native_field_multiplications;
+            circuit_constructor->cached_partial_non_native_field_multiplications =
+                cached_partial_non_native_field_multiplications;
             circuit_constructor->w_l.resize(num_gates);
             circuit_constructor->w_r.resize(num_gates);
             circuit_constructor->w_o.resize(num_gates);
@@ -511,8 +502,8 @@ class UltraCircuitConstructor : public CircuitConstructorBase<arithmetization::U
             if (!(range_lists == circuit_constructor.range_lists)) {
                 return false;
             }
-            if (!(cached_non_native_field_multiplications ==
-                  circuit_constructor.cached_non_native_field_multiplications)) {
+            if (!(cached_partial_non_native_field_multiplications ==
+                  circuit_constructor.cached_partial_non_native_field_multiplications)) {
                 return false;
             }
             if (!(num_gates == circuit_constructor.num_gates)) {
@@ -555,7 +546,8 @@ class UltraCircuitConstructor : public CircuitConstructorBase<arithmetization::U
      * @brief Each entry in ram_arrays represents an independent RAM table.
      * RamTranscript tracks the current table state,
      * as well as the 'records' produced by each read and write operation.
-     * Used in `compute_proving_key` to generate consistency check gates required to validate the RAM read/write history
+     * Used in `compute_proving_key` to generate consistency check gates required to validate the RAM read/write
+     * history
      */
     std::vector<RamTranscript> ram_arrays;
 
@@ -572,7 +564,7 @@ class UltraCircuitConstructor : public CircuitConstructorBase<arithmetization::U
     // Stores gate index of RAM writes (required by proving key)
     std::vector<uint32_t> memory_write_records;
 
-    std::vector<cached_non_native_field_multiplication> cached_non_native_field_multiplications;
+    std::vector<cached_partial_non_native_field_multiplication> cached_partial_non_native_field_multiplications;
 
     void process_non_native_field_multiplications();
 
@@ -638,11 +630,11 @@ class UltraCircuitConstructor : public CircuitConstructorBase<arithmetization::U
             /**
              * N.B. if `variable_index` is not used in any arithmetic constraints, this will create an unsatisfiable
              *      circuit!
-             *      this range constraint will increase the size of the 'sorted set' of range-constrained integers by 1.
-             *      The 'non-sorted set' of range-constrained integers is a subset of the wire indices of all arithmetic
-             *      gates. No arithemtic gate => size imbalance between sorted and non-sorted sets. Checking for this
-             *      and throwing an error would require a refactor of the Composer to catelog all 'orphan' variables not
-             *      assigned to gates.
+             *      this range constraint will increase the size of the 'sorted set' of range-constrained integers
+             *      by 1. The 'non-sorted set' of range-constrained integers is a subset of the wire indices of all
+             *      arithmetic gates. No arithmetic gate => size imbalance between sorted and non-sorted sets.
+             *      Checking for this and throwing an error would require a refactor of the Composer to catalog all
+             *      'orphan' variables not assigned to gates.
              **/
             create_new_range_constraint(variable_index, 1ULL << num_bits, msg);
         } else {
@@ -704,8 +696,8 @@ class UltraCircuitConstructor : public CircuitConstructorBase<arithmetization::U
     //             }
     //         }
     //         ramcount += (ram_arrays[i].records.size() * NUMBER_OF_GATES_PER_RAM_ACCESS);
-    //         ramcount += NUMBER_OF_ARITHMETIC_GATES_PER_RAM_ARRAY; // we add an addition gate after procesing a ram
-    //         array
+    //         ramcount += NUMBER_OF_ARITHMETIC_GATES_PER_RAM_ARRAY; // we add an addition gate after processing a
+    //         ram array
 
     //         // there will be 'max_timestamp' number of range checks, need to calculate.
     //         const auto max_timestamp = ram_arrays[i].access_count - 1;
@@ -719,7 +711,8 @@ class UltraCircuitConstructor : public CircuitConstructorBase<arithmetization::U
     //         const size_t ram_range_check_list_size = max_timestamp + padding;
 
     //         size_t ram_range_check_gate_count = (ram_range_check_list_size / gate_width);
-    //         ram_range_check_gate_count += 1; // we need to add 1 extra addition gates for every distinct range list
+    //         ram_range_check_gate_count += 1; // we need to add 1 extra addition gate for every distinct range
+    //         list
 
     //         ram_range_sizes.push_back(ram_range_check_gate_count);
     //         ram_range_exists.push_back(false);
@@ -885,9 +878,9 @@ class UltraCircuitConstructor : public CircuitConstructorBase<arithmetization::U
                                    const size_t hi_limb_bits = DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
     std::array<uint32_t, 2> decompose_non_native_field_double_width_limb(
         const uint32_t limb_idx, const size_t num_limb_bits = (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS));
-    std::array<uint32_t, 2> queue_non_native_field_multiplication(
+    std::array<uint32_t, 2> evaluate_non_native_field_multiplication(
         const non_native_field_witnesses& input, const bool range_constrain_quotient_and_remainder = true);
-    std::array<uint32_t, 2> evaluate_partial_non_native_field_multiplication(const non_native_field_witnesses& input);
+    std::array<uint32_t, 2> queue_partial_non_native_field_multiplication(const non_native_field_witnesses& input);
     typedef std::pair<uint32_t, barretenberg::fr> scaled_witness;
     typedef std::tuple<scaled_witness, scaled_witness, barretenberg::fr> add_simple;
     std::array<uint32_t, 5> evaluate_non_native_field_subtraction(
diff --git a/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.test.cpp b/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.test.cpp
index aea6e78bde..1b9706e756 100644
--- a/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.test.cpp
+++ b/cpp/src/barretenberg/proof_system/circuit_constructors/ultra_circuit_constructor.test.cpp
@@ -643,7 +643,7 @@ TEST(ultra_circuit_constructor, non_native_field_multiplication)
     proof_system::UltraCircuitConstructor::non_native_field_witnesses inputs{
         a_indices, b_indices, q_indices, r_indices, modulus_limbs, fr(uint256_t(modulus)),
     };
-    const auto [lo_1_idx, hi_1_idx] = circuit_constructor.queue_non_native_field_multiplication(inputs);
+    const auto [lo_1_idx, hi_1_idx] = circuit_constructor.evaluate_non_native_field_multiplication(inputs);
     circuit_constructor.range_constrain_two_limbs(lo_1_idx, hi_1_idx, 70, 70);
 
     auto saved_state = UltraCircuitConstructor::CircuitDataBackup::store_full_state(circuit_constructor);
diff --git a/cpp/src/barretenberg/solidity_helpers/circuits/recursive_circuit.hpp b/cpp/src/barretenberg/solidity_helpers/circuits/recursive_circuit.hpp
index a6dcc0b87c..7274600b84 100644
--- a/cpp/src/barretenberg/solidity_helpers/circuits/recursive_circuit.hpp
+++ b/cpp/src/barretenberg/solidity_helpers/circuits/recursive_circuit.hpp
@@ -125,7 +125,7 @@ template <typename OuterComposer> class RecursiveCircuit {
             throw_or_abort("inner proof result != 1");
         }
 
-        circuit_output.aggregation_state.add_proof_outputs_as_public_inputs();
+        circuit_output.aggregation_state.assign_object_to_proof_outputs();
 
         if (outer_composer.failed()) {
             throw_or_abort("outer composer failed");
diff --git a/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.cpp b/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.cpp
index 6dbc70e5d1..c5ba72f8d3 100644
--- a/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.cpp
+++ b/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.cpp
@@ -14,19 +14,34 @@ using namespace plookup;
 using namespace barretenberg;
 
 template <typename C>
-point<C> pedersen_plookup_commitment<C>::compress_to_point(const field_t& left, const field_t& right)
+point<C> pedersen_plookup_commitment<C>::compress_to_point(const field_t& left,
+                                                           const field_t& right,
+                                                           const bool skip_rhs_range_check)
 {
     auto p2 = pedersen_plookup_hash<C>::hash_single(left, false);
-    auto p1 = pedersen_plookup_hash<C>::hash_single(right, true);
+    auto p1 = pedersen_plookup_hash<C>::hash_single(right, true, skip_rhs_range_check);
 
     return pedersen_plookup_hash<C>::add_points(p1, p2);
 }
 
-template <typename C> field_t<C> pedersen_plookup_commitment<C>::compress(const field_t& left, const field_t& right)
+template <typename C>
+field_t<C> pedersen_plookup_commitment<C>::compress(const field_t& left,
+                                                    const field_t& right,
+                                                    const bool skip_rhs_range_check)
 {
-    return compress_to_point(left, right).x;
+    return compress_to_point(left, right, skip_rhs_range_check).x;
 }
 
+/**
+ * @brief Compress a vector of field elements into a grumpkin point.
+ * This serves as the basis for a collision-resistant hash function.
+ * Note that this does NOT produce a hash that can be modelled as a random oracle.
+ *
+ * @tparam C
+ * @param inputs
+ * @param iv initialization vector
+ * @return point<C>
+ */
 template <typename C>
 point<C> pedersen_plookup_commitment<C>::merkle_damgard_compress(const std::vector<field_t>& inputs, const field_t& iv)
 {
@@ -34,13 +49,19 @@ point<C> pedersen_plookup_commitment<C>::merkle_damgard_compress(const std::vect
         return point{ 0, 0 };
     }
 
+    // The first two inputs to the Merkle-Damgard construction are the initialization vector and the number of elements
+    // being hashed. Including the length ensures that hashes of different lengths cannot collide. Starting the hash
+    // with these 2 inputs is optimal in the case that the IV is constant. i.e. the 1st 3 calls to `hash_single` are
+    // over constants and cost no constraints. r = H(iv, num_inputs) is constant and the 1st half of H(r, inputs[0]) is
+    // also constant
     auto result = plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_IV, iv)[ColumnIdx::C2][0];
     auto num_inputs = inputs.size();
-    for (size_t i = 0; i < num_inputs; i++) {
+    result = compress(result, field_t(num_inputs));
+    for (size_t i = 0; i < num_inputs - 1; i++) {
         result = compress(result, inputs[i]);
     }
 
-    return compress_to_point(result, field_t(num_inputs));
+    return compress_to_point(result, inputs[num_inputs - 1]);
 }
 
 template <typename C>
@@ -53,7 +74,9 @@ point<C> pedersen_plookup_commitment<C>::merkle_damgard_compress(const std::vect
     }
 
     auto result = plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_IV, 0)[ColumnIdx::C2][0];
-    for (size_t i = 0; i < 2 * num_inputs; i++) {
+    result = compress(result, field_t(num_inputs));
+
+    for (size_t i = 0; i < 2 * num_inputs - 1; i++) {
         if ((i & 1) == 0) {
             auto iv_result =
                 plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_IV, ivs[i >> 1])[ColumnIdx::C2][0];
@@ -63,7 +86,25 @@ point<C> pedersen_plookup_commitment<C>::merkle_damgard_compress(const std::vect
         }
     }
 
-    return compress_to_point(result, field_t(num_inputs));
+    return compress_to_point(result, inputs[num_inputs - 1]);
+}
+
+template <typename C>
+point<C> pedersen_plookup_commitment<C>::merkle_damgard_compress_with_relaxed_range_constraints(
+    const std::vector<field_t>& inputs, const field_t& iv)
+{
+    if (inputs.size() == 0) {
+        return point{ 0, 0 };
+    }
+
+    auto result = plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_IV, iv)[ColumnIdx::C2][0];
+    auto num_inputs = inputs.size();
+    result = compress(result, field_t(num_inputs));
+    for (size_t i = 0; i < num_inputs - 1; i++) {
+        result = compress(result, inputs[i], true);
+    }
+
+    return compress_to_point(result, inputs[num_inputs - 1], true);
 }
 
 template <typename C>
@@ -102,6 +143,13 @@ point<C> pedersen_plookup_commitment<C>::commit(const std::vector<field_t>& inpu
     return merkle_damgard_compress(inputs, field_t(hash_index));
 }
 
+template <typename C>
+point<C> pedersen_plookup_commitment<C>::commit_with_relaxed_range_constraints(const std::vector<field_t>& inputs,
+                                                                               const size_t hash_index)
+{
+    return merkle_damgard_compress_with_relaxed_range_constraints(inputs, field_t(hash_index));
+}
+
 template <typename C>
 point<C> pedersen_plookup_commitment<C>::commit(const std::vector<field_t>& inputs,
                                                 const std::vector<size_t>& hash_indices)
@@ -114,6 +162,24 @@ point<C> pedersen_plookup_commitment<C>::commit(const std::vector<field_t>& inpu
     return merkle_damgard_compress(inputs, hash_indices_);
 }
 
+/**
+ * @brief Calls `compress` but instructs the Pedersen hash method `hash_single`
+ * to not apply range constraints on the input elements.
+ *
+ * Use this method when the input elements are known to be < 2^252 (i.e. at most 252 bits)
+ *
+ * @tparam C
+ * @param inputs
+ * @param hash_index
+ * @return field_t<C>
+ */
+template <typename C>
+field_t<C> pedersen_plookup_commitment<C>::compress_with_relaxed_range_constraints(const std::vector<field_t>& inputs,
+                                                                                   const size_t hash_index)
+{
+    return commit_with_relaxed_range_constraints(inputs, hash_index).x;
+}
+
 template <typename C>
 field_t<C> pedersen_plookup_commitment<C>::compress(const std::vector<field_t>& inputs, const size_t hash_index)
 {
diff --git a/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.hpp b/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.hpp
index 42cc8252d2..90076a3035 100644
--- a/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.hpp
+++ b/cpp/src/barretenberg/stdlib/commitment/pedersen/pedersen_plookup.hpp
@@ -17,14 +17,18 @@ template <typename ComposerContext> class pedersen_plookup_commitment {
   public:
     static point commit(const std::vector<field_t>& inputs, const size_t hash_index = 0);
     static point commit(const std::vector<field_t>& inputs, const std::vector<size_t>& hash_indices);
+    static point commit_with_relaxed_range_constraints(const std::vector<field_t>& inputs, const size_t hash_index = 0);
 
-    static field_t compress(const field_t& left, const field_t& right);
+    static field_t compress(const field_t& left, const field_t& right, const bool skip_rhs_range_check = false);
     static field_t compress(const std::vector<field_t>& inputs, const size_t hash_index = 0);
     static field_t compress(const packed_byte_array& input) { return compress(input.get_limbs()); }
 
     static field_t compress(const std::vector<field_t>& inputs, const std::vector<size_t>& hash_indices);
     static field_t compress(const std::vector<std::pair<field_t, size_t>>& input_pairs);
 
+    static field_t compress_with_relaxed_range_constraints(const std::vector<field_t>& inputs,
+                                                           const size_t hash_index = 0);
+
     template <size_t T> static field_t compress(const std::array<field_t, T>& inputs)
     {
         std::vector<field_t> in(inputs.begin(), inputs.end());
@@ -33,9 +37,12 @@ template <typename ComposerContext> class pedersen_plookup_commitment {
 
     static point merkle_damgard_compress(const std::vector<field_t>& inputs, const field_t& iv);
     static point merkle_damgard_compress(const std::vector<field_t>& inputs, const std::vector<field_t>& ivs);
+    static point merkle_damgard_compress_with_relaxed_range_constraints(const std::vector<field_t>& inputs,
+                                                                        const field_t& iv);
+
     static point merkle_damgard_tree_compress(const std::vector<field_t>& inputs, const std::vector<field_t>& ivs);
 
-    static point compress_to_point(const field_t& left, const field_t& right);
+    static point compress_to_point(const field_t& left, const field_t& right, const bool skip_rhs_range_check = false);
 };
 
 extern template class pedersen_plookup_commitment<UltraComposer>;
diff --git a/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.cpp b/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.cpp
index 2801648f54..9b5e84cda0 100644
--- a/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.cpp
+++ b/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.cpp
@@ -79,7 +79,8 @@ point<C> pedersen_plookup_hash<C>::add_points(const point& p1, const point& p2,
 /**
  * Hash a single field element using lookup tables.
  */
-template <typename C> point<C> pedersen_plookup_hash<C>::hash_single(const field_t& scalar, const bool parity)
+template <typename C>
+point<C> pedersen_plookup_hash<C>::hash_single(const field_t& scalar, const bool parity, const bool skip_range_check)
 {
     if (scalar.is_constant()) {
         C* ctx = scalar.get_context();
@@ -93,6 +94,10 @@ template <typename C> point<C> pedersen_plookup_hash<C>::hash_single(const field
     const field_t y_lo = witness_t(ctx, uint256_t(scalar.get_value()).slice(0, 126));
 
     ReadData<field_t> lookup_hi, lookup_lo;
+
+    // If `skip_range_check = true`, this implies the input scalar is 252 bits maximum.
+    // i.e. we do not require a check that scalar slice sums < p.
+    // We can also likely use a multitable with 1 less lookup
     if (parity) {
         lookup_lo = plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_RIGHT_LO, y_lo);
         lookup_hi = plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_RIGHT_HI, y_hi);
@@ -101,17 +106,35 @@ template <typename C> point<C> pedersen_plookup_hash<C>::hash_single(const field
         lookup_hi = plookup_read::get_lookup_accumulators(MultiTableId::PEDERSEN_LEFT_HI, y_hi);
     }
 
-    // Check if (r_hi - y_hi) is 128 bits and if (r_hi - y_hi) == 0, then
-    // (r_lo - y_lo) must be 126 bits.
-    constexpr uint256_t modulus = fr::modulus;
-    const field_t r_lo = witness_t(ctx, modulus.slice(0, 126));
-    const field_t r_hi = witness_t(ctx, modulus.slice(126, 256));
+    // validate slices equal scalar
+    // TODO(suyash?): can remove this gate if we use a single lookup accumulator for HI + LO combined
+    //       can recover y_hi, y_lo from Column 1 of the lookup accumulator output
+    scalar.add_two(-y_hi * (uint256_t(1) << 126), -y_lo).assert_equal(0);
+
+    // if skip_range_check = true we assume input max size is 252 bits => final lookup scalar slice value must be 0
+    if (skip_range_check) {
+        lookup_hi[ColumnIdx::C1][lookup_hi[ColumnIdx::C1].size() - 1].assert_equal(0);
+    }
+    if (!skip_range_check) {
+        // Check that y_hi * 2^126 + y_lo < fr::modulus when evaluated over the integers
+        constexpr uint256_t modulus = fr::modulus;
+        const field_t r_lo = field_t(ctx, modulus.slice(0, 126));
+        const field_t r_hi = field_t(ctx, modulus.slice(126, 256));
 
-    const field_t term_hi = r_hi - y_hi;
-    const field_t term_lo = (r_lo - y_lo) * field_t(term_hi == field_t(0));
-    term_hi.normalize().create_range_constraint(128);
-    term_lo.normalize().create_range_constraint(126);
+        bool need_borrow = (uint256_t(y_lo.get_value()) > uint256_t(r_lo.get_value()));
+        field_t borrow = field_t::from_witness(ctx, need_borrow);
 
+        // directly call `create_new_range_constraint` to avoid creating an arithmetic gate
+        scalar.get_context()->create_new_range_constraint(borrow.get_witness_index(), 1, "borrow");
+
+        // Hi range check = r_hi - y_hi - borrow
+        // Lo range check = r_lo - y_lo + borrow * 2^{126}
+        field_t hi = (r_hi - y_hi) - borrow;
+        field_t lo = (r_lo - y_lo) + (borrow * (uint256_t(1) << 126));
+
+        hi.create_range_constraint(128);
+        lo.create_range_constraint(126);
+    }
     const size_t num_lookups_lo = lookup_lo[ColumnIdx::C1].size();
     const size_t num_lookups_hi = lookup_hi[ColumnIdx::C1].size();
 
diff --git a/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.hpp b/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.hpp
index 5d099c5c3f..2467e3fdb5 100644
--- a/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.hpp
+++ b/cpp/src/barretenberg/stdlib/hash/pedersen/pedersen_plookup.hpp
@@ -23,7 +23,7 @@ template <typename ComposerContext> class pedersen_plookup_hash {
   public:
     static point add_points(const point& p1, const point& p2, const AddType add_type = ONE);
 
-    static point hash_single(const field_t& in, const bool parity);
+    static point hash_single(const field_t& in, const bool parity, const bool skip_range_check = false);
 
     static field_t hash_multiple(const std::vector<field_t>& in, const size_t hash_index = 0);
 };
diff --git a/cpp/src/barretenberg/stdlib/primitives/bigfield/bigfield_impl.hpp b/cpp/src/barretenberg/stdlib/primitives/bigfield/bigfield_impl.hpp
index d324bcc4af..96d4c022df 100644
--- a/cpp/src/barretenberg/stdlib/primitives/bigfield/bigfield_impl.hpp
+++ b/cpp/src/barretenberg/stdlib/primitives/bigfield/bigfield_impl.hpp
@@ -593,18 +593,19 @@ template <typename C, typename T> bigfield<C, T> bigfield<C, T>::operator-(const
     result.binary_basis_limbs[3].element = binary_basis_limbs[3].element + barretenberg::fr(to_add_3);
 
     if constexpr (C::type == ComposerType::PLOOKUP) {
-        if (result.prime_basis_limb.multiplicative_constant == 1 &&
-            other.prime_basis_limb.multiplicative_constant == 1 && !result.is_constant() && !other.is_constant()) {
+        if (prime_basis_limb.multiplicative_constant == 1 && other.prime_basis_limb.multiplicative_constant == 1 &&
+            !is_constant() && !other.is_constant()) {
             bool limbconst = result.binary_basis_limbs[0].element.is_constant();
             limbconst = limbconst || result.binary_basis_limbs[1].element.is_constant();
             limbconst = limbconst || result.binary_basis_limbs[2].element.is_constant();
             limbconst = limbconst || result.binary_basis_limbs[3].element.is_constant();
-            limbconst = limbconst || result.prime_basis_limb.is_constant();
+            limbconst = limbconst || prime_basis_limb.is_constant();
             limbconst = limbconst || other.binary_basis_limbs[0].element.is_constant();
             limbconst = limbconst || other.binary_basis_limbs[1].element.is_constant();
             limbconst = limbconst || other.binary_basis_limbs[2].element.is_constant();
             limbconst = limbconst || other.binary_basis_limbs[3].element.is_constant();
             limbconst = limbconst || other.prime_basis_limb.is_constant();
+            limbconst = limbconst || (prime_basis_limb.witness_index == other.prime_basis_limb.witness_index);
             if (!limbconst) {
                 std::pair<uint32_t, barretenberg::fr> x0{ result.binary_basis_limbs[0].element.witness_index,
                                                           binary_basis_limbs[0].element.multiplicative_constant };
@@ -631,10 +632,11 @@ template <typename C, typename T> bigfield<C, T> bigfield<C, T>::operator-(const
                 barretenberg::fr c3(result.binary_basis_limbs[3].element.additive_constant -
                                     other.binary_basis_limbs[3].element.additive_constant);
 
-                uint32_t xp(result.prime_basis_limb.witness_index);
+                uint32_t xp(prime_basis_limb.witness_index);
                 uint32_t yp(other.prime_basis_limb.witness_index);
-                barretenberg::fr cp(result.prime_basis_limb.additive_constant -
-                                    other.prime_basis_limb.additive_constant);
+                barretenberg::fr cp(prime_basis_limb.additive_constant - other.prime_basis_limb.additive_constant);
+                uint512_t constant_to_add_mod_p = (constant_to_add) % prime_basis.modulus;
+                cp += barretenberg::fr(constant_to_add_mod_p.lo);
 
                 const auto output_witnesses = ctx->evaluate_non_native_field_subtraction(
                     { x0, y0, c0 }, { x1, y1, c1 }, { x2, y2, c2 }, { x3, y3, c3 }, { xp, yp, cp });
@@ -1982,7 +1984,7 @@ void bigfield<C, T>::unsafe_evaluate_multiply_add(const bigfield& input_left,
             modulus,
         };
         // N.B. this method also evaluates the prime field component of the non-native field mul
-        const auto [lo_idx, hi_idx] = ctx->queue_non_native_field_multiplication(witnesses, false);
+        const auto [lo_idx, hi_idx] = ctx->evaluate_non_native_field_multiplication(witnesses, false);
 
         barretenberg::fr neg_prime = -barretenberg::fr(uint256_t(target_basis.modulus));
         field_t<C>::evaluate_polynomial_identity(left.prime_basis_limb,
@@ -2267,7 +2269,7 @@ void bigfield<C, T>::unsafe_evaluate_multiple_multiply_add(const std::vector<big
         // we set `neg_modulus = [2^{136}, 0, 0, 0]` and `q = [lo_1, 0, hi_1, 0]`, then we will add `lo_1` into
         // `lo`, and `lo_1/2^{136} + hi_1` into `hi`. we can then subtract off `lo_1/2^{136}` from `hi`, by setting
         // `r = [0, 0, lo_1, 0]` This saves us 2 addition gates as we don't have to add together the outputs of two
-        // calls to `queue_non_native_field_multiplication`
+        // calls to `evaluate_non_native_field_multiplication`
         std::vector<field_t<C>> limb_0_accumulator;
         std::vector<field_t<C>> limb_2_accumulator;
         std::vector<field_t<C>> prime_limb_accumulator;
@@ -2320,7 +2322,7 @@ void bigfield<C, T>::unsafe_evaluate_multiple_multiply_add(const std::vector<big
                     modulus,
                 };
 
-                const auto [lo_2_idx, hi_2_idx] = ctx->evaluate_partial_non_native_field_multiplication(mul_witnesses);
+                const auto [lo_2_idx, hi_2_idx] = ctx->queue_partial_non_native_field_multiplication(mul_witnesses);
 
                 field_t<C> lo_2 = field_t<C>::from_witness_index(ctx, lo_2_idx);
                 field_t<C> hi_2 = field_t<C>::from_witness_index(ctx, hi_2_idx);
@@ -2416,7 +2418,7 @@ void bigfield<C, T>::unsafe_evaluate_multiple_multiply_add(const std::vector<big
             modulus,
         };
 
-        const auto [lo_1_idx, hi_1_idx] = ctx->queue_non_native_field_multiplication(witnesses, false);
+        const auto [lo_1_idx, hi_1_idx] = ctx->evaluate_non_native_field_multiplication(witnesses, false);
 
         barretenberg::fr neg_prime = -barretenberg::fr(uint256_t(target_basis.modulus));
 
diff --git a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.hpp b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.hpp
index 29d8e472c1..4330c5b6b4 100644
--- a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.hpp
+++ b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.hpp
@@ -50,15 +50,15 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
 
     void validate_on_curve() const
     {
-        Fq xx = x.sqr();
-        Fq rhs = y.sqr();
         Fq b(get_context(), uint256_t(NativeGroup::curve_b));
-        Fq lhs = xx.madd(x, { b });
-        if constexpr (NativeGroup::has_a) {
+        if constexpr (!NativeGroup::has_a) {
+            // we validate y^2 = x^3 + b, i.e. x^2 * x + y * (-y) + b = 0, via "fix_remainder_zero = true" in mult_madd
+            Fq::mult_madd({ x.sqr(), y }, { x, -y }, { b }, true);
+        } else {
             Fq a(get_context(), uint256_t(NativeGroup::curve_a));
-            lhs = lhs + (a * x);
+            // we validate y^2 = x^3 + ax + b, i.e. x^2 * x + x * a + y * (-y) + b = 0, via "fix_remainder_zero = true"
+            Fq::mult_madd({ x.sqr(), x, y }, { x, a, -y }, { b }, true);
         }
-        lhs.assert_equal(rhs);
     }
 
     static element one(Composer* ctx)
@@ -99,6 +99,7 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
         *this = *this - other;
         return *this;
     }
+    std::array<element, 2> add_sub(const element& other) const;
 
     element operator*(const Fr& other) const;
 
@@ -139,7 +140,7 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
         bool is_element = false;
 
         chain_add_accumulator(){};
-        explicit chain_add_accumulator(element& input)
+        explicit chain_add_accumulator(const element& input)
         {
             x3_prev = input.x;
             y3_prev = input.y;
@@ -161,10 +162,8 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
 
     element montgomery_ladder(const element& other) const;
     element montgomery_ladder(const chain_add_accumulator& accumulator);
-    element double_montgomery_ladder(const element& add1, const element& add2) const;
-    element double_montgomery_ladder(const chain_add_accumulator& add1, const element& add2) const;
-    element double_montgomery_ladder(const chain_add_accumulator& add1, const chain_add_accumulator& add2) const;
-    element double_into_montgomery_ladder(const element& to_add) const;
+    element multiple_montgomery_ladder(const std::vector<chain_add_accumulator>& to_add) const;
+    element quadruple_and_add(const std::vector<element>& to_add) const;
 
     typename NativeGroup::affine_element get_value() const
     {
@@ -256,12 +255,13 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
     template <size_t num_elements,
               typename = typename std::enable_if<std::is_same<Composer, plonk::UltraComposer>::value>>
     static std::array<twin_rom_table<Composer>, 5> create_group_element_rom_tables(
-        const std::array<element, num_elements>& elements);
+        const std::array<element, num_elements>& elements, std::array<uint256_t, 8>& limb_max);
 
     template <size_t num_elements,
               typename = typename std::enable_if<std::is_same<Composer, plonk::UltraComposer>::value>>
     static element read_group_element_rom_tables(const std::array<twin_rom_table<Composer>, 5>& tables,
-                                                 const field_t<Composer>& index);
+                                                 const field_t<Composer>& index,
+                                                 const std::array<uint256_t, 8>& limb_max);
 
     static std::pair<element, element> compute_offset_generators(const size_t num_rounds);
 
@@ -277,6 +277,7 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
         element operator[](const size_t idx) const { return element_table[idx]; }
         std::array<element, 16> element_table;
         std::array<twin_rom_table<Composer>, 5> coordinates;
+        std::array<uint256_t, 8> limb_max; // tracks the maximum limb size represented in each element_table entry
     };
 
     template <typename = typename std::enable_if<std::is_same<Composer, plonk::UltraComposer>::value>>
@@ -310,7 +311,6 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
             P1.element_table[i] = P1.element_table[i - 1] + d2;
         }
         for (size_t i = 0; i < 8; ++i) {
-            // TODO: DO WE NEED TO REDUCE THESE ELEMENTS????
             P1.element_table[i] = (-P1.element_table[15 - i]);
         }
         for (size_t i = 0; i < 16; ++i) {
@@ -322,8 +322,8 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
             endoP1.element_table[i].x = P1.element_table[i].x * beta;
             endoP1.element_table[15 - i].x = endoP1.element_table[i].x;
         }
-        P1.coordinates = create_group_element_rom_tables<16>(P1.element_table);
-        endoP1.coordinates = create_group_element_rom_tables<16>(endoP1.element_table);
+        P1.coordinates = create_group_element_rom_tables<16>(P1.element_table, P1.limb_max);
+        endoP1.coordinates = create_group_element_rom_tables<16>(endoP1.element_table, endoP1.limb_max);
         auto result = std::make_pair<four_bit_table_plookup<>, four_bit_table_plookup<>>(
             (four_bit_table_plookup<>)P1, (four_bit_table_plookup<>)endoP1);
         return result;
@@ -391,6 +391,7 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
 
         std::array<element, table_size> element_table;
         std::array<twin_rom_table<Composer>, 5> coordinates;
+        std::array<uint256_t, 8> limb_max;
     };
 
     using twin_lookup_table = typename std::
@@ -418,10 +419,10 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
                 endo_table.element_table[i + 8].x = base_table[7 - i].x * beta;
                 endo_table.element_table[i + 8].y = base_table[7 - i].y;
 
-                endo_table.element_table[7 - i] = (-endo_table.element_table[i + 8]).reduce();
+                endo_table.element_table[7 - i] = (-endo_table.element_table[i + 8]);
             }
 
-            endo_table.coordinates = create_group_element_rom_tables<16>(endo_table.element_table);
+            endo_table.coordinates = create_group_element_rom_tables<16>(endo_table.element_table, endo_table.limb_max);
         } else {
             std::array<element, 4> endo_inputs(inputs);
             for (auto& input : endo_inputs) {
@@ -451,10 +452,10 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
                 endo_table.element_table[i + 16].x = base_table[15 - i].x * beta;
                 endo_table.element_table[i + 16].y = base_table[15 - i].y;
 
-                endo_table.element_table[15 - i] = (-endo_table.element_table[i + 16]).reduce();
+                endo_table.element_table[15 - i] = (-endo_table.element_table[i + 16]);
             }
 
-            endo_table.coordinates = create_group_element_rom_tables<32>(endo_table.element_table);
+            endo_table.coordinates = create_group_element_rom_tables<32>(endo_table.element_table, endo_table.limb_max);
         }
         return std::make_pair<lookup_table_plookup<5>, lookup_table_plookup<5>>((lookup_table_plookup<5>)base_table,
                                                                                 (lookup_table_plookup<5>)endo_table);
@@ -472,11 +473,16 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
             num_points = points.size();
             num_fives = num_points / 5;
 
+            // size-6 tables are expensive and only benefit us if creating them reduces the total number of tables
             if (num_fives * 5 == (num_points - 1)) {
                 num_fives -= 1;
                 num_sixes = 1;
-            } else {
-                num_sixes = 0;
+            } else if (num_fives * 5 == (num_points - 2) && num_fives >= 2) {
+                num_fives -= 2;
+                num_sixes = 2;
+            } else if (num_fives * 5 == (num_points - 3) && num_fives >= 3) {
+                num_fives -= 3;
+                num_sixes = 3;
             }
 
             has_quad = ((num_fives * 5 + num_sixes * 6) < num_points - 3) && (num_points >= 4);
@@ -490,33 +496,40 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
             has_singleton = num_points != ((num_fives * 5 + num_sixes * 6) + ((size_t)has_quad * 4) +
                                            ((size_t)has_triple * 3) + ((size_t)has_twin * 2));
 
+            size_t offset = 0;
+            for (size_t i = 0; i < num_sixes; ++i) {
+                six_tables.push_back(lookup_table_plookup<6>({
+                    points[offset + 6 * i],
+                    points[offset + 6 * i + 1],
+                    points[offset + 6 * i + 2],
+                    points[offset + 6 * i + 3],
+                    points[offset + 6 * i + 4],
+                    points[offset + 6 * i + 5],
+                }));
+            }
+            offset += 6 * num_sixes;
             for (size_t i = 0; i < num_fives; ++i) {
-                five_tables.push_back(lookup_table_plookup<5>(
-                    { points[5 * i], points[5 * i + 1], points[5 * i + 2], points[5 * i + 3], points[5 * i + 4] }));
-            }
-
-            if (num_sixes == 1) {
-                six_tables.push_back(lookup_table_plookup<6>({ points[5 * num_fives],
-                                                               points[5 * num_fives + 1],
-                                                               points[5 * num_fives + 2],
-                                                               points[5 * num_fives + 3],
-                                                               points[5 * num_fives + 4],
-                                                               points[5 * num_fives + 5] }));
+                five_tables.push_back(lookup_table_plookup<5>({
+                    points[offset + 5 * i],
+                    points[offset + 5 * i + 1],
+                    points[offset + 5 * i + 2],
+                    points[offset + 5 * i + 3],
+                    points[offset + 5 * i + 4],
+                }));
             }
+            offset += 5 * num_fives;
 
             if (has_quad) {
-                quad_tables.push_back(quad_lookup_table({ points[5 * num_fives],
-                                                          points[5 * num_fives + 1],
-                                                          points[5 * num_fives + 2],
-                                                          points[5 * num_fives + 3] }));
+                quad_tables.push_back(
+                    quad_lookup_table({ points[offset], points[offset + 1], points[offset + 2], points[offset + 3] }));
             }
 
             if (has_triple) {
-                triple_tables.push_back(triple_lookup_table(
-                    { points[5 * num_fives], points[5 * num_fives + 1], points[5 * num_fives + 2] }));
+                triple_tables.push_back(
+                    triple_lookup_table({ points[offset], points[offset + 1], points[offset + 2] }));
             }
             if (has_twin) {
-                twin_tables.push_back(twin_lookup_table({ points[5 * num_fives], points[5 * num_fives + 1] }));
+                twin_tables.push_back(twin_lookup_table({ points[offset], points[offset + 1] }));
             }
 
             if (has_singleton) {
@@ -587,37 +600,36 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
         element::chain_add_accumulator get_chain_add_accumulator(std::vector<bool_t<Composer>>& naf_entries) const
         {
             std::vector<element> round_accumulator;
+            for (size_t j = 0; j < num_sixes; ++j) {
+                round_accumulator.push_back(six_tables[j].get({ naf_entries[6 * j],
+                                                                naf_entries[6 * j + 1],
+                                                                naf_entries[6 * j + 2],
+                                                                naf_entries[6 * j + 3],
+                                                                naf_entries[6 * j + 4],
+                                                                naf_entries[6 * j + 5] }));
+            }
+            size_t offset = num_sixes * 6;
             for (size_t j = 0; j < num_fives; ++j) {
-                round_accumulator.push_back(five_tables[j].get({ naf_entries[5 * j],
-                                                                 naf_entries[5 * j + 1],
-                                                                 naf_entries[5 * j + 2],
-                                                                 naf_entries[5 * j + 3],
-                                                                 naf_entries[5 * j + 4] }));
-            }
-
-            if (num_sixes == 1) {
-                round_accumulator.push_back(six_tables[0].get({ naf_entries[num_fives * 5],
-                                                                naf_entries[num_fives * 5 + 1],
-                                                                naf_entries[num_fives * 5 + 2],
-                                                                naf_entries[num_fives * 5 + 3],
-                                                                naf_entries[num_fives * 5 + 4],
-                                                                naf_entries[num_fives * 5 + 5] }));
+                round_accumulator.push_back(five_tables[j].get({ naf_entries[offset + j * 5],
+                                                                 naf_entries[offset + j * 5 + 1],
+                                                                 naf_entries[offset + j * 5 + 2],
+                                                                 naf_entries[offset + j * 5 + 3],
+                                                                 naf_entries[offset + j * 5 + 4] }));
             }
-
+            offset += num_fives * 5;
             if (has_quad) {
-                round_accumulator.push_back(quad_tables[0].get({ naf_entries[num_fives * 5],
-                                                                 naf_entries[num_fives * 5 + 1],
-                                                                 naf_entries[num_fives * 5 + 2],
-                                                                 naf_entries[num_fives * 5 + 3] }));
+                round_accumulator.push_back(quad_tables[0].get({ naf_entries[offset],
+                                                                 naf_entries[offset + 1],
+                                                                 naf_entries[offset + 2],
+                                                                 naf_entries[offset + 3] }));
             }
 
             if (has_triple) {
-                round_accumulator.push_back(triple_tables[0].get(
-                    { naf_entries[num_fives * 5], naf_entries[num_fives * 5 + 1], naf_entries[num_fives * 5 + 2] }));
+                round_accumulator.push_back(
+                    triple_tables[0].get({ naf_entries[offset], naf_entries[offset + 1], naf_entries[offset + 2] }));
             }
             if (has_twin) {
-                round_accumulator.push_back(
-                    twin_tables[0].get({ naf_entries[num_fives * 5], naf_entries[num_fives * 5 + 1] }));
+                round_accumulator.push_back(twin_tables[0].get({ naf_entries[offset], naf_entries[offset + 1] }));
             }
             if (has_singleton) {
                 round_accumulator.push_back(singletons[0].conditional_negate(naf_entries[num_points - 1]));
@@ -640,37 +652,37 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
         element get(std::vector<bool_t<Composer>>& naf_entries) const
         {
             std::vector<element> round_accumulator;
-            for (size_t j = 0; j < num_fives; ++j) {
-                round_accumulator.push_back(five_tables[j].get({ naf_entries[5 * j],
-                                                                 naf_entries[5 * j + 1],
-                                                                 naf_entries[5 * j + 2],
-                                                                 naf_entries[5 * j + 3],
-                                                                 naf_entries[5 * j + 4] }));
+            for (size_t j = 0; j < num_sixes; ++j) {
+                round_accumulator.push_back(six_tables[j].get({ naf_entries[6 * j],
+                                                                naf_entries[6 * j + 1],
+                                                                naf_entries[6 * j + 2],
+                                                                naf_entries[6 * j + 3],
+                                                                naf_entries[6 * j + 4],
+                                                                naf_entries[6 * j + 5] }));
             }
+            size_t offset = num_sixes * 6;
 
-            if (num_sixes == 1) {
-                round_accumulator.push_back(six_tables[0].get({ naf_entries[num_fives * 5],
-                                                                naf_entries[num_fives * 5 + 1],
-                                                                naf_entries[num_fives * 5 + 2],
-                                                                naf_entries[num_fives * 5 + 3],
-                                                                naf_entries[num_fives * 5 + 4],
-                                                                naf_entries[num_fives * 5 + 5] }));
+            for (size_t j = 0; j < num_fives; ++j) {
+                round_accumulator.push_back(five_tables[j].get({ naf_entries[offset + 5 * j],
+                                                                 naf_entries[offset + 5 * j + 1],
+                                                                 naf_entries[offset + 5 * j + 2],
+                                                                 naf_entries[offset + 5 * j + 3],
+                                                                 naf_entries[offset + 5 * j + 4] }));
             }
 
+            offset += num_fives * 5;
+
             if (has_quad) {
-                round_accumulator.push_back(quad_tables[0].get(naf_entries[num_fives * 5],
-                                                               naf_entries[num_fives * 5 + 1],
-                                                               naf_entries[num_fives * 5 + 2],
-                                                               naf_entries[num_fives * 5 + 3]));
+                round_accumulator.push_back(quad_tables[0].get(
+                    naf_entries[offset], naf_entries[offset + 1], naf_entries[offset + 2], naf_entries[offset + 3]));
             }
 
             if (has_triple) {
-                round_accumulator.push_back(triple_tables[0].get(
-                    naf_entries[num_fives * 5], naf_entries[num_fives * 5 + 1], naf_entries[num_fives * 5 + 2]));
+                round_accumulator.push_back(
+                    triple_tables[0].get(naf_entries[offset], naf_entries[offset + 1], naf_entries[offset + 2]));
             }
             if (has_twin) {
-                round_accumulator.push_back(
-                    twin_tables[0].get(naf_entries[num_fives * 5], naf_entries[num_fives * 5 + 1]));
+                round_accumulator.push_back(twin_tables[0].get(naf_entries[offset], naf_entries[offset + 1]));
             }
             if (has_singleton) {
                 round_accumulator.push_back(singletons[0].conditional_negate(naf_entries[num_points - 1]));
@@ -862,67 +874,6 @@ template <class Composer, class Fq, class Fr, class NativeGroup> class element {
             return element::chain_add_end(accumulator);
         }
 
-        // chain_add_accumulator get_chain_initial_entry() const
-        // {
-        //     std::vector<element> add_accumulator;
-        //     for (size_t i = 0; i < num_quads; ++i) {
-        //         add_accumulator.push_back(quad_tables[i][0]);
-        //     }
-        //     if (has_twin) {
-        //         add_accumulator.push_back(twin_tables[0][0]);
-        //     }
-        //     if (has_triple) {
-        //         add_accumulator.push_back(triple_tables[0][0]);
-        //     }
-        //     if (has_singleton) {
-        //         add_accumulator.push_back(singletons[0]);
-        //     }
-        //     if (add_accumulator.size() >= 2) {
-        //         chain_add_accumulator output = element::chain_add_start(add_accumulator[0], add_accumulator[1]);
-        //         for (size_t i = 2; i < add_accumulator.size(); ++i) {
-        //             output = element::chain_add(add_accumulator[i], output);
-        //         }
-        //         return output;
-        //     }
-        //     return chain_add_accumulator(add_accumulator[0]);
-        // }
-
-        // element::chain_add_accumulator get_chain_add_accumulator(std::vector<bool_t<Composer>>& naf_entries) const
-        // {
-        //     std::vector<element> round_accumulator;
-        //     for (size_t j = 0; j < num_quads; ++j) {
-        //         round_accumulator.push_back(quad_tables[j].get(
-        //             naf_entries[4 * j], naf_entries[4 * j + 1], naf_entries[4 * j + 2], naf_entries[4 * j + 3]));
-        //     }
-
-        //     if (has_triple) {
-        //         round_accumulator.push_back(triple_tables[0].get(
-        //             naf_entries[num_quads * 4], naf_entries[num_quads * 4 + 1], naf_entries[num_quads * 4 + 2]));
-        //     }
-        //     if (has_twin) {
-        //         round_accumulator.push_back(
-        //             twin_tables[0].get(naf_entries[num_quads * 4], naf_entries[num_quads * 4 + 1]));
-        //     }
-        //     if (has_singleton) {
-        //         round_accumulator.push_back(singletons[0].conditional_negate(naf_entries[num_points - 1]));
-        //     }
-
-        //     element::chain_add_accumulator accumulator;
-        //     if (round_accumulator.size() == 1) {
-        //         accumulator.x3_prev = round_accumulator[0].x;
-        //         accumulator.y3_prev = round_accumulator[0].y;
-        //         accumulator.is_element = true;
-        //         return accumulator;
-        //     } else if (round_accumulator.size() == 2) {
-        //         return element::chain_add_start(round_accumulator[0], round_accumulator[1]);
-        //     } else {
-        //         accumulator = element::chain_add_start(round_accumulator[0], round_accumulator[1]);
-        //         for (size_t j = 2; j < round_accumulator.size(); ++j) {
-        //             accumulator = element::chain_add(round_accumulator[j], accumulator);
-        //         }
-        //     }
-        //     return (accumulator);
-        // }
         std::vector<quad_lookup_table> quad_tables;
         std::vector<triple_lookup_table> triple_tables;
         std::vector<twin_lookup_table> twin_tables;
diff --git a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.test.cpp b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.test.cpp
index e3b6576e9f..85b4e2325c 100644
--- a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.test.cpp
+++ b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup.test.cpp
@@ -415,7 +415,7 @@ template <typename TestType> class stdlib_biggroup : public testing::Test {
         EXPECT_VERIFICATION(composer);
     }
 
-    static void test_double_montgomery_ladder()
+    static void test_multiple_montgomery_ladder()
     {
         Composer composer = Composer();
         size_t num_repetitions = 10;
@@ -423,19 +423,17 @@ template <typename TestType> class stdlib_biggroup : public testing::Test {
             affine_element acc_small(element::random_element());
             element_ct acc_big = element_ct::from_witness(&composer, acc_small);
 
-            affine_element add_1_small_0(element::random_element());
-            element_ct add_1_big_0 = element_ct::from_witness(&composer, add_1_small_0);
-            affine_element add_2_small_0(element::random_element());
-            element_ct add_2_big_0 = element_ct::from_witness(&composer, add_2_small_0);
-
-            affine_element add_1_small_1(element::random_element());
-            element_ct add_1_big_1 = element_ct::from_witness(&composer, add_1_small_1);
-            affine_element add_2_small_1(element::random_element());
-            element_ct add_2_big_1 = element_ct::from_witness(&composer, add_2_small_1);
-
-            typename element_ct::chain_add_accumulator add_1 = element_ct::chain_add_start(add_1_big_0, add_1_big_1);
-            typename element_ct::chain_add_accumulator add_2 = element_ct::chain_add_start(add_2_big_0, add_2_big_1);
-            acc_big.double_montgomery_ladder(add_1, add_2);
+            std::vector<typename element_ct::chain_add_accumulator> to_add;
+            for (size_t j = 0; j < i; ++j) {
+                affine_element add_1_small_0(element::random_element());
+                element_ct add_1_big_0 = element_ct::from_witness(&composer, add_1_small_0);
+                affine_element add_2_small_0(element::random_element());
+                element_ct add_2_big_0 = element_ct::from_witness(&composer, add_2_small_0);
+                typename element_ct::chain_add_accumulator add_1 =
+                    element_ct::chain_add_start(add_1_big_0, add_2_big_0);
+                to_add.emplace_back(add_1);
+            }
+            acc_big.multiple_montgomery_ladder(to_add);
         }
 
         EXPECT_VERIFICATION(composer);
@@ -890,10 +888,10 @@ HEAVY_TYPED_TEST(stdlib_biggroup, chain_add)
 
     TestFixture::test_chain_add();
 }
-HEAVY_TYPED_TEST(stdlib_biggroup, double_montgomery_ladder)
+HEAVY_TYPED_TEST(stdlib_biggroup, multiple_montgomery_ladder)
 {
 
-    TestFixture::test_double_montgomery_ladder();
+    TestFixture::test_multiple_montgomery_ladder();
 }
 
 HEAVY_TYPED_TEST(stdlib_biggroup, compute_naf)
diff --git a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_batch_mul.hpp b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_batch_mul.hpp
index c237d0dfbb..ebeb0aee5b 100644
--- a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_batch_mul.hpp
+++ b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_batch_mul.hpp
@@ -41,14 +41,11 @@ element<C, Fq, Fr, G> element<C, Fq, Fr, G>::wnaf_batch_mul(const std::vector<el
     for (size_t i = 1; i < num_rounds; ++i) {
         accumulator = accumulator.dbl();
         accumulator = accumulator.dbl();
-
-        element to_add = point_tables[0][wnaf_entries[0][i]];
-        for (size_t j = 1; j < points.size(); ++j) {
-            to_add += point_tables[j][wnaf_entries[j][i]];
+        std::vector<element> to_add;
+        for (size_t j = 0; j < points.size(); ++j) {
+            to_add.emplace_back(point_tables[j][wnaf_entries[j][i]]);
         }
-        // accumulator = accumulator.dbl();
-        // accumulator = accumulator.montgomery_ladder(to_add);
-        accumulator = accumulator.double_into_montgomery_ladder(to_add);
+        accumulator = accumulator.quadruple_and_add(to_add);
     }
 
     for (size_t i = 0; i < points.size(); ++i) {
diff --git a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_bn254.hpp b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_bn254.hpp
index 834a1b01ef..924cd0d7ec 100644
--- a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_bn254.hpp
+++ b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_bn254.hpp
@@ -117,34 +117,45 @@ element<C, Fq, Fr, G> element<C, Fq, Fr, G>::bn254_endo_batch_mul_with_generator
             return to_add;
         };
 
-        for (size_t i = 1; i < num_rounds / 2; ++i) {
+        // Perform multiple rounds of the montgomery ladder algorithm per "iteration" of our main loop.
+        // This is in order to reduce the number of field reductions required when calling `multiple_montgomery_ladder`
+        constexpr size_t num_rounds_per_iteration = 4;
 
-            auto add_1 = get_point_to_add(i * 2 - 1);
-            auto add_2 = get_point_to_add(i * 2);
+        // we require that at most one generator addition is performed per iteration
+        static_assert(num_rounds_per_iteration < 8);
 
-            // TODO update this to work if num_bits is odd
-            if ((i * 2) % 8 == 0) {
-                add_1 = element::chain_add(generator_table[generator_wnaf[(i * 2 - 8) / 8]], add_1);
-                add_1 = element::chain_add(generator_endo_table[generator_endo_wnaf[(i * 2 - 8) / 8]], add_1);
-            }
-            if (!add_1.is_element) {
-                accumulator = accumulator.double_montgomery_ladder(add_1, add_2);
-            } else {
-                accumulator = accumulator.double_montgomery_ladder(element(add_1.x3_prev, add_1.y3_prev),
-                                                                   element(add_2.x3_prev, add_2.y3_prev));
+        size_t num_iterations = num_rounds / num_rounds_per_iteration;
+        num_iterations += ((num_iterations * num_rounds_per_iteration) == num_rounds) ? 0 : 1;
+        const size_t num_rounds_per_final_iteration =
+            (num_rounds - 1) - ((num_iterations - 1) * num_rounds_per_iteration);
+
+        size_t generator_idx = 0;
+        for (size_t i = 0; i < num_iterations; ++i) {
+
+            const size_t inner_num_rounds =
+                (i != num_iterations - 1) ? num_rounds_per_iteration : num_rounds_per_final_iteration;
+            std::vector<element::chain_add_accumulator> to_add;
+
+            for (size_t j = 0; j < inner_num_rounds; ++j) {
+                to_add.emplace_back(get_point_to_add(i * num_rounds_per_iteration + j + 1));
             }
-        }
 
-        if ((num_rounds & 0x01ULL) == 0x00ULL) {
-            auto add_1 = get_point_to_add(num_rounds - 1);
-            add_1 = element::chain_add(generator_table[generator_wnaf[generator_wnaf.size() - 2]], add_1);
-            add_1 = element::chain_add(generator_endo_table[generator_endo_wnaf[generator_wnaf.size() - 2]], add_1);
-            if (add_1.is_element) {
-                element temp(add_1.x3_prev, add_1.y3_prev);
-                accumulator = accumulator.montgomery_ladder(temp);
-            } else {
-                accumulator = accumulator.montgomery_ladder(add_1);
+            bool add_generator_this_round = false;
+            size_t add_idx = 0;
+            for (size_t j = 0; j < inner_num_rounds; ++j) {
+                add_generator_this_round = ((i * num_rounds_per_iteration + j) % 8) == 6;
+                if (add_generator_this_round) {
+                    add_idx = j;
+                    break;
+                }
+            }
+            if (add_generator_this_round) {
+                to_add[add_idx] = element::chain_add(generator_table[generator_wnaf[generator_idx]], to_add[add_idx]);
+                to_add[add_idx] =
+                    element::chain_add(generator_endo_table[generator_endo_wnaf[generator_idx]], to_add[add_idx]);
+                generator_idx++;
             }
+            accumulator = accumulator.multiple_montgomery_ladder(to_add);
         }
 
         for (size_t i = 0; i < small_points.size(); ++i) {
@@ -333,12 +344,12 @@ element<C, Fq, Fr, G> element<C, Fq, Fr, G>::bn254_endo_batch_mul(const std::vec
      * 1. Extract NAF value for bit `2*i - 1` for each scalar multiplier and store in `nafs` vector.
      * 2. Use `nafs` vector to derive the point that we need (`add_1`) to add into our accumulator.
      * 3. Repeat the above 2 steps but for bit `2 * i` (`add_2`)
-     * 4. Compute `accumulator = 4 * accumulator + 2 * add_1 + add_2` using `double_montgomery_ladder` method
+     * 4. Compute `accumulator = 4 * accumulator + 2 * add_1 + add_2` using `multiple_montgomery_ladder` method
      *
      * The purpose of the above is to minimize the number of required range checks (vs a simple double and add algo).
      *
-     * When computing two iterations of the montgomery ladder algorithm, we can neglect computing the y-coordinate of
-     *the 1st ladder output. See `double_montgomery_ladder` for more details.
+     * When computing repeated iterations of the montgomery ladder algorithm, we can neglect computing the y-coordinate
+     * of each ladder output. See `multiple_montgomery_ladder` for more details.
      **/
     for (size_t i = 1; i < num_rounds / 2; ++i) {
         // `nafs` tracks the naf value for each point for the current round
@@ -365,14 +376,8 @@ element<C, Fq, Fr, G> element<C, Fq, Fr, G>::bn254_endo_batch_mul(const std::vec
         }
         element::chain_add_accumulator add_2 = point_table.get_chain_add_accumulator(nafs);
 
-        // Perform the double montgomery ladder. We need to convert our chain_add_accumulator types into regular
-        // elements if the accumuator does not contain a y-coordinate
-        if (!add_1.is_element) {
-            accumulator = accumulator.double_montgomery_ladder(add_1, add_2);
-        } else {
-            accumulator = accumulator.double_montgomery_ladder(element(add_1.x3_prev, add_1.y3_prev),
-                                                               element(add_2.x3_prev, add_2.y3_prev));
-        }
+        // Perform the double montgomery ladder.
+        accumulator = accumulator.multiple_montgomery_ladder({ add_1, add_2 });
     }
 
     // we need to iterate 1 more time if the number of rounds is even
@@ -382,12 +387,7 @@ element<C, Fq, Fr, G> element<C, Fq, Fr, G>::bn254_endo_batch_mul(const std::vec
             nafs.emplace_back(naf_entries[j][num_rounds - 1]);
         }
         element::chain_add_accumulator add_1 = point_table.get_chain_add_accumulator(nafs);
-        if (add_1.is_element) {
-            element temp(add_1.x3_prev, add_1.y3_prev);
-            accumulator = accumulator.montgomery_ladder(temp);
-        } else {
-            accumulator = accumulator.montgomery_ladder(add_1);
-        }
+        accumulator = accumulator.multiple_montgomery_ladder({ add_1 });
     }
 
     /**
diff --git a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_impl.hpp b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_impl.hpp
index 78d6f37855..bff1805db3 100644
--- a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_impl.hpp
+++ b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_impl.hpp
@@ -80,6 +80,39 @@ element<C, Fq, Fr, G> element<C, Fq, Fr, G>::operator-(const element& other) con
 
     return element(x_3, y_3);
 }
+
+/**
+ * @brief Compute (*this) + other AND (*this) - other as a size-2 array
+ *
+ * @details We require this operation when computing biggroup lookup tables for
+ *          multi-scalar-multiplication. This combined method reduces the number of field
+ *          additions and subtractions required (and needs one fewer assert_is_not_equal check).
+ *
+ * @tparam C
+ * @tparam Fq
+ * @tparam Fr
+ * @tparam G
+ * @param other
+ * @return std::array<element<C, Fq, Fr, G>, 2>
+ */
+template <typename C, class Fq, class Fr, class G>
+std::array<element<C, Fq, Fr, G>, 2> element<C, Fq, Fr, G>::add_sub(const element& other) const
+{
+    other.x.assert_is_not_equal(x); // both divisions below use div_without_denominator_check on (other.x - x)
+    // `denominator` is shared by both slopes; `x2x1` = -(other.x + x) is passed to both sqradd calls.
+    const Fq denominator = other.x - x;
+    const Fq x2x1 = -(other.x + x);
+    // lambda1/lambda2 differ only in the sign of other.y (doc comment: outputs are this + other, this - other).
+    const Fq lambda1 = Fq::div_without_denominator_check({ other.y, -y }, denominator);
+    const Fq x_3 = lambda1.sqradd({ x2x1 });      // presumably lambda1^2 - other.x - x -- TODO confirm sqradd
+    const Fq y_3 = lambda1.madd(x - x_3, { -y }); // presumably lambda1 * (x - x_3) - y -- TODO confirm madd
+    const Fq lambda2 = Fq::div_without_denominator_check({ -other.y, -y }, denominator);
+    const Fq x_4 = lambda2.sqradd({ x2x1 });      // same form as x_3, using lambda2
+    const Fq y_4 = lambda2.madd(x - x_4, { -y }); // same form as y_3, using lambda2 and x_4
+    // Presumably [0] = (*this) + other (from lambda1) and [1] = (*this) - other (from lambda2) -- TODO confirm.
+    return { element(x_3, y_3), element(x_4, y_4) };
+}
+
 template <typename C, class Fq, class Fr, class G> element<C, Fq, Fr, G> element<C, Fq, Fr, G>::dbl() const
 {
     Fq two_x = x + x;
@@ -294,200 +327,217 @@ element<C, Fq, Fr, G> element<C, Fq, Fr, G>::montgomery_ladder(const chain_add_a
 }
 
 /**
- * Compute (4 * (*this)) + (2 * add1) + add2
- * If we chain two iterations of the montgomery ladder together, we can squeeze out a non-native field reduction.
+ * @brief Compute 4.P + to_add[0] + ... + to_add[to_add.size() - 1]
  *
- * Total number of field reductions = 9
+ * @details Used in wnaf_batch_mul method. Combining operations requires fewer bigfield reductions.
  *
- * Two calls to mont ladder woud require 10
+ * Method computes R = ((2P + A[0]) + 2P) + A[1] + A[2] + ... + A[n-1]
  *
- * Using doublings and additions would require 12!
- **/
+ * @tparam C
+ * @tparam Fq
+ * @tparam Fr
+ * @tparam G
+ * @param to_add
+ * @return element<C, Fq, Fr, G>
+ */
 template <typename C, class Fq, class Fr, class G>
-element<C, Fq, Fr, G> element<C, Fq, Fr, G>::double_montgomery_ladder(const element& add1, const element& add2) const
+element<C, Fq, Fr, G> element<C, Fq, Fr, G>::quadruple_and_add(const std::vector<element>& to_add) const
 {
-    add1.x.assert_is_not_equal(x);
-    const Fq lambda_1 = Fq::div_without_denominator_check({ add1.y, -y }, (add1.x - x));
-
-    const Fq x_3 = lambda_1.sqradd({ -add1.x, -x });
-
-    const Fq minus_lambda_2 =
-        lambda_1 + Fq::div_without_denominator_check({ y + y }, (x_3 - x)); // (y + y) / (x_3 - x);
-
-    const Fq x_4 = minus_lambda_2.sqradd({ -x, -x_3 });
-
-    // We can avoid computing y_4, instead substituting the expression `minus_lambda_2 * (x_4 - x) - y` where needed.
-    // This is cheaper, because we can evaluate two field multiplications (or a field multiplication + a field division)
-    // with only one non-native field reduction.
-    // E.g. evaluating (a * b) + (c * d) = e mod p only requires 1 quotient and remainder.
-    // Defining the quotient and remainder elements is the major cost of a non-native field multiplication
-    // because each requires ~256 bits of range checks
-    const Fq x_sub_x4 = x - x_4;
-
-    const Fq x4_sub_add2x = x_4 - add2.x;
+    const Fq two_x = x + x;
+    Fq x_1;
+    Fq minus_lambda_dbl;
+    if constexpr (G::has_a) {
+        Fq a(get_context(), uint256_t(G::curve_a));
+        minus_lambda_dbl = Fq::msub_div({ x }, { (two_x + x) }, (y + y), { a });
+        x_1 = minus_lambda_dbl.sqradd({ -(two_x) });
+    } else {
+        minus_lambda_dbl = Fq::msub_div({ x }, { (two_x + x) }, (y + y), {});
+        x_1 = minus_lambda_dbl.sqradd({ -(two_x) });
+    }
 
-    // msub_div; 'compute a multiplication and a division and multiply the two together. Requires only 1 non native
-    // field reduction`
-    const Fq lambda_3 = Fq::msub_div({ minus_lambda_2 }, { (x_sub_x4) }, (x4_sub_add2x), { y, add2.y });
+    ASSERT(to_add.size() > 0);
+    to_add[0].x.assert_is_not_equal(x_1);
 
-    // validate we can use incomplete addition formulae
-    x_4.assert_is_not_equal(add2.x);
+    const Fq x_minus_x_1 = x - x_1;
 
-    const Fq x_5 = lambda_3.sqradd({ -x_4, -add2.x });
-    const Fq x5_sub_x4 = x_5 - x_4;
+    const Fq lambda_1 = Fq::msub_div({ minus_lambda_dbl }, { x_minus_x_1 }, (x_1 - to_add[0].x), { to_add[0].y, y });
 
-    const Fq half_minus_lambda_4_minus_lambda_3 = Fq::msub_div({ minus_lambda_2 }, { x_sub_x4 }, (x5_sub_x4), { y });
+    const Fq x_3 = lambda_1.sqradd({ -to_add[0].x, -x_1 });
 
-    const Fq minus_lambda_4_minus_lambda_3 = half_minus_lambda_4_minus_lambda_3 + half_minus_lambda_4_minus_lambda_3;
-    const Fq minus_lambda_4 = minus_lambda_4_minus_lambda_3 + lambda_3;
-    const Fq x_6 = minus_lambda_4.sqradd({ -x_4, -x_5 });
+    const Fq half_minus_lambda_2_minus_lambda_1 =
+        Fq::msub_div({ minus_lambda_dbl }, { x_minus_x_1 }, (x_3 - x_1), { y });
 
-    const Fq x6_sub_x4 = x_6 - x_4;
+    const Fq minus_lambda_2_minus_lambda_1 = half_minus_lambda_2_minus_lambda_1 + half_minus_lambda_2_minus_lambda_1;
+    const Fq minus_lambda_2 = minus_lambda_2_minus_lambda_1 + lambda_1;
 
-    // y_6 = -L_4 * (x_6 - x_4) - L_2 * (x - x_4) + y
-    const Fq y_6 = Fq::dual_madd(minus_lambda_4, (x6_sub_x4), minus_lambda_2, x_sub_x4, { y });
+    const Fq x_4 = minus_lambda_2.sqradd({ -x_1, -x_3 });
 
-    return element(x_6, y_6);
-}
+    const Fq x_4_sub_x_1 = x_4 - x_1;
 
-/**
- * If we chain two iterations of the montgomery ladder together, we can squeeze out a non-native field reduction
- *
- **/
-template <typename C, class Fq, class Fr, class G>
-element<C, Fq, Fr, G> element<C, Fq, Fr, G>::double_montgomery_ladder(const chain_add_accumulator& add1,
-                                                                      const element& add2) const
-{
-    if (add1.is_element) {
-        throw_or_abort("An accumulator expected");
+    if (to_add.size() == 1) {
+        const Fq y_4 = Fq::dual_madd(minus_lambda_2, x_4_sub_x_1, minus_lambda_dbl, x_minus_x_1, { y });
+        return element(x_4, y_4);
     }
-    add1.x3_prev.assert_is_not_equal(x);
-    Fq lambda_1 = Fq::msub_div(
-        { add1.lambda_prev }, { (add1.x1_prev - add1.x3_prev) }, (x - add1.x3_prev), { -add1.y1_prev, -y });
-
-    const Fq x_3 = lambda_1.sqradd({ -add1.x3_prev, -x });
-
-    const Fq minus_lambda_2 =
-        lambda_1 + Fq::div_without_denominator_check({ y + y }, (x_3 - x)); // (y + y) / (x_3 - x);
-
-    const Fq x_4 = minus_lambda_2.sqradd({ -x, -x_3 });
-
-    // We can avoid computing y_4, instead substituting the expression `minus_lambda_2 * (x_4 - x) - y` where needed.
-    // This is cheaper, because we can evaluate two field multiplications (or a field multiplication + a field division)
-    // with only one non-native field reduction.
-    // E.g. evaluating (a * b) + (c * d) = e mod p only requires 1 quotient and remainder, which is the major cost
-    // of a non-native field multiplication
-    const Fq x_sub_x4 = x - x_4;
+    to_add[1].x.assert_is_not_equal(to_add[0].x);
 
-    const Fq x4_sub_add2x = x_4 - add2.x;
-    const Fq lambda_3 = Fq::msub_div({ minus_lambda_2 }, { (x_sub_x4) }, (x4_sub_add2x), { y, add2.y });
+    Fq minus_lambda_3 = Fq::msub_div(
+        { minus_lambda_dbl, minus_lambda_2 }, { x_minus_x_1, x_4_sub_x_1 }, (x_4 - to_add[1].x), { y, -(to_add[1].y) });
 
-    x_4.assert_is_not_equal(add2.x);
+    // X5 = L3.L3 - X4 - XB
+    const Fq x_5 = minus_lambda_3.sqradd({ -x_4, -to_add[1].x });
 
-    const Fq x_5 = lambda_3.sqradd({ -x_4, -add2.x });
-    const Fq x5_sub_x4 = x_5 - x_4;
+    if (to_add.size() == 2) {
+        // Y5 = L3.(XB - X5) - YB
+        const Fq y_5 = minus_lambda_3.madd(x_5 - to_add[1].x, { -to_add[1].y });
+        return element(x_5, y_5);
+    }
 
-    const Fq half_minus_lambda_4_minus_lambda_3 = Fq::msub_div({ minus_lambda_2 }, { x_sub_x4 }, (x5_sub_x4), { y });
+    Fq x_prev = x_5;
+    Fq minus_lambda_prev = minus_lambda_3;
 
-    const Fq minus_lambda_4_minus_lambda_3 = half_minus_lambda_4_minus_lambda_3 + half_minus_lambda_4_minus_lambda_3;
-    const Fq minus_lambda_4 = minus_lambda_4_minus_lambda_3 + lambda_3;
-    const Fq x_6 = minus_lambda_4.sqradd({ -x_4, -x_5 });
+    for (size_t i = 2; i < to_add.size(); ++i) {
 
-    const Fq x6_sub_x4 = x_6 - x_4;
+        to_add[i].x.assert_is_not_equal(to_add[i - 1].x);
+        // Lambda = (Yprev - Yadd[i]) / (Xprev - Xadd[i])
+        //        = (-Lprev.(Xprev - Xadd[i-1]) - Yadd[i - 1] - Yadd[i]) / (Xprev - Xadd[i])
+        const Fq minus_lambda = Fq::msub_div({ minus_lambda_prev },
+                                             { to_add[i - 1].x - x_prev },
+                                             (to_add[i].x - x_prev),
+                                             { to_add[i - 1].y, to_add[i].y });
+        // X = Lambda * Lambda - Xprev - Xadd[i]
+        const Fq x_out = minus_lambda.sqradd({ -x_prev, -to_add[i].x });
 
-    const Fq y_6 = Fq::dual_madd(minus_lambda_4, (x6_sub_x4), minus_lambda_2, x_sub_x4, { y });
+        x_prev = x_out;
+        minus_lambda_prev = minus_lambda;
+    }
+    const Fq y_out = minus_lambda_prev.madd(x_prev - to_add[to_add.size() - 1].x, { -to_add[to_add.size() - 1].y });
 
-    return element(x_6, y_6);
+    return element(x_prev, y_out);
 }
 
 /**
- * If we chain two iterations of the montgomery ladder together, we can squeeze out a non-native field reduction
+ * @brief Perform repeated iterations of the montgomery ladder algorithm.
  *
- **/
+ * For points P, Q, montgomery ladder computes R = (P + Q) + P
+ * i.e. it's "double-and-add" without explicit doublings.
+ *
+ * This method can apply repeated iterations of the montgomery ladder.
+ * Each iteration reduces the number of field multiplications by 1, at the cost of more additions.
+ * (i.e. we don't compute intermediate y-coordinates).
+ *
+ * The number of additions scales with the size of the input vector. The optimal input size appears to be 4.
+ *
+ * @tparam C
+ * @tparam Fq
+ * @tparam Fr
+ * @tparam G
+ * @param add
+ * @return element<C, Fq, Fr, G>
+ */
 template <typename C, class Fq, class Fr, class G>
-element<C, Fq, Fr, G> element<C, Fq, Fr, G>::double_montgomery_ladder(const chain_add_accumulator& add1,
-                                                                      const chain_add_accumulator& add2) const
+element<C, Fq, Fr, G> element<C, Fq, Fr, G>::multiple_montgomery_ladder(
+    const std::vector<chain_add_accumulator>& add) const
 {
-    if ((add1.is_element) || (add2.is_element)) {
-        throw_or_abort("An accumulator expected");
-    }
-    add1.x3_prev.assert_is_not_equal(x);
-    // add1.y = lambda_prev * (x1_prev - x3_prev) - y1_prev
-    Fq lambda_1 = Fq::msub_div(
-        { add1.lambda_prev }, { (add1.x1_prev - add1.x3_prev) }, (x - add1.x3_prev), { -add1.y1_prev, -y });
-
-    const Fq x_3 = lambda_1.sqradd({ -add1.x3_prev, -x });
-
-    const Fq minus_lambda_2 =
-        lambda_1 + Fq::div_without_denominator_check({ y + y }, (x_3 - x)); // (y + y) / (x_3 - x);
-
-    const Fq x_4 = minus_lambda_2.sqradd({ -x, -x_3 });
-
-    // We can avoid computing y_4, instead substituting the expression `minus_lambda_2 * (x_4 - x) - y` where needed.
-    // This is cheaper, because we can evaluate two field multiplications (or a field multiplication + a field division)
-    // with only one non-native field reduction.
-    // E.g. evaluating (a * b) + (c * d) = e mod p only requires 1 quotient and remainder, which is the major cost
-    // of a non-native field multiplication
-    const Fq x_sub_x4 = x - x_4;
-
-    const Fq x4_sub_add2x = x_4 - add2.x3_prev;
-
-    const Fq lambda_3 = Fq::msub_div({ minus_lambda_2, add2.lambda_prev },
-                                     { (x_sub_x4), (add2.x1_prev - add2.x3_prev) },
-                                     (x4_sub_add2x),
-                                     { y, -add2.y1_prev });
-
-    x_4.assert_is_not_equal(add2.x3_prev);
-
-    const Fq x_5 = lambda_3.sqradd({ -x_4, -add2.x3_prev });
-    const Fq x5_sub_x4 = x_5 - x_4;
-
-    const Fq half_minus_lambda_4_minus_lambda_3 = Fq::msub_div({ minus_lambda_2 }, { x_sub_x4 }, (x5_sub_x4), { y });
+    struct composite_y {
+        std::vector<Fq> mul_left;
+        std::vector<Fq> mul_right;
+        std::vector<Fq> add;
+        bool is_negative = false;
+    };
+
+    Fq previous_x = x;
+    composite_y previous_y{ std::vector<Fq>(), std::vector<Fq>(), std::vector<Fq>(), false };
+    for (size_t i = 0; i < add.size(); ++i) {
+        previous_x.assert_is_not_equal(add[i].x3_prev);
+
+        // composite_y add_y;
+        bool negate_add_y = (i > 0) && !previous_y.is_negative;
+        std::vector<Fq> lambda1_left;
+        std::vector<Fq> lambda1_right;
+        std::vector<Fq> lambda1_add;
+
+        if (i == 0) {
+            lambda1_add.emplace_back(-y);
+        } else {
+            lambda1_left = previous_y.mul_left;
+            lambda1_right = previous_y.mul_right;
+            lambda1_add = previous_y.add;
+        }
 
-    const Fq minus_lambda_4_minus_lambda_3 = half_minus_lambda_4_minus_lambda_3 + half_minus_lambda_4_minus_lambda_3;
-    const Fq minus_lambda_4 = minus_lambda_4_minus_lambda_3 + lambda_3;
-    const Fq x_6 = minus_lambda_4.sqradd({ -x_4, -x_5 });
+        if (!add[i].is_element) {
+            lambda1_left.emplace_back(add[i].lambda_prev);
+            lambda1_right.emplace_back(negate_add_y ? add[i].x3_prev - add[i].x1_prev
+                                                    : add[i].x1_prev - add[i].x3_prev);
+            lambda1_add.emplace_back(negate_add_y ? add[i].y1_prev : -add[i].y1_prev);
+        } else if (i > 0) {
+            lambda1_add.emplace_back(negate_add_y ? -add[i].y3_prev : add[i].y3_prev);
+        }
+        // if previous_y is negated then add stays positive
+        // if previous_y is positive then add stays negated
+        // | add.y is negated | previous_y is negated | output of msub_div is -lambda |
+        // | --- | --- | --- |
+        // | no  | yes | yes |
+        // | yes | no  | no  |
+
+        Fq lambda1;
+        if (!add[i].is_element || i > 0) {
+            bool flip_lambda1_denominator = !negate_add_y;
+            Fq denominator = flip_lambda1_denominator ? previous_x - add[i].x3_prev : add[i].x3_prev - previous_x;
+            lambda1 = Fq::msub_div(lambda1_left, lambda1_right, denominator, lambda1_add);
+        } else {
+            lambda1 = Fq::div_without_denominator_check({ add[i].y3_prev - y }, (add[i].x3_prev - x));
+        }
 
-    const Fq x6_sub_x4 = x_6 - x_4;
+        Fq x_3 = lambda1.madd(lambda1, { -add[i].x3_prev, -previous_x });
 
-    const Fq y_6 = Fq::dual_madd(minus_lambda_4, (x6_sub_x4), minus_lambda_2, x_sub_x4, { y });
+        // We can avoid computing y_4, instead substituting the expression `minus_lambda_2 * (x_4 - x) - y` where
+        // needed. This is cheaper, because we can evaluate two field multiplications (or a field multiplication + a
+        // field division) with only one non-native field reduction. E.g. evaluating (a * b) + (c * d) = e mod p only
+        // requires 1 quotient and remainder, which is the major cost of a non-native field multiplication
+        Fq lambda2;
+        if (i == 0) {
+            lambda2 = Fq::div_without_denominator_check({ y + y }, (previous_x - x_3)) - lambda1;
+        } else {
+            Fq l2_denominator = previous_y.is_negative ? previous_x - x_3 : x_3 - previous_x;
+            Fq partial_lambda2 =
+                Fq::msub_div(previous_y.mul_left, previous_y.mul_right, l2_denominator, previous_y.add);
+            partial_lambda2 = partial_lambda2 + partial_lambda2;
+            lambda2 = partial_lambda2 - lambda1;
+        }
 
-    return element(x_6, y_6);
-}
-/**
- * If we chain two iterations of the montgomery ladder together, we can squeeze out a non-native field reduction
- **/
-template <typename C, class Fq, class Fr, class G>
-element<C, Fq, Fr, G> element<C, Fq, Fr, G>::double_into_montgomery_ladder(const element& add1) const
-{
-    const Fq two_x = x + x;
-    Fq x_1;
-    Fq minus_lambda_dbl;
-    if constexpr (G::has_a) {
-        Fq a(get_context(), uint256_t(G::curve_a));
-        minus_lambda_dbl = Fq::msub_div({ x }, { (two_x + x) }, (y + y), { a });
-        x_1 = minus_lambda_dbl.sqradd({ -(two_x) });
-    } else {
-        minus_lambda_dbl = Fq::msub_div({ x }, { (two_x + x) }, (y + y), {});
-        x_1 = minus_lambda_dbl.sqradd({ -(two_x) });
+        Fq x_4 = lambda2.sqradd({ -x_3, -previous_x });
+        composite_y y_4;
+        if (i == 0) {
+            // We want to make sure that at the final iteration, `previous_y.is_negative = false`.
+            // Each iteration flips the value of previous_y.is_negative.
+            // i.e. whether we store y_4 or -y_4 depends on the number of points we have
+            bool num_points_even = ((add.size() & 0x01UL) == 0);
+            y_4.add.emplace_back(num_points_even ? y : -y);
+            y_4.mul_left.emplace_back(lambda2);
+            y_4.mul_right.emplace_back(num_points_even ? x_4 - previous_x : previous_x - x_4);
+            y_4.is_negative = num_points_even;
+        } else {
+            y_4.is_negative = !previous_y.is_negative;
+            y_4.mul_left.emplace_back(lambda2);
+            y_4.mul_right.emplace_back(previous_y.is_negative ? previous_x - x_4 : x_4 - previous_x);
+            // append terms in previous_y to y_4. We want to make sure the terms above are added into the start of y_4.
+            // This is to ensure they are cached correctly when
+            // `composer::evaluate_partial_non_native_field_multiplication` is called.
+            // (the 1st mul_left, mul_right elements will trigger composer::evaluate_non_native_field_multiplication
+            //  when Fq::mult_madd is called - this term cannot be cached so we want to make sure it is unique)
+            std::copy(previous_y.mul_left.begin(), previous_y.mul_left.end(), std::back_inserter(y_4.mul_left));
+            std::copy(previous_y.mul_right.begin(), previous_y.mul_right.end(), std::back_inserter(y_4.mul_right));
+            std::copy(previous_y.add.begin(), previous_y.add.end(), std::back_inserter(y_4.add));
+        }
+        previous_x = x_4;
+        previous_y = y_4;
     }
+    Fq x_out = previous_x;
 
-    add1.x.assert_is_not_equal(x_1);
-
-    const Fq x_minus_x_1 = x - x_1;
-    const Fq lambda_1 = Fq::msub_div({ minus_lambda_dbl }, { x_minus_x_1 }, (x_1 - add1.x), { add1.y, y });
-
-    const Fq x_3 = lambda_1.sqradd({ -add1.x, -x_1 });
-    const Fq half_minus_lambda_2_minus_lambda_1 =
-        Fq::msub_div({ minus_lambda_dbl }, { x_minus_x_1 }, (x_3 - x_1), { y });
-    const Fq minus_lambda_2_minus_lambda_1 = half_minus_lambda_2_minus_lambda_1 + half_minus_lambda_2_minus_lambda_1;
-    const Fq minus_lambda_2 = minus_lambda_2_minus_lambda_1 + lambda_1;
-
-    const Fq x_4 = minus_lambda_2.sqradd({ -x_1, -x_3 });
-
-    const Fq y_4 = Fq::dual_madd(minus_lambda_2, (x_4 - x_1), minus_lambda_dbl, x_minus_x_1, { y });
+    ASSERT(!previous_y.is_negative);
 
-    return element(x_4, y_4);
+    Fq y_out = Fq::mult_madd(previous_y.mul_left, previous_y.mul_right, previous_y.add);
+    return element(x_out, y_out);
 }
 
 /**
@@ -551,6 +601,7 @@ element<C, Fq, Fr, G> element<C, Fq, Fr, G>::batch_mul(const std::vector<element
                                                        const std::vector<Fr>& scalars,
                                                        const size_t max_num_bits)
 {
+
     const size_t num_points = points.size();
     ASSERT(scalars.size() == num_points);
     batch_lookup_table point_table(points);
@@ -563,38 +614,25 @@ element<C, Fq, Fr, G> element<C, Fq, Fr, G>::batch_mul(const std::vector<element
     const auto offset_generators = compute_offset_generators(num_rounds);
     element accumulator =
         element::chain_add_end(element::chain_add(offset_generators.first, point_table.get_chain_initial_entry()));
-    for (size_t i = 1; i < num_rounds / 2; ++i) {
-        std::vector<bool_t<C>> nafs;
-        for (size_t j = 0; j < num_points; ++j) {
-            nafs.emplace_back(naf_entries[j][i * 2 - 1]);
-        }
-        element::chain_add_accumulator add_1 = point_table.get_chain_add_accumulator(nafs);
-        for (size_t j = 0; j < num_points; ++j) {
-            nafs[j] = (naf_entries[j][i * 2]);
-        }
-        element::chain_add_accumulator add_2 = point_table.get_chain_add_accumulator(nafs);
 
-        if (!add_1.is_element) {
-            accumulator = accumulator.double_montgomery_ladder(add_1, add_2);
-        } else {
-            accumulator = accumulator.double_montgomery_ladder(element(add_1.x3_prev, add_1.y3_prev),
-                                                               element(add_2.x3_prev, add_2.y3_prev));
+    constexpr size_t num_rounds_per_iteration = 4;
+    size_t num_iterations = num_rounds / num_rounds_per_iteration;
+    num_iterations += ((num_iterations * num_rounds_per_iteration) == num_rounds) ? 0 : 1;
+    const size_t num_rounds_per_final_iteration = (num_rounds - 1) - ((num_iterations - 1) * num_rounds_per_iteration);
+    for (size_t i = 0; i < num_iterations; ++i) {
+
+        std::vector<bool_t<C>> nafs(num_points);
+        std::vector<element::chain_add_accumulator> to_add;
+        const size_t inner_num_rounds =
+            (i != num_iterations - 1) ? num_rounds_per_iteration : num_rounds_per_final_iteration;
+        for (size_t j = 0; j < inner_num_rounds; ++j) {
+            for (size_t k = 0; k < num_points; ++k) {
+                nafs[k] = (naf_entries[k][i * num_rounds_per_iteration + j + 1]);
+            }
+            to_add.emplace_back(point_table.get_chain_add_accumulator(nafs));
         }
+        accumulator = accumulator.multiple_montgomery_ladder(to_add);
     }
-    if ((num_rounds & 0x01ULL) == 0x00ULL) {
-        std::vector<bool_t<C>> nafs;
-        for (size_t j = 0; j < points.size(); ++j) {
-            nafs.emplace_back(naf_entries[j][num_rounds - 1]);
-        }
-        element::chain_add_accumulator add_1 = point_table.get_chain_add_accumulator(nafs);
-        if (add_1.is_element) {
-            element temp(add_1.x3_prev, add_1.y3_prev);
-            accumulator = accumulator.montgomery_ladder(temp);
-        } else {
-            accumulator = accumulator.montgomery_ladder(add_1);
-        }
-    }
-
     for (size_t i = 0; i < num_points; ++i) {
         element skew = accumulator - points[i];
         Fq out_x = accumulator.x.conditional_select(skew.x, naf_entries[i][num_rounds]);
diff --git a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_nafs.hpp b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_nafs.hpp
index 63257d5687..49e43ba6e9 100644
--- a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_nafs.hpp
+++ b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_nafs.hpp
@@ -243,9 +243,15 @@ typename element<C, Fq, Fr, G>::secp256k1_wnaf_pair element<C, Fq, Fr, G>::compu
         // Compute and constrain skews
         field_t<C> negative_skew = witness_t<C>(ctx, is_negative ? 0 : skew);
         field_t<C> positive_skew = witness_t<C>(ctx, is_negative ? skew : 0);
-        negative_skew.create_range_constraint(1);
-        positive_skew.create_range_constraint(1);
-        (negative_skew + positive_skew).create_range_constraint(1);
+        if constexpr (C::type == ComposerType::PLOOKUP) {
+            ctx->create_new_range_constraint(negative_skew.witness_index, 1, "biggroup_nafs");
+            ctx->create_new_range_constraint(positive_skew.witness_index, 1, "biggroup_nafs");
+            ctx->create_new_range_constraint((negative_skew + positive_skew).witness_index, 1, "biggroup_nafs");
+        } else {
+            ctx->create_range_constraint(negative_skew.witness_index, 1, "biggroup_nafs");
+            ctx->create_range_constraint(positive_skew.witness_index, 1, "biggroup_nafs");
+            ctx->create_range_constraint((negative_skew + positive_skew).witness_index, 1, "biggroup_nafs");
+        }
 
         const auto reconstruct_bigfield_from_wnaf = [ctx](const std::vector<field_t<C>>& wnaf,
                                                           const field_t<C>& positive_skew,
@@ -378,14 +384,21 @@ std::vector<field_t<C>> element<C, Fq, Fr, G>::compute_wnaf(const Fr& scalar)
             offset_entry = (1ULL << (WNAF_SIZE - 1)) - 1 - (wnaf_values[i] & 0xffffff);
         }
         field_t<C> entry(witness_t<C>(ctx, offset_entry));
-
-        entry.create_range_constraint(WNAF_SIZE);
+        if constexpr (C::type == ComposerType::PLOOKUP) {
+            ctx->create_new_range_constraint(entry.witness_index, 1ULL << (WNAF_SIZE), "biggroup_nafs");
+        } else {
+            ctx->create_range_constraint(entry.witness_index, WNAF_SIZE, "biggroup_nafs");
+        }
         wnaf_entries.emplace_back(entry);
     }
 
     // add skew
     wnaf_entries.emplace_back(witness_t<C>(ctx, skew));
-    wnaf_entries[wnaf_entries.size() - 1].create_range_constraint(1);
+    if constexpr (C::type == ComposerType::PLOOKUP) {
+        ctx->create_new_range_constraint(wnaf_entries[wnaf_entries.size() - 1].witness_index, 1, "biggroup_nafs");
+    } else {
+        ctx->create_range_constraint(wnaf_entries[wnaf_entries.size() - 1].witness_index, 1, "biggroup_nafs");
+    }
 
     // TODO: VALIDATE SUM DOES NOT OVERFLOW P
 
@@ -494,15 +507,25 @@ std::vector<bool_t<C>> element<C, Fq, Fr, G>::compute_naf(const Fr& scalar, cons
             bit.context = ctx;
             bit.witness_index = witness_t<C>(ctx, true).witness_index; // flip sign
             bit.witness_bool = true;
-            ctx->create_range_constraint(
-                bit.witness_index, 1, "biggroup_nafs: compute_naf extracted too many bits in non-next_entry case");
+            if constexpr (C::type == ComposerType::PLOOKUP) {
+                ctx->create_new_range_constraint(
+                    bit.witness_index, 1, "biggroup_nafs: compute_naf extracted too many bits in non-next_entry case");
+            } else {
+                ctx->create_range_constraint(
+                    bit.witness_index, 1, "biggroup_nafs: compute_naf extracted too many bits in non-next_entry case");
+            }
             naf_entries[num_rounds - i - 1] = bit;
         } else {
             bool_t<C> bit(ctx, false);
             bit.witness_index = witness_t<C>(ctx, false).witness_index; // don't flip sign
             bit.witness_bool = false;
-            ctx->create_range_constraint(
-                bit.witness_index, 1, "biggroup_nafs: compute_naf extracted too many bits in next_entry case");
+            if constexpr (C::type == ComposerType::PLOOKUP) {
+                ctx->create_new_range_constraint(
+                    bit.witness_index, 1, "biggroup_nafs: compute_naf extracted too many bits in next_entry case");
+            } else {
+                ctx->create_range_constraint(
+                    bit.witness_index, 1, "biggroup_nafs: compute_naf extracted too many bits in next_entry case");
+            }
             naf_entries[num_rounds - i - 1] = bit;
         }
     }
diff --git a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_secp256k1.hpp b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_secp256k1.hpp
index 7d510e2794..756f955a7a 100644
--- a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_secp256k1.hpp
+++ b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_secp256k1.hpp
@@ -86,15 +86,18 @@ element<C, Fq, Fr, G> element<C, Fq, Fr, G>::secp256k1_ecdsa_mul(const element&
         // See `stdlib/memory/rom_table.hpp` for how indirect array accesses are implemented in UltraPlonk
         const auto& add_1 = endoP2_table[u2_hi_wnaf.wnaf[2 * i]];
         const auto& add_2 = P2_table[u2_lo_wnaf.wnaf[2 * i + 1]];
-        accumulator = accumulator.double_montgomery_ladder(add_1, add_2);
-
         const auto& add_3 = endoP1_table[u1_hi_wnaf.wnaf[i]];
         const auto& add_4 = P1_table[u1_lo_wnaf.wnaf[i]];
-        accumulator = accumulator.double_montgomery_ladder(add_3, add_4);
-
         const auto& add_5 = endoP2_table[u2_hi_wnaf.wnaf[2 * i + 1]];
         const auto& add_6 = P2_table[u2_lo_wnaf.wnaf[2 * i + 2]];
-        accumulator = accumulator.double_montgomery_ladder(add_5, add_6);
+
+        accumulator = accumulator.multiple_montgomery_ladder({ element::chain_add_accumulator(add_1),
+                                                               element::chain_add_accumulator(add_2),
+                                                               element::chain_add_accumulator(add_3) });
+
+        accumulator = accumulator.multiple_montgomery_ladder({ element::chain_add_accumulator(add_4),
+                                                               element::chain_add_accumulator(add_5),
+                                                               element::chain_add_accumulator(add_6) });
     }
 
     /**
diff --git a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_tables.hpp b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_tables.hpp
index 9098799a7b..7f31017f67 100644
--- a/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_tables.hpp
+++ b/cpp/src/barretenberg/stdlib/primitives/biggroup/biggroup_tables.hpp
@@ -4,10 +4,27 @@ namespace proof_system::plonk {
 namespace stdlib {
 
 using plookup::MultiTableId;
+
+/**
+ * @brief Constructs a ROM table to look up linear combinations of group elements
+ *
+ * @tparam C
+ * @tparam Fq
+ * @tparam Fr
+ * @tparam G
+ * @tparam num_elements
+ * @tparam typename
+ * @param rom_data the ROM table we are writing into
+ * @param limb_max the maximum size of each limb in the ROM table.
+ *
+ * @details When reading a group element *out* of the ROM table, we must know the maximum value of each coordinate's
+ * limbs. We take this value to be the maximum of the maximum values of the input limbs into the table!
+ * @return std::array<twin_rom_table<C>, 5>
+ */
 template <typename C, class Fq, class Fr, class G>
 template <size_t num_elements, typename>
 std::array<twin_rom_table<C>, 5> element<C, Fq, Fr, G>::create_group_element_rom_tables(
-    const std::array<element, num_elements>& rom_data)
+    const std::array<element, num_elements>& rom_data, std::array<uint256_t, 8>& limb_max)
 {
     std::vector<std::array<field_t<C>, 2>> x_lo_limbs;
     std::vector<std::array<field_t<C>, 2>> x_hi_limbs;
@@ -16,6 +33,15 @@ std::array<twin_rom_table<C>, 5> element<C, Fq, Fr, G>::create_group_element_rom
     std::vector<std::array<field_t<C>, 2>> prime_limbs;
 
     for (size_t i = 0; i < num_elements; ++i) {
+        limb_max[0] = std::max(limb_max[0], rom_data[i].x.binary_basis_limbs[0].maximum_value);
+        limb_max[1] = std::max(limb_max[1], rom_data[i].x.binary_basis_limbs[1].maximum_value);
+        limb_max[2] = std::max(limb_max[2], rom_data[i].x.binary_basis_limbs[2].maximum_value);
+        limb_max[3] = std::max(limb_max[3], rom_data[i].x.binary_basis_limbs[3].maximum_value);
+        limb_max[4] = std::max(limb_max[4], rom_data[i].y.binary_basis_limbs[0].maximum_value);
+        limb_max[5] = std::max(limb_max[5], rom_data[i].y.binary_basis_limbs[1].maximum_value);
+        limb_max[6] = std::max(limb_max[6], rom_data[i].y.binary_basis_limbs[2].maximum_value);
+        limb_max[7] = std::max(limb_max[7], rom_data[i].y.binary_basis_limbs[3].maximum_value);
+
         x_lo_limbs.emplace_back(std::array<field_t<C>, 2>{ rom_data[i].x.binary_basis_limbs[0].element,
                                                            rom_data[i].x.binary_basis_limbs[1].element });
         x_hi_limbs.emplace_back(std::array<field_t<C>, 2>{ rom_data[i].x.binary_basis_limbs[2].element,
@@ -39,7 +65,7 @@ std::array<twin_rom_table<C>, 5> element<C, Fq, Fr, G>::create_group_element_rom
 template <typename C, class Fq, class Fr, class G>
 template <size_t, typename>
 element<C, Fq, Fr, G> element<C, Fq, Fr, G>::read_group_element_rom_tables(
-    const std::array<twin_rom_table<C>, 5>& tables, const field_t<C>& index)
+    const std::array<twin_rom_table<C>, 5>& tables, const field_t<C>& index, const std::array<uint256_t, 8>& limb_max)
 {
     const auto xlo = tables[0][index];
     const auto xhi = tables[1][index];
@@ -49,6 +75,15 @@ element<C, Fq, Fr, G> element<C, Fq, Fr, G>::read_group_element_rom_tables(
 
     Fq x_fq(xlo[0], xlo[1], xhi[0], xhi[1], xyprime[0]);
     Fq y_fq(ylo[0], ylo[1], yhi[0], yhi[1], xyprime[1]);
+    x_fq.binary_basis_limbs[0].maximum_value = limb_max[0];
+    x_fq.binary_basis_limbs[1].maximum_value = limb_max[1];
+    x_fq.binary_basis_limbs[2].maximum_value = limb_max[2];
+    x_fq.binary_basis_limbs[3].maximum_value = limb_max[3];
+    y_fq.binary_basis_limbs[0].maximum_value = limb_max[4];
+    y_fq.binary_basis_limbs[1].maximum_value = limb_max[5];
+    y_fq.binary_basis_limbs[2].maximum_value = limb_max[6];
+    y_fq.binary_basis_limbs[3].maximum_value = limb_max[7];
+
     const auto output = element(x_fq, y_fq);
     return output;
 }
@@ -64,17 +99,17 @@ element<C, Fq, Fr, G>::four_bit_table_plookup<X>::four_bit_table_plookup(const e
         element_table[i] = element_table[i - 1] + d2;
     }
     for (size_t i = 0; i < 8; ++i) {
-        element_table[i] = (-element_table[15 - i]).reduce();
+        element_table[i] = (-element_table[15 - i]);
     }
 
-    coordinates = create_group_element_rom_tables<16>(element_table);
+    coordinates = create_group_element_rom_tables<16>(element_table, limb_max);
 }
 
 template <typename C, class Fq, class Fr, class G>
 template <typename X>
 element<C, Fq, Fr, G> element<C, Fq, Fr, G>::four_bit_table_plookup<X>::operator[](const field_t<C>& index) const
 {
-    return read_group_element_rom_tables<16>(coordinates, index);
+    return read_group_element_rom_tables<16>(coordinates, index, limb_max);
 }
 
 template <class C, class Fq, class Fr, class G>
@@ -146,109 +181,134 @@ template <size_t length, typename X>
 element<C, Fq, Fr, G>::lookup_table_plookup<length, X>::lookup_table_plookup(const std::array<element, length>& inputs)
 {
     if constexpr (length == 2) {
-        element_table[0] = inputs[1] + inputs[0];
-        element_table[1] = inputs[1] - inputs[0];
+        auto [A0, A1] = inputs[1].add_sub(inputs[0]);
+        element_table[0] = A0;
+        element_table[1] = A1;
     } else if constexpr (length == 3) {
-        element R0 = inputs[1] + inputs[0];
-        element R1 = inputs[1] - inputs[0];
-        element_table[0] = inputs[2] + R0; // C + B + A
-        element_table[1] = inputs[2] + R1; // C + B - A
-        element_table[2] = inputs[2] - R1; // C - B + A
-        element_table[3] = inputs[2] - R0; // C - B - A
-    } else if constexpr (length == 4) {
-        element T0 = inputs[1] + inputs[0];
-        element T1 = inputs[1] - inputs[0];
-        element T2 = inputs[3] + inputs[2];
-        element T3 = inputs[3] - inputs[2];
+        auto [R0, R1] = inputs[1].add_sub(inputs[0]); // B ± A
 
-        element_table[0] = T2 + T0; // D + C + B + A
-        element_table[1] = T2 + T1; // D + C + B - A
-        element_table[2] = T2 - T1; // D + C - B + A
-        element_table[3] = T2 - T0; // D + C - B - A
-        element_table[4] = T3 + T0; // D - C + B + A
-        element_table[5] = T3 + T1; // D - C + B - A
-        element_table[6] = T3 - T1; // D - C - B + A
-        element_table[7] = T3 - T0; // D - C - B - A
-    } else if constexpr (length == 5) {
-        element A0 = inputs[1] + inputs[0]; // B + A
-        element A1 = inputs[1] - inputs[0]; // B - A
+        auto [T0, T1] = inputs[2].add_sub(R0); // C ± (B + A)
+        auto [T2, T3] = inputs[2].add_sub(R1); // C ± (B - A)
 
-        element T2 = inputs[3] + inputs[2]; // D + C
-        element T3 = inputs[3] - inputs[2]; // D - C
-
-        element E0 = inputs[4] + T2; // E + D + C // 0 0 0
-        element E1 = inputs[4] + T3; // E + D - C // 0 0 1
-        element E2 = inputs[4] - T3; // E - D + C // 0 1 0
-        element E3 = inputs[4] - T2; // E - D - C // 0 1 1
-
-        element_table[0] = E0 + A0;  // E + D + C + B + A // 0 0 0 0 0
-        element_table[1] = E0 + A1;  // E + D + C + B - A // 0 0 0 0 1
-        element_table[2] = E0 - A1;  // E + D + C - B + A // 0 0 0 1 0
-        element_table[3] = E0 - A0;  // E + D + C - B - A // 0 0 0 1 1
-        element_table[4] = E1 + A0;  // E + D - C + B + A // 0 0 1 0 0
-        element_table[5] = E1 + A1;  // E + D - C + B - A // 0 0 1 0 1
-        element_table[6] = E1 - A1;  // E + D - C - B + A // 0 0 1 1 0
-        element_table[7] = E1 - A0;  // E + D - C - B - A // 0 0 1 1 1
-        element_table[8] = E2 + A0;  // E - D + C + B + A // 0 1 0 0 0
-        element_table[9] = E2 + A1;  // E - D + C + B - A // 0 1 0 0 1
-        element_table[10] = E2 - A1; // E - D + C - B + A // 0 1 0 1 0
-        element_table[11] = E2 - A0; // E - D - C - B - A // 0 1 0 1 1
-        element_table[12] = E3 + A0; // E - D - C + B + A // 0 1 1 0 0
-        element_table[13] = E3 + A1; // E - D - C + B - A // 0 1 1 0 1
-        element_table[14] = E3 - A1; // E - D - C - B + A // 0 1 1 1 0
-        element_table[15] = E3 - A0; // E - D - C - B - A // 0 1 1 1 1
+        element_table[0] = T0;
+        element_table[1] = T2;
+        element_table[2] = T3;
+        element_table[3] = T1;
+    } else if constexpr (length == 4) {
+        auto [T0, T1] = inputs[1].add_sub(inputs[0]); // B ± A
+        auto [T2, T3] = inputs[3].add_sub(inputs[2]); // D ± C
+
+        auto [F0, F3] = T2.add_sub(T0); // (D + C) ± (B + A)
+        auto [F1, F2] = T2.add_sub(T1); // (D + C) ± (B - A)
+        auto [F4, F7] = T3.add_sub(T0); // (D - C) ± (B + A)
+        auto [F5, F6] = T3.add_sub(T1); // (D - C) ± (B - A)
+
+        element_table[0] = F0;
+        element_table[1] = F1;
+        element_table[2] = F2;
+        element_table[3] = F3;
+        element_table[4] = F4;
+        element_table[5] = F5;
+        element_table[6] = F6;
+        element_table[7] = F7;
+    } else if constexpr (length == 5) {
+        auto [A0, A1] = inputs[1].add_sub(inputs[0]); // B ± A
+        auto [T2, T3] = inputs[3].add_sub(inputs[2]); // D ± C
+
+        auto [E0, E3] = inputs[4].add_sub(T2); // E ± (D + C)
+        auto [E1, E2] = inputs[4].add_sub(T3); // E ± (D - C)
+
+        auto [F0, F3] = E0.add_sub(A0);
+        auto [F1, F2] = E0.add_sub(A1);
+        auto [F4, F7] = E1.add_sub(A0);
+        auto [F5, F6] = E1.add_sub(A1);
+        auto [F8, F11] = E2.add_sub(A0);
+        auto [F9, F10] = E2.add_sub(A1);
+        auto [F12, F15] = E3.add_sub(A0);
+        auto [F13, F14] = E3.add_sub(A1);
+
+        element_table[0] = F0;
+        element_table[1] = F1;
+        element_table[2] = F2;
+        element_table[3] = F3;
+        element_table[4] = F4;
+        element_table[5] = F5;
+        element_table[6] = F6;
+        element_table[7] = F7;
+        element_table[8] = F8;
+        element_table[9] = F9;
+        element_table[10] = F10;
+        element_table[11] = F11;
+        element_table[12] = F12;
+        element_table[13] = F13;
+        element_table[14] = F14;
+        element_table[15] = F15;
     } else if constexpr (length == 6) {
         // 44 adds! Only use this if it saves us adding another table to a multi-scalar-multiplication
-        element A0 = inputs[1] + inputs[0]; // B + A
-        element A1 = inputs[1] - inputs[0]; // B - A
-        element E0 = inputs[4] + inputs[3]; // E + D
-        element E1 = inputs[4] - inputs[3]; // E - D
-
-        element C0 = inputs[2] + A0; //  C + B + A
-        element C1 = inputs[2] + A1; //  C + B - A
-        element C2 = inputs[2] - A1; //  C - B + A
-        element C3 = inputs[2] - A0; //  C - B - A
-
-        element F0 = inputs[5] + E0; // F + E + D
-        element F1 = inputs[5] + E1; // F + E - D
-        element F2 = inputs[5] - E1; // F - E + D
-        element F3 = inputs[5] - E0; // F - E - E
-
-        element_table[0] = F0 + C0;
-        element_table[1] = F0 + C1;
-        element_table[2] = F0 + C2;
-        element_table[3] = F0 + C3;
-        element_table[4] = F0 - C3;
-        element_table[5] = F0 - C2;
-        element_table[6] = F0 - C1;
-        element_table[7] = F0 - C0;
-
-        element_table[8] = F1 + C0;
-        element_table[9] = F1 + C1;
-        element_table[10] = F1 + C2;
-        element_table[11] = F1 + C3;
-        element_table[12] = F1 - C3;
-        element_table[13] = F1 - C2;
-        element_table[14] = F1 - C1;
-        element_table[15] = F1 - C0;
-
-        element_table[16] = F2 + C0;
-        element_table[17] = F2 + C1;
-        element_table[18] = F2 + C2;
-        element_table[19] = F2 + C3;
-        element_table[20] = F2 - C3;
-        element_table[21] = F2 - C2;
-        element_table[22] = F2 - C1;
-        element_table[23] = F2 - C0;
-
-        element_table[24] = F3 + C0;
-        element_table[25] = F3 + C1;
-        element_table[26] = F3 + C2;
-        element_table[27] = F3 + C3;
-        element_table[28] = F3 - C3;
-        element_table[29] = F3 - C2;
-        element_table[30] = F3 - C1;
-        element_table[31] = F3 - C0;
+
+        auto [A0, A1] = inputs[1].add_sub(inputs[0]);
+        auto [E0, E1] = inputs[4].add_sub(inputs[3]);
+        auto [C0, C3] = inputs[2].add_sub(A0);
+        auto [C1, C2] = inputs[2].add_sub(A1);
+
+        auto [F0, F3] = inputs[5].add_sub(E0);
+        auto [F1, F2] = inputs[5].add_sub(E1);
+
+        auto [R0, R7] = F0.add_sub(C0);
+        auto [R1, R6] = F0.add_sub(C1);
+        auto [R2, R5] = F0.add_sub(C2);
+        auto [R3, R4] = F0.add_sub(C3);
+
+        auto [S0, S7] = F1.add_sub(C0);
+        auto [S1, S6] = F1.add_sub(C1);
+        auto [S2, S5] = F1.add_sub(C2);
+        auto [S3, S4] = F1.add_sub(C3);
+
+        auto [U0, U7] = F2.add_sub(C0);
+        auto [U1, U6] = F2.add_sub(C1);
+        auto [U2, U5] = F2.add_sub(C2);
+        auto [U3, U4] = F2.add_sub(C3);
+
+        auto [W0, W7] = F3.add_sub(C0);
+        auto [W1, W6] = F3.add_sub(C1);
+        auto [W2, W5] = F3.add_sub(C2);
+        auto [W3, W4] = F3.add_sub(C3);
+
+        element_table[0] = R0;
+        element_table[1] = R1;
+        element_table[2] = R2;
+        element_table[3] = R3;
+        element_table[4] = R4;
+        element_table[5] = R5;
+        element_table[6] = R6;
+        element_table[7] = R7;
+
+        element_table[8] = S0;
+        element_table[9] = S1;
+        element_table[10] = S2;
+        element_table[11] = S3;
+        element_table[12] = S4;
+        element_table[13] = S5;
+        element_table[14] = S6;
+        element_table[15] = S7;
+
+        element_table[16] = U0;
+        element_table[17] = U1;
+        element_table[18] = U2;
+        element_table[19] = U3;
+        element_table[20] = U4;
+        element_table[21] = U5;
+        element_table[22] = U6;
+        element_table[23] = U7;
+
+        element_table[24] = W0;
+        element_table[25] = W1;
+        element_table[26] = W2;
+        element_table[27] = W3;
+        element_table[28] = W4;
+        element_table[29] = W5;
+        element_table[30] = W6;
+        element_table[31] = W7;
     } else if constexpr (length == 7) {
         // 82 adds! This one is not worth using...
 
@@ -341,9 +401,9 @@ element<C, Fq, Fr, G>::lookup_table_plookup<length, X>::lookup_table_plookup(con
         element_table[63] = G3 - E0;
     }
     for (size_t i = 0; i < table_size / 2; ++i) {
-        element_table[i + table_size / 2] = (-element_table[table_size / 2 - 1 - i]).reduce();
+        element_table[i + table_size / 2] = (-element_table[table_size / 2 - 1 - i]);
     }
-    coordinates = create_group_element_rom_tables<table_size>(element_table);
+    coordinates = create_group_element_rom_tables<table_size>(element_table, limb_max);
 }
 
 template <typename C, class Fq, class Fr, class G>
@@ -356,7 +416,7 @@ element<C, Fq, Fr, G> element<C, Fq, Fr, G>::lookup_table_plookup<length, X>::ge
         accumulators.emplace_back(field_t<C>(bits[i]) * (1ULL << i));
     }
     field_t<C> index = field_t<C>::accumulate(accumulators);
-    return read_group_element_rom_tables<table_size>(coordinates, index);
+    return read_group_element_rom_tables<table_size>(coordinates, index, limb_max);
 }
 
 /**
diff --git a/cpp/src/barretenberg/stdlib/recursion/aggregation_state/aggregation_state.hpp b/cpp/src/barretenberg/stdlib/recursion/aggregation_state/aggregation_state.hpp
index 64d7e69fc8..005ce5aec2 100644
--- a/cpp/src/barretenberg/stdlib/recursion/aggregation_state/aggregation_state.hpp
+++ b/cpp/src/barretenberg/stdlib/recursion/aggregation_state/aggregation_state.hpp
@@ -28,9 +28,45 @@ template <typename Curve> struct aggregation_state {
         //    has_data == other.has_data; can't compare as native
     };
 
+    /**
+     * @brief TODO(@dbanks12): please migrate A3 circuits to using `assign_object_to_proof_outputs`. It is much safer
+     * not to independently track `proof_witness_indices` and whether the object has been assigned to public inputs.
+     *
+     */
     void add_proof_outputs_as_public_inputs()
     {
-        ASSERT(proof_witness_indices.size() > 0);
+        auto* context = P0.get_context();
+        context->add_recursive_proof(proof_witness_indices);
+    }
+
+    void assign_object_to_proof_outputs()
+    {
+        if (proof_witness_indices.size() == 0) {
+            std::cerr << "warning. calling `add_proof_outputs_as_public_inputs`, but aggregation object already has "
+                         "assigned proof outputs to public inputs.";
+            return;
+        }
+
+        P0 = P0.reduce();
+        P1 = P1.reduce();
+        proof_witness_indices = {
+            P0.x.binary_basis_limbs[0].element.normalize().witness_index,
+            P0.x.binary_basis_limbs[1].element.normalize().witness_index,
+            P0.x.binary_basis_limbs[2].element.normalize().witness_index,
+            P0.x.binary_basis_limbs[3].element.normalize().witness_index,
+            P0.y.binary_basis_limbs[0].element.normalize().witness_index,
+            P0.y.binary_basis_limbs[1].element.normalize().witness_index,
+            P0.y.binary_basis_limbs[2].element.normalize().witness_index,
+            P0.y.binary_basis_limbs[3].element.normalize().witness_index,
+            P1.x.binary_basis_limbs[0].element.normalize().witness_index,
+            P1.x.binary_basis_limbs[1].element.normalize().witness_index,
+            P1.x.binary_basis_limbs[2].element.normalize().witness_index,
+            P1.x.binary_basis_limbs[3].element.normalize().witness_index,
+            P1.y.binary_basis_limbs[0].element.normalize().witness_index,
+            P1.y.binary_basis_limbs[1].element.normalize().witness_index,
+            P1.y.binary_basis_limbs[2].element.normalize().witness_index,
+            P1.y.binary_basis_limbs[3].element.normalize().witness_index,
+        };
 
         auto* context = P0.get_context();
 
diff --git a/cpp/src/barretenberg/stdlib/recursion/transcript/transcript.hpp b/cpp/src/barretenberg/stdlib/recursion/transcript/transcript.hpp
index 5ca0439d9c..9db2f12dc9 100644
--- a/cpp/src/barretenberg/stdlib/recursion/transcript/transcript.hpp
+++ b/cpp/src/barretenberg/stdlib/recursion/transcript/transcript.hpp
@@ -12,10 +12,11 @@
 #include "../../commitment/pedersen/pedersen_plookup.hpp"
 #include "../../primitives/bigfield/bigfield.hpp"
 #include "../../primitives/biggroup/biggroup.hpp"
-#include "../../primitives/bool/bool.hpp"
 #include "../../primitives/field/field.hpp"
 #include "../../primitives/witness/witness.hpp"
+#include "../../primitives/bool/bool.hpp"
 
+#include "../verification_key//verification_key.hpp"
 namespace proof_system::plonk {
 namespace stdlib {
 namespace recursion {
@@ -133,196 +134,140 @@ template <typename Composer> class Transcript {
             ++current_round;
             return;
         }
-        const size_t bytes_per_element = 31;
-
-        // split element into 2 limbs and insert into element_buffer
-        // each entry in element_buffer is 31 bytes
-        const auto split = [&](field_pt& work_element,
-                               std::vector<field_pt>& element_buffer,
-                               const field_pt& element,
-                               size_t& current_byte_counter,
-                               const size_t num_bytes) {
-            uint256_t element_u256(element.get_value());
-            size_t hi_bytes = bytes_per_element - current_byte_counter;
-            if (hi_bytes >= num_bytes) {
-                // hmm
-                size_t new_byte_counter = current_byte_counter + num_bytes;
-                field_pt hi = element;
-                const size_t leftovers = bytes_per_element - new_byte_counter;
-                field_pt buffer_shift =
-                    field_pt(context, barretenberg::fr(uint256_t(1) << ((uint64_t)leftovers * 8ULL)));
-                work_element = work_element + (hi * buffer_shift);
-                work_element = work_element.normalize();
-                current_byte_counter = new_byte_counter;
-                if (current_byte_counter == bytes_per_element) {
-                    current_byte_counter = 0;
-                    element_buffer.push_back(work_element);
-                    work_element = field_pt(context, barretenberg::fr(0));
-                }
-                return;
-            }
-            const size_t lo_bytes = num_bytes - hi_bytes;
-            field_pt lo = witness_t(context, barretenberg::fr(element_u256.slice(0, lo_bytes * 8)));
-            field_pt hi = witness_t(context, barretenberg::fr(element_u256.slice(lo_bytes * 8, 256)));
-            lo.create_range_constraint(lo_bytes * 8);
-            hi.create_range_constraint(hi_bytes * 8);
-            field_pt shift(context, barretenberg::fr(uint256_t(1ULL) << (uint64_t)lo_bytes * 8ULL));
-            field_pt sum = lo + (hi * shift);
-            if (!element.is_constant() || !sum.is_constant()) {
-                sum.assert_equal(element);
-            }
-            current_byte_counter = (current_byte_counter + num_bytes) % bytes_per_element;
-
-            // if current_byte_counter == 0 we've rolled over
-            if (current_byte_counter == 0) {
-                element_buffer.push_back(work_element + hi);
-                element_buffer.push_back(lo);
-                work_element = field_pt(context, 0);
-            } else {
-                work_element = work_element + hi;
-
-                element_buffer.push_back(work_element);
-
-                field_t lo_shift(
-                    context, barretenberg::fr(uint256_t(1ULL) << ((31ULL - (uint64_t)current_byte_counter) * 8ULL)));
-                work_element = (lo * lo_shift);
-                work_element = work_element.normalize();
-            }
-        };
-
-        std::vector<field_pt> compression_buffer;
         field_pt working_element(context);
 
-        size_t byte_counter = 0;
+        // maximum number of bytes we can store in a field element w/o wrapping modulus is 31.
+        // while we could store more *bits*, we want `preimage_buffer` to mirror how data is formatted
+        // when we serialize field/group elements natively (i.e. a byte array)
+        static constexpr size_t NUM_BITS_PER_PREIMAGE_ELEMENT = 31UL * 8UL;
+        PedersenPreimageBuilder<Composer, NUM_BITS_PER_PREIMAGE_ELEMENT> preimage_buffer(context);
         if (current_round > 0) {
-            split(working_element, compression_buffer, field_pt(current_challenge), byte_counter, 32);
+            preimage_buffer.add_element(current_challenge);
         }
         for (auto manifest_element : get_manifest().get_round_manifest(current_round).elements) {
             if (manifest_element.num_bytes == 32 && manifest_element.name != "public_inputs") {
-                split(working_element,
-                      compression_buffer,
-                      get_field_element(manifest_element.name),
-                      byte_counter,
-                      manifest_element.num_bytes);
+                preimage_buffer.add_element(get_field_element(manifest_element.name));
             } else if (manifest_element.num_bytes == 64 && manifest_element.name != "public_inputs") {
                 group_pt point = get_circuit_group_element(manifest_element.name);
 
-                field_pt y_hi =
-                    point.y.binary_basis_limbs[2].element + (point.y.binary_basis_limbs[3].element * fq_pt::shift_1);
-                field_pt y_lo =
-                    point.y.binary_basis_limbs[0].element + (point.y.binary_basis_limbs[1].element * fq_pt::shift_1);
-                field_pt x_hi =
-                    point.x.binary_basis_limbs[2].element + (point.x.binary_basis_limbs[3].element * fq_pt::shift_1);
-                field_pt x_lo =
-                    point.x.binary_basis_limbs[0].element + (point.x.binary_basis_limbs[1].element * fq_pt::shift_1);
-                const size_t lo_bytes = fq_pt::NUM_LIMB_BITS / 4;
-                const size_t hi_bytes = 32 - lo_bytes;
-
-                split(working_element, compression_buffer, y_hi, byte_counter, hi_bytes);
-                split(working_element, compression_buffer, y_lo, byte_counter, lo_bytes);
-                split(working_element, compression_buffer, x_hi, byte_counter, hi_bytes);
-                split(working_element, compression_buffer, x_lo, byte_counter, lo_bytes);
+                // In our buffer, we want to represent each field element as occupying 256 bits of data (to match what
+                // the native transcript does)
+                const auto& x = point.x;
+                const auto& y = point.y;
+                constexpr size_t last_limb_bits = 256 - (fq_pt::NUM_LIMB_BITS * 3);
+                preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[3].element,
+                                                                           last_limb_bits);
+                preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[2].element,
+                                                                           fq_pt::NUM_LIMB_BITS);
+                preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[1].element,
+                                                                           fq_pt::NUM_LIMB_BITS);
+                preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[0].element,
+                                                                           fq_pt::NUM_LIMB_BITS);
+                preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[3].element,
+                                                                           last_limb_bits);
+                preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[2].element,
+                                                                           fq_pt::NUM_LIMB_BITS);
+                preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[1].element,
+                                                                           fq_pt::NUM_LIMB_BITS);
+                preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[0].element,
+                                                                           fq_pt::NUM_LIMB_BITS);
+
             } else if (manifest_element.name == "public_inputs") {
                 std::vector<field_pt> field_array = get_field_element_vector(manifest_element.name);
                 for (size_t i = 0; i < field_array.size(); ++i) {
-                    split(working_element, compression_buffer, field_array[i], byte_counter, 32);
+                    preimage_buffer.add_element(field_array[i]);
                 }
             } else if (manifest_element.num_bytes < 32 && manifest_element.name != "public_inputs") {
-                split(working_element,
-                      compression_buffer,
-                      get_field_element(manifest_element.name),
-                      byte_counter,
-                      manifest_element.num_bytes);
+                // TODO(zac): init round data is being grabbed out of the manifest and not the vkey
+                preimage_buffer.add_element_with_existing_range_constraint(get_field_element(manifest_element.name),
+                                                                           manifest_element.num_bytes * 8);
             }
         }
-        std::vector<byte_array<Composer>> round_challenges;
+        std::vector<field_pt> round_challenges_new;
 
-        if (byte_counter != 0) {
-            const uint256_t down_shift = uint256_t(1) << uint256_t((bytes_per_element - byte_counter) * 8);
-            working_element = working_element / barretenberg::fr(down_shift);
-            working_element = working_element.normalize();
+        field_pt T0;
+        T0 = preimage_buffer.compress(0);
 
-            compression_buffer.push_back(working_element);
-        }
+        // helper: split a field element into its low `low_bits` bits (default 128) and the remaining high bits
+        const auto slice_into_halves = [&](const field_pt& in, const size_t low_bits = 128) {
+            uint256_t v = in.get_value();
+            uint256_t lo = v.slice(0, low_bits);
+            uint256_t hi = v.slice(low_bits, 256);
 
-        field_pt T0;
-        if constexpr (Composer::type == ComposerType::PLOOKUP) {
-            T0 = stdlib::pedersen_plookup_commitment<Composer>::compress(compression_buffer);
-        } else {
-            T0 = stdlib::pedersen_commitment<Composer>::compress(compression_buffer);
-        }
-        byte_array<Composer> compressed_buffer(T0);
+            field_pt y_lo = field_pt::from_witness(context, lo);
+            field_pt y_hi = field_pt::from_witness(context, hi);
 
-        // TODO(@zac-williamson) make this a Poseidon hash
-        byte_array<Composer> base_hash;
-        if constexpr (Composer::type == ComposerType::PLOOKUP) {
-            std::vector<field_pt> compression_buffer;
-            field_pt working_element(context);
-            size_t byte_counter = 0;
-            split(working_element, compression_buffer, field_pt(compressed_buffer), byte_counter, 32);
-            if (byte_counter != 0) {
-                const uint256_t down_shift = uint256_t(1) << uint256_t((bytes_per_element - byte_counter) * 8);
-                working_element = working_element / barretenberg::fr(down_shift);
-                working_element = working_element.normalize();
-                compression_buffer.push_back(working_element);
+            y_lo.create_range_constraint(low_bits);
+            y_hi.create_range_constraint(254 - low_bits);
+
+            in.add_two(-y_lo, -y_hi * (uint256_t(1) << low_bits)).assert_equal(0);
+
+            // Validate the sum of our two halves does not exceed the circuit modulus over the integers
+            constexpr uint256_t modulus = fr::modulus;
+            const field_pt r_lo = field_pt(context, modulus.slice(0, low_bits));
+            const field_pt r_hi = field_pt(context, modulus.slice(low_bits, 256));
+
+            bool need_borrow = (uint256_t(y_lo.get_value()) > uint256_t(r_lo.get_value()));
+            field_pt borrow = field_pt::from_witness(context, need_borrow);
+
+            // directly call `create_new_range_constraint` to avoid creating an arithmetic gate
+            if constexpr (Composer::type == ComposerType::PLOOKUP) {
+                context->create_new_range_constraint(borrow.get_witness_index(), 1, "borrow");
+            } else {
+                context->create_range_constraint(borrow.get_witness_index(), 1, "borrow");
             }
-            base_hash = stdlib::pedersen_plookup_commitment<Composer>::compress(compression_buffer);
+
+            // Hi range check = r_hi - y_hi - borrow
+            // Lo range check = r_lo - y_lo + borrow * 2^{low_bits}
+            field_pt res_hi = (r_hi - y_hi) - borrow;
+            field_pt res_lo = (r_lo - y_lo) + (borrow * (uint256_t(1) << low_bits));
+
+            res_hi.create_range_constraint(modulus.get_msb() + 1 - low_bits);
+            res_lo.create_range_constraint(low_bits);
+
+            return std::array<field_pt, 2>{ y_lo, y_hi };
+        };
+
+        field_pt base_hash;
+        if constexpr (Composer::type == ComposerType::PLOOKUP) {
+            base_hash = stdlib::pedersen_plookup_commitment<Composer>::compress(std::vector<field_pt>{ T0 }, 0);
         } else {
-            base_hash = stdlib::blake3s(compressed_buffer);
+            base_hash = stdlib::pedersen_commitment<Composer>::compress(std::vector<field_pt>{ T0 }, 0);
         }
-        byte_array<Composer> first(field_pt(0), 16);
-        first.write(base_hash.slice(0, 16));
-        round_challenges.push_back(first);
+        auto hash_halves = slice_into_halves(base_hash);
+        round_challenges_new.push_back(hash_halves[1]);
 
         if (num_challenges > 1) {
-            byte_array<Composer> second(field_pt(0), 16);
-            second.write(base_hash.slice(16, 16));
-            round_challenges.push_back(second);
+            round_challenges_new.push_back(hash_halves[0]);
         }
+        base_hash = (slice_into_halves(base_hash, 8)[1] * 256).normalize();
 
-        // This block of code only executes for num_challenges > 2, which (currently) only happens in the nu round when
-        // we need to generate short scalars. In this case, we generate 32-byte challenges and split them in half to get
-        // the relevant challenges.
+        // This block of code only executes for num_challenges > 2, which (currently) only happens in the nu round
+        // when we need to generate short scalars. In this case, we generate 32-byte challenges and split them in
+        // half to get the relevant challenges.
         for (size_t i = 2; i < num_challenges; i += 2) {
-            byte_array<Composer> rolling_buffer = base_hash;
-            byte_array<Composer> hash_output;
+            // TODO(@zac-williamson) make this a Poseidon hash not a Pedersen hash
+            field_pt hash_output;
             if constexpr (Composer::type == ComposerType::PLOOKUP) {
-                // TODO(@zac-williamson) make this a Poseidon hash not a Pedersen hash
-                std::vector<field_pt> compression_buffer;
-                field_pt working_element(context);
-                size_t byte_counter = 0;
-                split(working_element, compression_buffer, field_pt(rolling_buffer), byte_counter, 32);
-                split(working_element, compression_buffer, field_pt(field_pt(i / 2)), byte_counter, 1);
-                if (byte_counter != 0) {
-                    const uint256_t down_shift = uint256_t(1) << uint256_t((bytes_per_element - byte_counter) * 8);
-                    working_element = working_element / barretenberg::fr(down_shift);
-                    working_element = working_element.normalize();
-                    compression_buffer.push_back(working_element);
-                }
-                hash_output = stdlib::pedersen_plookup_commitment<Composer>::compress(compression_buffer);
+                hash_output = stdlib::pedersen_plookup_commitment<Composer>::compress(
+                    std::vector<field_pt>{ (base_hash + field_pt(i / 2)).normalize() }, 0);
             } else {
-                rolling_buffer.write(byte_array<Composer>(field_pt(i / 2), 1));
-                hash_output = stdlib::blake3s(rolling_buffer);
+                hash_output = stdlib::pedersen_commitment<Composer>::compress(
+                    std::vector<field_pt>{ (base_hash + field_pt(i / 2)).normalize() }, 0);
             }
-            byte_array<Composer> hi(field_pt(0), 16);
-            hi.write(hash_output.slice(0, 16));
-            round_challenges.push_back(hi);
-
+            auto hash_halves = slice_into_halves(hash_output);
+            round_challenges_new.push_back(hash_halves[1]);
             if (i + 1 < num_challenges) {
-                byte_array<Composer> lo(field_pt(0), 16);
-                lo.write(hash_output.slice(16, 16));
-                round_challenges.push_back(lo);
+                round_challenges_new.push_back(hash_halves[0]);
             }
         }
-
-        current_challenge = round_challenges[round_challenges.size() - 1];
+        current_challenge = round_challenges_new[round_challenges_new.size() - 1];
         ++current_round;
-
         challenge_keys.push_back(challenge_name);
 
         std::vector<field_pt> challenge_elements;
-        for (const auto& challenge : round_challenges) {
-            challenge_elements.push_back(static_cast<field_pt>(challenge));
+        for (const auto& challenge : round_challenges_new) {
+            challenge_elements.push_back(challenge);
         }
         challenge_values.push_back(challenge_elements);
     }
@@ -420,7 +365,7 @@ template <typename Composer> class Transcript {
 
   private:
     transcript::Transcript transcript_base;
-    byte_array<Composer> current_challenge;
+    field_pt current_challenge;
 
     mutable std::vector<std::string> field_vector_keys;
     mutable std::vector<std::vector<field_pt>> field_vector_values;
diff --git a/cpp/src/barretenberg/stdlib/recursion/verification_key/verification_key.hpp b/cpp/src/barretenberg/stdlib/recursion/verification_key/verification_key.hpp
index 57b8a39207..915f63e5a3 100644
--- a/cpp/src/barretenberg/stdlib/recursion/verification_key/verification_key.hpp
+++ b/cpp/src/barretenberg/stdlib/recursion/verification_key/verification_key.hpp
@@ -20,10 +20,163 @@
 #include "../../commitment/pedersen/pedersen_plookup.hpp"
 #include "../../primitives/curves/bn254.hpp"
 
+#include "barretenberg/crypto/pedersen_commitment/convert_buffer_to_field.hpp"
+
 namespace proof_system::plonk {
 namespace stdlib {
 namespace recursion {
 
+/**
+ * @brief Constructs a packed buffer of field elements to be fed into a Pedersen compress function.
+ *        The goal is to concatenate multiple inputs into a single field element if the inputs are known to be
+ * small. Produces a vector of field elements where the maximum number of bits per element is `bits_per_element`.
+ *
+ * @details When calling `pedersen::compress` on the final buffer, we can skip the range checks normally performed in
+ * the compress method, because we know the sums of the scalar slices cannot exceed the field modulus. This requires
+ * `bits_per_element < modulus bits`
+ * @tparam Composer
+ * @tparam bits_per_element
+ */
+template <class Composer, size_t bits_per_element = 248> struct PedersenPreimageBuilder {
+    using field_pt = field_t<Composer>;
+    using witness_pt = witness_t<Composer>;
+
+    Composer* context;
+
+    PedersenPreimageBuilder(Composer* ctx = nullptr)
+        : context(ctx){};
+
+    field_pt compress(const size_t hash_index)
+    {
+        // we can only use relaxed range checks in pedersen::compress iff bits_per_element < modulus bits
+        static_assert(bits_per_element < uint256_t(barretenberg::fr::modulus).get_msb());
+
+        if (current_bit_counter != 0) {
+            const uint256_t down_shift = uint256_t(1) << uint256_t((bits_per_element - current_bit_counter));
+            for (auto& x : work_element) {
+                x = x / barretenberg::fr(down_shift);
+            }
+            preimage_data.push_back(field_pt::accumulate(work_element));
+        }
+        if constexpr (Composer::type == ComposerType::PLOOKUP) {
+            return pedersen_plookup_commitment<Composer>::compress_with_relaxed_range_constraints(preimage_data,
+                                                                                                  hash_index);
+        } else {
+            return pedersen_commitment<Composer>::compress(preimage_data, hash_index);
+        }
+    }
+
+    /**
+     * @brief preimage_data is a bit-array where `bits_per_element` number of bits are packed into a single field
+     * element
+     */
+    std::vector<field_pt> preimage_data;
+
+    /**
+     * @brief work_element represents the leading element to be added into `preimage_data`.
+     *        Vector is composed of field elements that represent bit chunks of a known length,
+     *        such that the sum of the bit chunks < bits_per_element
+     */
+    std::vector<field_pt> work_element;
+
+    size_t current_bit_counter = 0;
+
+    void add_element(const field_pt& element) { slice_element(element, 256); }
+
+    void add_element_with_existing_range_constraint(const field_pt& element, const size_t num_bits)
+    {
+        slice_element(element, num_bits);
+    }
+
+    /**
+     * @brief Populate `preimage_data` with element whose size is known to be `num_bits`.
+     * `preimage_data` is treated as a bit-array where `bits_per_element` number of bits are packed into a single field
+     * element. `slice_element` will:
+     *
+     * 1. determine how many bits are remaining in work_element
+     * 2. if remaining bits < num_bits, slice `element` into 2 chunks hi/lo
+     * 3. fill work_element with `hi` chunk (or the full element if possible)
+     * 4. (if work_element is full) combine work_element chunks into a field element and push onto `preimage_data`
+     * 5. (if required) create a new work_element and populate with `lo`
+     *
+     * @param element
+     * @param num_bits
+     */
+    void slice_element(const field_pt& element, const size_t num_bits)
+    {
+        ASSERT(context != nullptr);
+        uint256_t element_u256(element.get_value());
+        size_t hi_bits = bits_per_element - current_bit_counter;
+        if (hi_bits >= num_bits) {
+            // element fits entirely in the bits left in the current work element: no slicing required
+            size_t new_bit_counter = current_bit_counter + num_bits;
+            field_pt hi = element;
+            const size_t leftovers = bits_per_element - new_bit_counter;
+            field_pt buffer_shift = field_pt(context, barretenberg::fr(uint256_t(1) << ((uint64_t)leftovers)));
+            work_element.emplace_back(hi * buffer_shift);
+            current_bit_counter = new_bit_counter;
+            if (current_bit_counter == bits_per_element) {
+                current_bit_counter = 0;
+                preimage_data.push_back(field_pt::accumulate(work_element));
+
+                work_element = std::vector<field_pt>();
+            }
+            return;
+        }
+        const size_t lo_bits = num_bits - hi_bits;
+        field_pt lo = witness_t(context, barretenberg::fr(element_u256.slice(0, lo_bits)));
+        field_pt hi = witness_t(context, barretenberg::fr(element_u256.slice(lo_bits, 256)));
+        lo.create_range_constraint(lo_bits);
+        hi.create_range_constraint(hi_bits);
+        field_pt shift(context, barretenberg::fr(uint256_t(1ULL) << (uint64_t)lo_bits));
+        if (!element.is_constant() || !lo.is_constant() || !hi.is_constant()) {
+            lo.add_two(hi * shift, -element).assert_equal(0);
+        }
+
+        constexpr uint256_t modulus = barretenberg::fr::modulus;
+        constexpr size_t modulus_bits = modulus.get_msb();
+
+        // If our input is a full field element we must validate the sum of our slices is < p
+        if (num_bits >= modulus_bits) {
+            const field_pt r_lo = field_pt(context, modulus.slice(0, lo_bits));
+            const field_pt r_hi = field_pt(context, modulus.slice(lo_bits, num_bits));
+
+            bool need_borrow = (uint256_t(lo.get_value()) > uint256_t(r_lo.get_value()));
+            field_pt borrow = field_pt::from_witness(context, need_borrow);
+
+            // directly call `create_new_range_constraint` to avoid creating an arithmetic gate
+            if constexpr (Composer::type == ComposerType::PLOOKUP) {
+                context->create_new_range_constraint(borrow.get_witness_index(), 1, "borrow");
+            } else {
+                context->create_range_constraint(borrow.get_witness_index(), 1, "borrow");
+            }
+            // Hi range check = r_hi - y_hi - borrow
+            // Lo range check = r_lo - y_lo + borrow * 2^{lo_bits}
+            field_t res_hi = (r_hi - hi) - borrow;
+            field_t res_lo = (r_lo - lo) + (borrow * (uint256_t(1) << lo_bits));
+
+            res_hi.create_range_constraint(modulus_bits + 1 - lo_bits);
+            res_lo.create_range_constraint(lo_bits);
+        }
+        current_bit_counter = (current_bit_counter + num_bits) % bits_per_element;
+
+        // if current_bit_counter == 0 we've rolled over
+        if (current_bit_counter == 0) {
+            work_element.emplace_back(hi);
+            preimage_data.push_back(field_pt::accumulate(work_element));
+            preimage_data.push_back(lo);
+            work_element = std::vector<field_pt>();
+        } else {
+            work_element.emplace_back(hi);
+            preimage_data.push_back(field_pt::accumulate(work_element));
+            field_t lo_shift(context,
+                             barretenberg::fr(uint256_t(1ULL) << ((bits_per_element - (uint64_t)current_bit_counter))));
+            work_element = std::vector<field_pt>();
+            work_element.emplace_back(lo * lo_shift);
+        }
+    };
+};
+
 template <typename Composer> struct evaluation_domain {
     static evaluation_domain from_witness(Composer* ctx, const barretenberg::evaluation_domain& input)
     {
@@ -51,44 +204,6 @@ template <typename Composer> struct evaluation_domain {
         return domain;
     }
 
-    field_t<Composer> compress() const
-    {
-        if constexpr (Composer::type == ComposerType::PLOOKUP) {
-            field_t<Composer> out = pedersen_plookup_commitment<Composer>::compress({
-                root,
-                domain,
-                generator,
-            });
-            return out;
-        } else {
-            field_t<Composer> out = pedersen_commitment<Composer>::compress({
-                root,
-                domain,
-                generator,
-            });
-            return out;
-        }
-    }
-
-    static barretenberg::fr compress_native(const barretenberg::evaluation_domain& input)
-    {
-        barretenberg::fr out;
-        if constexpr (Composer::type == ComposerType::PLOOKUP) {
-            out = crypto::pedersen_commitment::lookup::compress_native({
-                input.root,
-                input.domain,
-                input.generator,
-            });
-        } else {
-            out = crypto::pedersen_commitment::compress_native({
-                input.root,
-                input.domain,
-                input.generator,
-            });
-        }
-        return out;
-    }
-
     field_t<Composer> root;
     field_t<Composer> root_inverse;
     field_t<Composer> domain;
@@ -120,9 +235,15 @@ template <typename Curve> struct verification_key {
         key->num_public_inputs = witness_t<Composer>(ctx, input_key->num_public_inputs);
         key->domain = evaluation_domain<Composer>::from_witness(ctx, input_key->domain);
         key->contains_recursive_proof = witness_t<Composer>(ctx, input_key->contains_recursive_proof);
-
         for (const auto& [tag, value] : input_key->commitments) {
-            key->commitments.insert({ tag, Curve::g1_ct::from_witness(ctx, value) });
+            // We do not perform on_curve() circuit checks when constructing the Curve::g1_ct element.
+            // The assumption is that the circuit creator is honest and that the verification key hash (or some other
+            // method) will be used to ensure the provided key matches the key produced by the circuit creator.
+            // If the circuit creator is not honest, the entire set of circuit constraints being proved over cannot be
+            // trusted!
+            const typename Curve::fq_ct x = Curve::fq_ct::from_witness(ctx, value.x);
+            const typename Curve::fq_ct y = Curve::fq_ct::from_witness(ctx, value.y);
+            key->commitments.insert({ tag, typename Curve::g1_ct(x, y) });
         }
 
         return key;
@@ -189,71 +310,65 @@ template <typename Curve> struct verification_key {
   public:
     field_t<Composer> compress(size_t const hash_index = 0)
     {
-        field_t<Composer> compressed_domain = domain.compress();
-
-        std::vector<field_t<Composer>> preimage_data;
-        preimage_data.push_back(Composer::type);
-        preimage_data.push_back(compressed_domain);
-        preimage_data.push_back(num_public_inputs);
+        PedersenPreimageBuilder<Composer> preimage_buffer(context); // packs inputs into 248-bit elements by default
+
+        field_t<Composer> composer_type = witness_t<Composer>::create_constant_witness(context, Composer::type);
+        domain.generator.create_range_constraint(16, "domain.generator");
+        domain.domain.create_range_constraint(32, "domain.domain");
+        num_public_inputs.create_range_constraint(32, "num_public_inputs");
+        preimage_buffer.add_element_with_existing_range_constraint(composer_type, 8);
+        preimage_buffer.add_element_with_existing_range_constraint(domain.generator, 16); // coset generator is small
+        preimage_buffer.add_element_with_existing_range_constraint(domain.domain, 32);
+        preimage_buffer.add_element_with_existing_range_constraint(num_public_inputs, 32);
+        constexpr size_t limb_bits = Curve::fq_ct::NUM_LIMB_BITS;
+        constexpr size_t last_limb_bits = 256 - (limb_bits * 3); // bits left for limb 3 of a 256-bit value
         for (const auto& [tag, selector] : commitments) {
-            preimage_data.push_back(selector.x.binary_basis_limbs[0].element);
-            preimage_data.push_back(selector.x.binary_basis_limbs[1].element);
-            preimage_data.push_back(selector.x.binary_basis_limbs[2].element);
-            preimage_data.push_back(selector.x.binary_basis_limbs[3].element);
-            preimage_data.push_back(selector.y.binary_basis_limbs[0].element);
-            preimage_data.push_back(selector.y.binary_basis_limbs[1].element);
-            preimage_data.push_back(selector.y.binary_basis_limbs[2].element);
-            preimage_data.push_back(selector.y.binary_basis_limbs[3].element);
-        }
-
-        field_t<Composer> compressed_key;
-        if constexpr (Composer::type == ComposerType::PLOOKUP) {
-            compressed_key = pedersen_plookup_commitment<Composer>::compress(preimage_data, hash_index);
-        } else {
-            compressed_key = pedersen_commitment<Composer>::compress(preimage_data, hash_index);
+            const auto& x = selector.x;
+            const auto& y = selector.y;
+            preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[3].element, last_limb_bits);
+            preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[2].element, limb_bits);
+            preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[1].element, limb_bits);
+            preimage_buffer.add_element_with_existing_range_constraint(y.binary_basis_limbs[0].element, limb_bits);
+            preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[3].element, last_limb_bits);
+            preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[2].element, limb_bits);
+            preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[1].element, limb_bits);
+            preimage_buffer.add_element_with_existing_range_constraint(x.binary_basis_limbs[0].element, limb_bits);
         }
+        preimage_buffer.add_element(domain.root); // no range constraint on root: packed as a full 256-bit element
+        field_t<Composer> compressed_key = preimage_buffer.compress(hash_index);
         return compressed_key;
     }
 
-    static barretenberg::fr compress_native(const std::shared_ptr<plonk::verification_key>& key,
-                                            const size_t hash_index = 0)
+    static barretenberg::fr compress_native(const std::shared_ptr<plonk::verification_key>& key, const size_t = 0)
     {
-        barretenberg::fr compressed_domain = evaluation_domain<Composer>::compress_native(key->domain);
-
-        constexpr size_t num_limb_bits = bn254<plonk::UltraComposer>::fq_ct::NUM_LIMB_BITS;
-        const auto split_bigfield_limbs = [](const uint256_t& element) {
-            std::vector<barretenberg::fr> limbs;
-            limbs.push_back(element.slice(0, num_limb_bits));
-            limbs.push_back(element.slice(num_limb_bits, num_limb_bits * 2));
-            limbs.push_back(element.slice(num_limb_bits * 2, num_limb_bits * 3));
-            limbs.push_back(element.slice(num_limb_bits * 3, num_limb_bits * 4));
-            return limbs;
-        };
-
-        std::vector<barretenberg::fr> preimage_data;
-        preimage_data.push_back(Composer::type);
-        preimage_data.push_back(compressed_domain);
-        preimage_data.push_back(key->num_public_inputs);
+        std::vector<uint8_t> preimage_data;
+
+        preimage_data.push_back(static_cast<uint8_t>(Composer::type));
+
+        const uint256_t domain = key->domain.domain;
+        const uint256_t generator = key->domain.generator;
+        const uint256_t num_public_inputs = key->num_public_inputs;
+
+        ASSERT(domain < (uint256_t(1) << 32));
+        ASSERT(generator < (uint256_t(1) << 16));
+        ASSERT(num_public_inputs < (uint256_t(1) << 32));
+
+        write(preimage_data, static_cast<uint16_t>(uint256_t(key->domain.generator)));
+        write(preimage_data, static_cast<uint32_t>(uint256_t(key->domain.domain)));
+        write(preimage_data, static_cast<uint32_t>(key->num_public_inputs));
         for (const auto& [tag, selector] : key->commitments) {
-            const auto x_limbs = split_bigfield_limbs(selector.x);
-            const auto y_limbs = split_bigfield_limbs(selector.y);
-
-            preimage_data.push_back(x_limbs[0]);
-            preimage_data.push_back(x_limbs[1]);
-            preimage_data.push_back(x_limbs[2]);
-            preimage_data.push_back(x_limbs[3]);
-
-            preimage_data.push_back(y_limbs[0]);
-            preimage_data.push_back(y_limbs[1]);
-            preimage_data.push_back(y_limbs[2]);
-            preimage_data.push_back(y_limbs[3]);
+            write(preimage_data, selector.y);
+            write(preimage_data, selector.x);
         }
 
+        write(preimage_data, key->domain.root);
+
         barretenberg::fr compressed_key;
         if constexpr (Composer::type == ComposerType::PLOOKUP) {
-            compressed_key = crypto::pedersen_commitment::lookup::compress_native(preimage_data, hash_index);
+            compressed_key =
+                from_buffer<barretenberg::fr>(crypto::pedersen_commitment::lookup::compress_native(preimage_data));
         } else {
-            compressed_key = crypto::pedersen_commitment::compress_native(preimage_data, hash_index);
+            compressed_key = crypto::pedersen_commitment::compress_native(preimage_data);
         }
         return compressed_key;
     }
diff --git a/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.hpp b/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.hpp
index aa185e34d7..009e732b20 100644
--- a/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.hpp
+++ b/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.hpp
@@ -37,7 +37,6 @@ void populate_kate_element_map(typename Curve::Composer* ctx,
                                typename Curve::fr_ct& batch_opening_scalar)
 {
     using fr_ct = typename Curve::fr_ct;
-    using g1_ct = typename Curve::g1_ct;
     const auto& polynomial_manifest = key->polynomial_manifest;
     for (size_t i = 0; i < key->polynomial_manifest.size(); ++i) {
         const auto& item = polynomial_manifest[i];
@@ -45,14 +44,14 @@ void populate_kate_element_map(typename Curve::Composer* ctx,
         const std::string poly_label(item.polynomial_label);
         switch (item.source) {
         case PolynomialSource::WITNESS: {
-            const auto element = transcript.get_group_element(label);
-            ASSERT(element.on_curve());
-            if (element.is_point_at_infinity()) {
+            // get_circuit_group_element validates that the point produced lies on the curve
+            const auto element = transcript.get_circuit_group_element(label);
+            ASSERT(element.get_value().on_curve());
+            if (element.get_value().is_point_at_infinity()) {
                 std::cerr << label << " witness is point at infinity! Error!" << std::endl;
                 ctx->failure("witness " + label + " is point at infinity");
             }
-            // g1_ct::from_witness validates that the point produced lies on the curve
-            kate_g1_elements.insert({ label, g1_ct::from_witness(ctx, element) });
+            kate_g1_elements.insert({ label, element });
             break;
         }
         case PolynomialSource::SELECTOR:
@@ -89,15 +88,15 @@ void populate_kate_element_map(typename Curve::Composer* ctx,
     fr_ct z_power = 1;
     for (size_t i = 0; i < program_settings::program_width; ++i) {
         std::string quotient_label = "T_" + std::to_string(i + 1);
-        const auto element = transcript.get_group_element(quotient_label);
+        const auto element = transcript.get_circuit_group_element(quotient_label);
 
-        kate_g1_elements.insert({ quotient_label, g1_ct::from_witness(ctx, element) });
+        kate_g1_elements.insert({ quotient_label, element });
         kate_fr_elements_at_zeta_large.insert({ quotient_label, quotient_nu * z_power });
         z_power *= key->z_pow_n;
     }
 
-    const auto PI_Z = transcript.get_group_element("PI_Z");
-    const auto PI_Z_OMEGA = transcript.get_group_element("PI_Z_OMEGA");
+    const auto PI_Z = transcript.get_circuit_group_element("PI_Z");
+    const auto PI_Z_OMEGA = transcript.get_circuit_group_element("PI_Z_OMEGA");
 
     fr_ct u = transcript.get_challenge_field_element("separator", 0);
 
@@ -105,10 +104,10 @@ void populate_kate_element_map(typename Curve::Composer* ctx,
         proof_system::plonk::compute_kate_batch_evaluation<fr_ct, Transcript, program_settings>(key, transcript);
     batch_opening_scalar = -batch_evaluation;
 
-    kate_g1_elements.insert({ "PI_Z_OMEGA", g1_ct::from_witness(ctx, PI_Z_OMEGA) });
+    kate_g1_elements.insert({ "PI_Z_OMEGA", PI_Z_OMEGA });
     kate_fr_elements_at_zeta_large.insert({ "PI_Z_OMEGA", zeta * key->domain.root * u });
 
-    kate_g1_elements.insert({ "PI_Z", g1_ct::from_witness(ctx, PI_Z) });
+    kate_g1_elements.insert({ "PI_Z", PI_Z });
     kate_fr_elements_at_zeta.insert({ "PI_Z", zeta });
 }
 
@@ -287,18 +286,6 @@ aggregation_state<Curve> verify_proof(typename Curve::Composer* context,
 
     for (const auto& [label, fr_value] : kate_fr_elements_at_zeta_omega) {
         const auto& g1_value = kate_g1_elements[label];
-        // if (fr_value.get_value() == 0 && fr_value.witness_index != IS_CONSTANT   )
-        // {
-        //     std::cerr << "bad scalar zero at " << label << std::endl;
-        // }
-        // if (fr_value.get_value() == 0 && fr_value.witness_index == IS_CONSTANT) {
-        //     std::cerr << "scalar zero at " << label << std::endl;
-        //     continue;
-        // }
-
-        // if (fr_value.get_value() == 0 && fr_value.witness_index == IS_CONSTANT) {
-        //     continue;
-        // }
         double_opening_scalars.emplace_back(fr_value);
         double_opening_elements.emplace_back(g1_value);
     }
@@ -320,8 +307,7 @@ aggregation_state<Curve> verify_proof(typename Curve::Composer* context,
         opening_elements.push_back(previous_output.P0);
         opening_scalars.push_back(random_separator);
 
-        rhs_elements.push_back(
-            (-(previous_output.P1)).reduce()); // TODO: use .normalize() instead? (As per defi bridge project)
+        rhs_elements.push_back((-(previous_output.P1)));
         rhs_scalars.push_back(random_separator);
     }
 
@@ -344,6 +330,10 @@ aggregation_state<Curve> verify_proof(typename Curve::Composer* context,
                 const fr_ct l1 = public_inputs[idx1];
                 const fr_ct l2 = public_inputs[idx2];
                 const fr_ct l3 = public_inputs[idx3];
+                l0.create_range_constraint(fq_ct::NUM_LIMB_BITS, "l0");
+                l1.create_range_constraint(fq_ct::NUM_LIMB_BITS, "l1");
+                l2.create_range_constraint(fq_ct::NUM_LIMB_BITS, "l2");
+                l3.create_range_constraint(fq_ct::NUM_LAST_LIMB_BITS, "l3");
                 return fq_ct(l0, l1, l2, l3, false);
             };
 
@@ -369,7 +359,7 @@ aggregation_state<Curve> verify_proof(typename Curve::Composer* context,
         opening_elements.push_back(g1_ct(x0, y0));
         opening_scalars.push_back(recursion_separator_challenge);
 
-        rhs_elements.push_back((-g1_ct(x1, y1)).normalize());
+        rhs_elements.push_back((-g1_ct(x1, y1)));
         rhs_scalars.push_back(recursion_separator_challenge);
     }
 
@@ -380,13 +370,13 @@ aggregation_state<Curve> verify_proof(typename Curve::Composer* context,
     for (const auto& to_add : elements_to_add) {
         opening_result = opening_result + to_add;
     }
-    opening_result = opening_result.normalize();
 
     g1_ct rhs = g1_ct::template wnaf_batch_mul<128>(rhs_elements, rhs_scalars);
-    rhs = rhs + PI_Z;
-    rhs = (-rhs).normalize();
 
-    std::vector<uint32_t> proof_witness_indices{
+    rhs = (-rhs) - PI_Z;
+
+    // TODO(zac): remove this once a3-packages has migrated to calling `assign_object_to_proof_outputs`
+    std::vector<uint32_t> proof_witness_indices = {
         opening_result.x.binary_basis_limbs[0].element.normalize().witness_index,
         opening_result.x.binary_basis_limbs[1].element.normalize().witness_index,
         opening_result.x.binary_basis_limbs[2].element.normalize().witness_index,
@@ -404,10 +394,10 @@ aggregation_state<Curve> verify_proof(typename Curve::Composer* context,
         rhs.y.binary_basis_limbs[2].element.normalize().witness_index,
         rhs.y.binary_basis_limbs[3].element.normalize().witness_index,
     };
-
-    return aggregation_state<Curve>{
-        opening_result, rhs, transcript.get_field_element_vector("public_inputs"), proof_witness_indices, true,
+    auto result = aggregation_state<Curve>{
+        opening_result, rhs, transcript.get_field_element_vector("public_inputs"), proof_witness_indices, true
     };
+    return result;
 }
 
 } // namespace recursion
diff --git a/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.test.cpp b/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.test.cpp
index 82e8042b02..db3dfc2cc1 100644
--- a/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.test.cpp
+++ b/cpp/src/barretenberg/stdlib/recursion/verifier/verifier.test.cpp
@@ -212,6 +212,8 @@ template <typename OuterComposer> class stdlib_verifier : public testing::Test {
             stdlib::recursion::verify_proof<outer_curve, RecursiveSettings>(
                 &outer_composer, verification_key_b, recursive_manifest, recursive_proof_b, previous_output);
 
+        verification_key_b->compress();
+        verification_key->compress();
         return { output, verification_key };
     }
 
@@ -301,7 +303,7 @@ template <typename OuterComposer> class stdlib_verifier : public testing::Test {
 
         EXPECT_EQ(inner_proof_result, barretenberg::fq12::one());
 
-        circuit_output.aggregation_state.add_proof_outputs_as_public_inputs();
+        circuit_output.aggregation_state.assign_object_to_proof_outputs();
 
         EXPECT_EQ(outer_composer.failed(), false);
 
@@ -346,7 +348,7 @@ template <typename OuterComposer> class stdlib_verifier : public testing::Test {
 
         EXPECT_EQ(inner_proof_result, barretenberg::fq12::one());
 
-        circuit_output.aggregation_state.add_proof_outputs_as_public_inputs();
+        circuit_output.aggregation_state.assign_object_to_proof_outputs();
 
         EXPECT_EQ(outer_composer.failed(), false);
 
@@ -379,6 +381,9 @@ template <typename OuterComposer> class stdlib_verifier : public testing::Test {
         InnerComposer inner_composer_a = InnerComposer("../srs_db/ignition");
         InnerComposer inner_composer_b = InnerComposer("../srs_db/ignition");
 
+        OuterComposer mid_composer_a = OuterComposer("../srs_db/ignition");
+        OuterComposer mid_composer_b = OuterComposer("../srs_db/ignition");
+
         OuterComposer outer_composer = OuterComposer("../srs_db/ignition");
 
         std::vector<barretenberg::fr> inner_inputs{ barretenberg::fr::random_element(),
@@ -388,7 +393,27 @@ template <typename OuterComposer> class stdlib_verifier : public testing::Test {
         create_inner_circuit(inner_composer_a, inner_inputs);
         create_inner_circuit(inner_composer_b, inner_inputs);
 
-        auto circuit_output = create_double_outer_circuit(inner_composer_a, inner_composer_b, outer_composer);
+        auto circuit_output_a = create_outer_circuit(inner_composer_a, mid_composer_a);
+
+        uint256_t a0 = circuit_output_a.aggregation_state.P0.x.binary_basis_limbs[1].element.get_value();
+        uint256_t a1 = circuit_output_a.aggregation_state.P0.y.binary_basis_limbs[1].element.get_value();
+        uint256_t a2 = circuit_output_a.aggregation_state.P1.x.binary_basis_limbs[1].element.get_value();
+        uint256_t a3 = circuit_output_a.aggregation_state.P1.y.binary_basis_limbs[1].element.get_value();
+
+        ASSERT(a0.get_msb() <= 68);
+        ASSERT(a1.get_msb() <= 68);
+        ASSERT(a2.get_msb() <= 68);
+        ASSERT(a3.get_msb() <= 68);
+
+        circuit_output_a.aggregation_state.assign_object_to_proof_outputs();
+
+        auto circuit_output_b = create_outer_circuit(inner_composer_b, mid_composer_b);
+
+        circuit_output_b.aggregation_state.assign_object_to_proof_outputs();
+
+        auto circuit_output = create_double_outer_circuit(mid_composer_a, mid_composer_b, outer_composer);
+
+        circuit_output.aggregation_state.assign_object_to_proof_outputs();
 
         g1::affine_element P[2];
         P[0].x = barretenberg::fq(circuit_output.aggregation_state.P0.x.get_value().lo);
@@ -398,8 +423,8 @@ template <typename OuterComposer> class stdlib_verifier : public testing::Test {
         barretenberg::fq12 inner_proof_result = barretenberg::pairing::reduced_ate_pairing_batch_precomputed(
             P, circuit_output.verification_key->reference_string->get_precomputed_g2_lines(), 2);
 
-        EXPECT_EQ(circuit_output.aggregation_state.public_inputs[0].get_value(), inner_inputs[0]);
-        EXPECT_EQ(circuit_output.aggregation_state.public_inputs[1].get_value(), inner_inputs[1]);
+        EXPECT_EQ(circuit_output_a.aggregation_state.public_inputs[0].get_value(), inner_inputs[0]);
+        EXPECT_EQ(circuit_output_a.aggregation_state.public_inputs[1].get_value(), inner_inputs[1]);
 
         EXPECT_EQ(inner_proof_result, barretenberg::fq12::one());
 
@@ -456,7 +481,6 @@ template <typename OuterComposer> class stdlib_verifier : public testing::Test {
         EXPECT_EQ(inner_proof_result, barretenberg::fq12::one());
 
         printf("composer gates = %zu\n", outer_composer.get_num_gates());
-
         auto prover = outer_composer.create_prover();
 
         auto verifier = outer_composer.create_verifier();
@@ -638,14 +662,23 @@ HEAVY_TYPED_TEST(stdlib_verifier, recursive_proof_composition)
 
 HEAVY_TYPED_TEST(stdlib_verifier, recursive_proof_composition_ultra_no_tables)
 {
-    TestFixture::test_recursive_proof_composition_ultra_no_tables();
+    if constexpr (TypeParam::type == ComposerType::PLOOKUP) {
+        TestFixture::test_recursive_proof_composition_ultra_no_tables();
+    } else {
+        // no point running this if we're not in UltraPlonk
+        GTEST_SKIP();
+    }
 };
 
-// CircleCI can't cope with this.
-// HEAVY_TYPED_TEST(stdlib_verifier, double_verification)
-// {
-//     TestFixture::test_double_verification();
-// };
+HEAVY_TYPED_TEST(stdlib_verifier, double_verification)
+{
+    if constexpr (TypeParam::type == ComposerType::PLOOKUP) {
+        TestFixture::test_double_verification();
+    } else {
+        // CircleCI can't cope with non-ultraplonk version.
+        GTEST_SKIP();
+    }
+};
 
 HEAVY_TYPED_TEST(stdlib_verifier, recursive_proof_composition_with_variable_verification_key_a)
 {
diff --git a/cpp/src/barretenberg/stdlib/recursion/verifier/verifier_turbo.test.cpp b/cpp/src/barretenberg/stdlib/recursion/verifier/verifier_turbo.test.cpp
index 1f4410148b..36c128f3ea 100644
--- a/cpp/src/barretenberg/stdlib/recursion/verifier/verifier_turbo.test.cpp
+++ b/cpp/src/barretenberg/stdlib/recursion/verifier/verifier_turbo.test.cpp
@@ -188,7 +188,7 @@ template <typename OuterComposer> class stdlib_verifier_turbo : public testing::
 
         EXPECT_EQ(inner_proof_result, barretenberg::fq12::one());
 
-        circuit_output.aggregation_state.add_proof_outputs_as_public_inputs();
+        circuit_output.aggregation_state.assign_object_to_proof_outputs();
 
         EXPECT_EQ(outer_composer.failed(), false);
         std::cout << "creating prover" << std::endl;
diff --git a/cpp/src/barretenberg/transcript/transcript.cpp b/cpp/src/barretenberg/transcript/transcript.cpp
index 9c448949c9..8f597c454b 100644
--- a/cpp/src/barretenberg/transcript/transcript.cpp
+++ b/cpp/src/barretenberg/transcript/transcript.cpp
@@ -46,10 +46,12 @@ std::array<uint8_t, Keccak256Hasher::PRNG_OUTPUT_SIZE> Keccak256Hasher::hash(std
 
 std::array<uint8_t, Blake3sHasher::PRNG_OUTPUT_SIZE> Blake3sHasher::hash(std::vector<uint8_t> const& buffer)
 {
-    std::vector<uint8_t> hash_result = blake3::blake3s(buffer);
+    grumpkin::fq input = grumpkin::fq::serialize_from_buffer(&buffer[0]);
+    grumpkin::fq compressed = crypto::pedersen_commitment::compress_native({ input });
+    std::vector<uint8_t> res = to_buffer(compressed);
     std::array<uint8_t, PRNG_OUTPUT_SIZE> result;
     for (size_t i = 0; i < PRNG_OUTPUT_SIZE; ++i) {
-        result[i] = hash_result[i];
+        result[i] = res[i];
     }
     return result;
 }
@@ -59,10 +61,12 @@ std::array<uint8_t, Blake3sHasher::PRNG_OUTPUT_SIZE> Blake3sHasher::hash_plookup
     // TODO(@zac-williamson) Change to call a Poseidon hash and create a PoseidonHasher
     // (not making the name change right now as it will break concurrent work w. getting recursion working in Noir)
     // We also need to implement a Poseidon gadget
-    std::vector<uint8_t> compressed_buffer = crypto::pedersen_commitment::lookup::compress_native(buffer);
+    grumpkin::fq input = grumpkin::fq::serialize_from_buffer(&buffer[0]);
+    grumpkin::fq compressed = crypto::pedersen_commitment::lookup::compress_native({ input });
+    std::vector<uint8_t> res = to_buffer(compressed);
     std::array<uint8_t, PRNG_OUTPUT_SIZE> result;
     for (size_t i = 0; i < PRNG_OUTPUT_SIZE; ++i) {
-        result[i] = compressed_buffer[i];
+        result[i] = res[i];
     }
     return result;
 }
@@ -259,7 +263,11 @@ void Transcript::apply_fiat_shamir(const std::string& challenge_name /*, const b
     }
 
     std::vector<uint8_t> rolling_buffer(base_hash.begin(), base_hash.end());
-    rolling_buffer.push_back(0);
+    if (hasher == HashType::Keccak256) {
+        rolling_buffer.push_back(0);
+    } else {
+        rolling_buffer[31] = (0);
+    }
 
     // Compute how many hashes we need so that we have enough distinct chunks of 'random' bytes to distribute
     // across the num_challenges.
diff --git a/sol/src/ultra/keys/RecursiveUltraVerificationKey.sol b/sol/src/ultra/keys/RecursiveUltraVerificationKey.sol
index 4b118cdef6..748d20fcb9 100644
--- a/sol/src/ultra/keys/RecursiveUltraVerificationKey.sol
+++ b/sol/src/ultra/keys/RecursiveUltraVerificationKey.sol
@@ -1,72 +1,72 @@
-// Verification Key Hash: 507de35addf16b79526d713259492d5d1764fdb6ce55ff4ccb03c147b72f381a
+// Verification Key Hash: b665bc769f274feb94ea7f9997fa684b414aa8b9b9bac0227c7ce2e1cbd3d115
 // SPDX-License-Identifier: Apache-2.0
 // Copyright 2022 Aztec
 pragma solidity >=0.8.4;
 
 library RecursiveUltraVerificationKey {
     function verificationKeyHash() internal pure returns (bytes32) {
-        return 0x507de35addf16b79526d713259492d5d1764fdb6ce55ff4ccb03c147b72f381a;
+        return 0xb665bc769f274feb94ea7f9997fa684b414aa8b9b9bac0227c7ce2e1cbd3d115;
     }
 
     function loadVerificationKey(uint256 _vk, uint256 _omegaInverseLoc) internal pure {
         assembly {
-            mstore(add(_vk, 0x00), 0x0000000000000000000000000000000000000000000000000000000000080000) // vk.circuit_size
+            mstore(add(_vk, 0x00), 0x0000000000000000000000000000000000000000000000000000000000040000) // vk.circuit_size
             mstore(add(_vk, 0x20), 0x0000000000000000000000000000000000000000000000000000000000000010) // vk.num_inputs
-            mstore(add(_vk, 0x40), 0x2260e724844bca5251829353968e4915305258418357473a5c1d597f613f6cbd) // vk.work_root
-            mstore(add(_vk, 0x60), 0x3064486657634403844b0eac78ca882cfd284341fcb0615a15cfcd17b14d8201) // vk.domain_inverse
-            mstore(add(_vk, 0x80), 0x18fe72968b540c1dad6c7648fcb3407edfc489d8dcf3fdce314c1f0e72684c43) // vk.Q1.x
-            mstore(add(_vk, 0xa0), 0x16f49263ee016852edfed2e84bf44c22b31064b9034b62059329b2af2f349c37) // vk.Q1.y
-            mstore(add(_vk, 0xc0), 0x1c382676d0f8e5691def3a60d533850f573c36aa200ab364c091acc4a7eb094f) // vk.Q2.x
-            mstore(add(_vk, 0xe0), 0x17c05ca7ea679681a3cf772fabcf2c1a988e39910f1ba8de3d1f68ffb0effda1) // vk.Q2.y
-            mstore(add(_vk, 0x100), 0x257d75dead2d8cbb2f63b3592a762a2c2dbe0195a533736fd01982370e768676) // vk.Q3.x
-            mstore(add(_vk, 0x120), 0x258b6d74446f5e532bce6e1a62372a82986eac9801c13a8553f373c30398a47c) // vk.Q3.y
-            mstore(add(_vk, 0x140), 0x290ff6a808f6abe7508a8c884ea0fc2f819e23a5b6d7c2dd1105da2a3f0637e0) // vk.Q4.x
-            mstore(add(_vk, 0x160), 0x2e6c3c419be44ed56b61069a06e980360f58830ad52b38bb69de92c456ebf0ca) // vk.Q4.y
-            mstore(add(_vk, 0x180), 0x282e6e14bbedfc7ef013feb4877ce9098389abfd3ad8899c957be4fdb20d0454) // vk.Q_M.x
-            mstore(add(_vk, 0x1a0), 0x2483d06975c3965d3f2d205ddeff196b90ca5883878bffc0bd190a357fee947e) // vk.Q_M.y
-            mstore(add(_vk, 0x1c0), 0x09af8fed71838d47b0052d8e3fdda11f55c62a6f2cb9aab24edd90b5e9640e9c) // vk.Q_C.x
-            mstore(add(_vk, 0x1e0), 0x2bdf7549fa146188dd750d032d9dec911c5799ca99f72405c4ac49f3f9e3a51a) // vk.Q_C.y
-            mstore(add(_vk, 0x200), 0x1479a535c87c413301d82c5ae1598b46c03117a57b878416d1143bb48f1df8bf) // vk.Q_ARITHMETIC.x
-            mstore(add(_vk, 0x220), 0x03203e3c02cc68282d93507d0ad9d56304d5a4b2908233bcb6f8682f8b264532) // vk.Q_ARITHMETIC.y
-            mstore(add(_vk, 0x240), 0x0cccd1de3f4ef2a2bfffbb7a91f8be2c49e9dc9b565ba4312015a88558f40d20) // vk.QSORT.x
-            mstore(add(_vk, 0x260), 0x092c5bd4edb996d6c1189a2682f6e93ede4b9aff7f07823605c894f833316718) // vk.QSORT.y
-            mstore(add(_vk, 0x280), 0x20089848d81ee4e8d7700679e7b5ed017916e2ee28bf76c0e0f4862274637bb8) // vk.Q_ELLIPTIC.x
-            mstore(add(_vk, 0x2a0), 0x0faae100924d24a70708e49a08ba2ba9df261088bf04e7b4c3f811cc0d8995fe) // vk.Q_ELLIPTIC.y
-            mstore(add(_vk, 0x2c0), 0x2de71f46452329536fe14dfff808692c405b9ef1ae47c451be8383ded868af5c) // vk.Q_AUX.x
-            mstore(add(_vk, 0x2e0), 0x0a520e2f877f19cc69aad2396bf741e6864a9f0b657887e80165b794f7612e71) // vk.Q_AUX.y
-            mstore(add(_vk, 0x300), 0x2779b1b7b8433eeee7333a1372feb4587da74e2c93cc54917e201748ed847204) // vk.SIGMA1.x
-            mstore(add(_vk, 0x320), 0x2198823f66ad59612f6cb77aff9437388abdbcc4d8f6eac792d8bca7d1b341d9) // vk.SIGMA1.y
-            mstore(add(_vk, 0x340), 0x1f6732b9d128931b2e32b2cae73b029720cca3cef23fee25363d520ed0ba3f92) // vk.SIGMA2.x
-            mstore(add(_vk, 0x360), 0x15fb336844e68b08361c10b83e7d6ea0f011958774e58e5f7c43e6606e989ecc) // vk.SIGMA2.y
-            mstore(add(_vk, 0x380), 0x0984b1b6c723afb4713656abf30b06e2ad04c054dd3acf016a6db1ee7111ca11) // vk.SIGMA3.x
-            mstore(add(_vk, 0x3a0), 0x03421d01f19c6b91e477648819f57d888b3b23b67599266293bddf91a2636ff1) // vk.SIGMA3.y
-            mstore(add(_vk, 0x3c0), 0x2f77cda90d366b151b17c5667f10526ab0fe144aecb307e00ede6039365bcfa0) // vk.SIGMA4.x
-            mstore(add(_vk, 0x3e0), 0x0d1e8f758babcbbf134dfe341c262ee25d0254cba8f5487ad5bddd190f27a9e8) // vk.SIGMA4.y
-            mstore(add(_vk, 0x400), 0x2f61a890b9f1dff4ef5c8b0eafe9b71c7a23dc4c8a6791d9c01418310f4a7b2e) // vk.TABLE1.x
-            mstore(add(_vk, 0x420), 0x07c8a51d1881fcdfe1cb7dcefc48a44047c7f5386797d5f8553ce2e12e8daba0) // vk.TABLE1.y
-            mstore(add(_vk, 0x440), 0x1adf56913dea23b7b14c952933b0b40fc476dc2697a758ec9df73802b0596c2f) // vk.TABLE2.x
-            mstore(add(_vk, 0x460), 0x212a1759e19285a35a70a245cca6477f89b6f156e4425cf52cfccb4594f59152) // vk.TABLE2.y
-            mstore(add(_vk, 0x480), 0x1527f8c19085ac209ebddbccae4dd0ca58b078e56fd20d651ce3a3194697b191) // vk.TABLE3.x
-            mstore(add(_vk, 0x4a0), 0x02247dca9c3cb09318aa6100a2a7c628281c69bc41cfda34aa72c263b69344b4) // vk.TABLE3.y
-            mstore(add(_vk, 0x4c0), 0x12eea56d2ada3befa5db215ea5ebbd37b5ce95fcd1cf7adb94d5a1784876b4f7) // vk.TABLE4.x
-            mstore(add(_vk, 0x4e0), 0x190df1146fbdd5cc79e8817ebcd6311e35cf5cc38795cee26371a707d685e05a) // vk.TABLE4.y
-            mstore(add(_vk, 0x500), 0x019b3a1970f9f77b13538cd8071ea3ee7c556fd98009e2a04be044ead0a94623) // vk.TABLE_TYPE.x
-            mstore(add(_vk, 0x520), 0x159cbdae3e194fe45524a171befdcb98b55c8d495fc463c98ac690eee947119f) // vk.TABLE_TYPE.y
-            mstore(add(_vk, 0x540), 0x16b2f7fa29f578aae3d4c0b8220101570adfcc9e8aa8a148267208540de189f1) // vk.ID1.x
-            mstore(add(_vk, 0x560), 0x2344a211fbbacc281de980197e4f12155d90d55a67f4ad08398bac665f813953) // vk.ID1.y
-            mstore(add(_vk, 0x580), 0x1af709df675db1688b95927324e71c5e551436ba7cb32478570a9cfaebf90614) // vk.ID2.x
-            mstore(add(_vk, 0x5a0), 0x2b83e76f61aa5cd70218c38e693ae0a99e9a2f4a192af5c77dbd27fa605fdae4) // vk.ID2.y
-            mstore(add(_vk, 0x5c0), 0x038c89635a8b6ec9766d5f98d13c16f8c312088f830610de72c00edf8c3b7800) // vk.ID3.x
-            mstore(add(_vk, 0x5e0), 0x1863d9217ba6c6764fa02298efe25fabfbe454a27431b970a6afff5d1986fadb) // vk.ID3.y
-            mstore(add(_vk, 0x600), 0x259a5dd47d44d6240407c26718201a122fb4b6b38d838f6e24d1c75515016761) // vk.ID4.x
-            mstore(add(_vk, 0x620), 0x14db344b735ffe084107e5cea07b00e4c41a82f0073f76e0536cd7118d78866f) // vk.ID4.y
+            mstore(add(_vk, 0x40), 0x19ddbcaf3a8d46c15c0176fbb5b95e4dc57088ff13f4d1bd84c6bfa57dcdc0e0) // vk.work_root
+            mstore(add(_vk, 0x60), 0x30644259cd94e7dd5045d7a27013b7fcd21c9e3b7fa75222e7bda49b729b0401) // vk.domain_inverse
+            mstore(add(_vk, 0x80), 0x16f7fc6133c8fb2dab06c57392df697a53357ecd918d749d1c981dcd0ee6d849) // vk.Q1.x
+            mstore(add(_vk, 0xa0), 0x2ba047103f9f86b84058d718a082e2faa53e50109e7cb880d2cbb7a1bf98da89) // vk.Q1.y
+            mstore(add(_vk, 0xc0), 0x1b9d146737dbb7759e0cad93ad4a7669880a062aceb7b46b8485327976d7285c) // vk.Q2.x
+            mstore(add(_vk, 0xe0), 0x11de7c3d638acc90e7f844c08658d0588da864268e00576d26aaca3cf49af350) // vk.Q2.y
+            mstore(add(_vk, 0x100), 0x1466840d8ad2dfde3a55d4c98412a05807bbe8aac33c27ba100c1e621fbebba0) // vk.Q3.x
+            mstore(add(_vk, 0x120), 0x2198ce44955b8ac6e21ddcbb66acd9df7596ad9e5fcf22f2227e8bbb51fe44ee) // vk.Q3.y
+            mstore(add(_vk, 0x140), 0x18b96a49db3644e2986f811b8c104e8eb88aa5eb9aec0ca109322a64885688bd) // vk.Q4.x
+            mstore(add(_vk, 0x160), 0x2ffec963826849cabd279a2b9f9a26f81518eb65d882f47a32470fc52f53def0) // vk.Q4.y
+            mstore(add(_vk, 0x180), 0x09dd725897471fddc177b241d7abc402705acfa452707388fa62666ad454598c) // vk.Q_M.x
+            mstore(add(_vk, 0x1a0), 0x03a46eb7ed69136e109e2761fb707da7cee18b3d05e581f24d77853b3b03581e) // vk.Q_M.y
+            mstore(add(_vk, 0x1c0), 0x304db51670cb2c59e3088431803e82bce8c81b38eefa267871ae2103ca7842ca) // vk.Q_C.x
+            mstore(add(_vk, 0x1e0), 0x1d7ec7d8d4a74e337de26b7adaecb8beb03d8cd647aa180bc08de840038710d5) // vk.Q_C.y
+            mstore(add(_vk, 0x200), 0x1db65122bf0f0a58fe07bd7342d3e26b07923041cb7d2158d13fb7b5328da40e) // vk.Q_ARITHMETIC.x
+            mstore(add(_vk, 0x220), 0x1691db1eeedbcb4f7646959cf363c00b7e26812a225edf5a6972d815270770f5) // vk.Q_ARITHMETIC.y
+            mstore(add(_vk, 0x240), 0x2a63b6a306e30d87f4b8597cbd1dcecff5fc7cacb774247fca6531e3d347ada4) // vk.QSORT.x
+            mstore(add(_vk, 0x260), 0x2849d2901fcd1f048924fb77e9451ad45d80f9f842418146b1fde0a7c752fc5f) // vk.QSORT.y
+            mstore(add(_vk, 0x280), 0x0e42866979ddac27ac729352dd0f844da4fb5a1c3e2480b5b940acd12304c700) // vk.Q_ELLIPTIC.x
+            mstore(add(_vk, 0x2a0), 0x017ac9a40547e866bdb914dc2b73661c0ec8aa67956c8c9bf406795f75e15c53) // vk.Q_ELLIPTIC.y
+            mstore(add(_vk, 0x2c0), 0x1ad08199bf79952adff0aa3a9c04a26f18ad7deed1fbed0548f2c83ddf913ef9) // vk.Q_AUX.x
+            mstore(add(_vk, 0x2e0), 0x151df9277b110c615c058f7f783105d03cab938f23884afed1897d0049715d21) // vk.Q_AUX.y
+            mstore(add(_vk, 0x300), 0x0bd26d62138b721fdc08fd7d52cd3dfaa37399eb416af0ec6237f9ec1a63a5c0) // vk.SIGMA1.x
+            mstore(add(_vk, 0x320), 0x103282cd2ef4210ac390d70a1cba58c6792a5d872ae0337615f8ac9997d300ef) // vk.SIGMA1.y
+            mstore(add(_vk, 0x340), 0x08abaa91c69ffa73d80d9a9562020c2a104771f07cf4099cbbe9a0071befb1cc) // vk.SIGMA2.x
+            mstore(add(_vk, 0x360), 0x1a82e5cd4a2c3de77afb2ca76c89b54991a4db3939a5c24806af01a0f69a2366) // vk.SIGMA2.y
+            mstore(add(_vk, 0x380), 0x26d50e2d19c429d1a2987d5249b88e388f93339fc05f52939fa2e1f4be653918) // vk.SIGMA3.x
+            mstore(add(_vk, 0x3a0), 0x0a49cd57e79633ea43cc3172e819327ce260682d8b571d0964678a153c17e959) // vk.SIGMA3.y
+            mstore(add(_vk, 0x3c0), 0x1c82f3e7c57b08ef90fda6fe39427b815a835c8559b64eac0a4b213998f6802c) // vk.SIGMA4.x
+            mstore(add(_vk, 0x3e0), 0x098bad014a270b6f5e4c90cbd299c15c5fd190457f0e78a5f849243e86688868) // vk.SIGMA4.y
+            mstore(add(_vk, 0x400), 0x215a055ec0bf7d7ab5e005b4260258aaadfd8ae9005a09060fdd0cee02dc3fea) // vk.TABLE1.x
+            mstore(add(_vk, 0x420), 0x1841eba177a34b1eb908727fe2e54bf33fc82b6e58dfd044acd4ba05ca80c837) // vk.TABLE1.y
+            mstore(add(_vk, 0x440), 0x018eb037682044ebf9cad76f777bf379b94c4d31d4351ce9677ff146a744555c) // vk.TABLE2.x
+            mstore(add(_vk, 0x460), 0x2bf87d72f0aef257c728503c900516f9274ab06eb54804651218438e40f06c25) // vk.TABLE2.y
+            mstore(add(_vk, 0x480), 0x13b003b384fb50e00994bf62a0057f44344be47383d59a7e9f1319d710ab5263) // vk.TABLE3.x
+            mstore(add(_vk, 0x4a0), 0x1a5f338a3d05fb46ea46855e6c36dbdb23c5f20a56acc795324fe2958189ec39) // vk.TABLE3.y
+            mstore(add(_vk, 0x4c0), 0x1365fd683dbad2c4c55b02dd33c4b96fde00e5bb3f52be20ead95484e130aee1) // vk.TABLE4.x
+            mstore(add(_vk, 0x4e0), 0x2da2ba1d27548e452cc863758acf156eb268f577b7d08ba58e7bbf2d28f6f23c) // vk.TABLE4.y
+            mstore(add(_vk, 0x500), 0x0ef908712f03ce2e4db3ef557abbde7c584d8c831165ba40ab43124526c53cc1) // vk.TABLE_TYPE.x
+            mstore(add(_vk, 0x520), 0x009dd642bc5eb1869048b59d2052645208cc5a14537814568d9c985c93319e55) // vk.TABLE_TYPE.y
+            mstore(add(_vk, 0x540), 0x0f973c9af1150675ae6dac1ea8ea366e5b8db13bb9c2237ab11c40dfb644ebf5) // vk.ID1.x
+            mstore(add(_vk, 0x560), 0x06b0c966f9edab490ac15a176d35d56996cc66854268197989a53ab0d1368188) // vk.ID1.y
+            mstore(add(_vk, 0x580), 0x09e719130bb46416efa070d08d82cc07fe0ed3bd8685616b92b4b9619e0807b2) // vk.ID2.x
+            mstore(add(_vk, 0x5a0), 0x18f35ee01438dda2443da27299404d09ccfff098a0ceac2e9a10bf2a96bc11ac) // vk.ID2.y
+            mstore(add(_vk, 0x5c0), 0x0cb835c737d324b9ff5bba45988dc4921104803b7e37649f8c628f0de26361ac) // vk.ID3.x
+            mstore(add(_vk, 0x5e0), 0x18ca0ac87859387aa32c6939f7a4a0d322879a3fdb1ef85d06addcddc13acea5) // vk.ID3.y
+            mstore(add(_vk, 0x600), 0x0047304b09efd9315a96d9e802c9a50c1964076026e5f17aff825d6cfc38d823) // vk.ID4.x
+            mstore(add(_vk, 0x620), 0x21c9f3aa4cbe8ee21422052f7c22d3d8a5a9a89c262a5a5cb52d8802f6106c49) // vk.ID4.y
             mstore(add(_vk, 0x640), 0x01) // vk.contains_recursive_proof
             mstore(add(_vk, 0x660), 0) // vk.recursive_proof_public_input_indices
             mstore(add(_vk, 0x680), 0x260e01b251f6f1c7e7ff4e580791dee8ea51d87a358e038b4efe30fac09383c1) // vk.g2_x.X.c1
             mstore(add(_vk, 0x6a0), 0x0118c4d5b837bcc2bc89b5b398b5974e9f5944073b32078b7e231fec938883b0) // vk.g2_x.X.c0
             mstore(add(_vk, 0x6c0), 0x04fc6369f7110fe3d25156c1bb9a72859cf2a04641f99ba4ee413c80da6a5fe4) // vk.g2_x.Y.c1
             mstore(add(_vk, 0x6e0), 0x22febda3c0c0632a56475b4214e5615e11e6dd3f96e6cea2854a87d4dacc5e55) // vk.g2_x.Y.c0
-            mstore(_omegaInverseLoc, 0x06e402c0a314fb67a15cf806664ae1b722dbc0efe66e6c81d98f9924ca535321) // vk.work_root_inverse
+            mstore(_omegaInverseLoc, 0x036853f083780e87f8d7c71d111119c57dbe118c22d5ad707a82317466c5174c) // vk.work_root_inverse
         }
     }
 }