diff --git a/cpp/src/aztec/plonk/composer/ultra_composer.cpp b/cpp/src/aztec/plonk/composer/ultra_composer.cpp
index 3114533ecc..40f3ee1bbe 100644
--- a/cpp/src/aztec/plonk/composer/ultra_composer.cpp
+++ b/cpp/src/aztec/plonk/composer/ultra_composer.cpp
@@ -1238,8 +1238,18 @@ std::vector<uint32_t> UltraComposer::decompose_into_default_range(const uint32_t
     return sublimb_indices;
 }
 
+/**
+ * @brief Constrain a variable to a range
+ *
+ * @details Checks if the range [0, target_range] already exists. If it doesn't, then creates a new range. Then tags
+ * variable as belonging to this set.
+ *
+ * @param variable_index
+ * @param target_range
+ */
 void UltraComposer::create_new_range_constraint(const uint32_t variable_index, const uint64_t target_range)
 {
+    ASSERT(target_range != 0);
     if (range_lists.count(target_range) == 0) {
         range_lists.insert({ target_range, create_range_list(target_range) });
     }
@@ -1677,24 +1687,50 @@ void UltraComposer::apply_aux_selectors(const AUX_SELECTORS type)
  * Applies range constraints to two 70-bit limbs, splititng each into 5 14-bit sublimbs.
  * We can efficiently chain together two 70-bit limb checks in 3 gates, using auxiliary gates
  **/
-void UltraComposer::range_constrain_two_limbs(const uint32_t lo_idx, const uint32_t hi_idx)
+void UltraComposer::range_constrain_two_limbs(const uint32_t lo_idx,
+                                              const uint32_t hi_idx,
+                                              const size_t lo_limb_bits,
+                                              const size_t hi_limb_bits)
 {
-    constexpr uint64_t SUBLIMB_SIZE = 1ULL << 14;
+    // Validate limbs are <= 70 bits. If limbs are larger we require more witnesses and cannot use our limb accumulation
+    // custom gate
+    ASSERT(lo_limb_bits <= (14 * 5));
+    ASSERT(hi_limb_bits <= (14 * 5));
 
-    const auto get_sublimbs = [&](const uint32_t& limb_idx) {
+    // Sometimes we try to use limbs that are too large. It's easier to catch this issue here
+    const auto get_sublimbs = [&](const uint32_t& limb_idx, const std::array<uint64_t, 5>& sublimb_masks) {
         const uint256_t limb = get_variable(limb_idx);
-        constexpr uint256_t SUBLIMB_MASK = (uint256_t(1) << 14) - 1;
+        // we can use constant 2^14 - 1 mask here. If the sublimb value exceeds the expected value then witness will
+        // fail the range check below
+        // We also use zero_idx to substitute variables that should be zero
+        constexpr uint256_t MAX_SUBLIMB_MASK = (uint256_t(1) << 14) - 1;
         std::array<uint32_t, 5> sublimb_indices;
-        sublimb_indices[0] = add_variable(limb & SUBLIMB_MASK);
-        sublimb_indices[1] = add_variable((limb >> 14) & SUBLIMB_MASK);
-        sublimb_indices[2] = add_variable((limb >> 28) & SUBLIMB_MASK);
-        sublimb_indices[3] = add_variable((limb >> 42) & SUBLIMB_MASK);
-        sublimb_indices[4] = add_variable((limb >> 56) & SUBLIMB_MASK);
+        sublimb_indices[0] = sublimb_masks[0] != 0 ? add_variable(limb & MAX_SUBLIMB_MASK) : zero_idx;
+        sublimb_indices[1] = sublimb_masks[1] != 0 ? add_variable((limb >> 14) & MAX_SUBLIMB_MASK) : zero_idx;
+        sublimb_indices[2] = sublimb_masks[2] != 0 ? add_variable((limb >> 28) & MAX_SUBLIMB_MASK) : zero_idx;
+        sublimb_indices[3] = sublimb_masks[3] != 0 ? add_variable((limb >> 42) & MAX_SUBLIMB_MASK) : zero_idx;
+        sublimb_indices[4] = sublimb_masks[4] != 0 ? add_variable((limb >> 56) & MAX_SUBLIMB_MASK) : zero_idx;
         return sublimb_indices;
     };
 
-    const std::array<uint32_t, 5> lo_sublimbs = get_sublimbs(lo_idx);
-    const std::array<uint32_t, 5> hi_sublimbs = get_sublimbs(hi_idx);
+    const auto get_limb_masks = [](size_t limb_bits) {
+        std::array<uint64_t, 5> sublimb_masks;
+        sublimb_masks[0] = limb_bits >= 14 ? 14 : limb_bits;
+        sublimb_masks[1] = limb_bits >= 28 ? 14 : (limb_bits > 14 ? limb_bits - 14 : 0);
+        sublimb_masks[2] = limb_bits >= 42 ? 14 : (limb_bits > 28 ? limb_bits - 28 : 0);
+        sublimb_masks[3] = limb_bits >= 56 ? 14 : (limb_bits > 42 ? limb_bits - 42 : 0);
+        sublimb_masks[4] = (limb_bits > 56 ? limb_bits - 56 : 0);
+
+        for (auto& mask : sublimb_masks) {
+            mask = (1ULL << mask) - 1ULL;
+        }
+        return sublimb_masks;
+    };
+
+    const auto lo_masks = get_limb_masks(lo_limb_bits);
+    const auto hi_masks = get_limb_masks(hi_limb_bits);
+    const std::array<uint32_t, 5> lo_sublimbs = get_sublimbs(lo_idx, lo_masks);
+    const std::array<uint32_t, 5> hi_sublimbs = get_sublimbs(hi_idx, hi_masks);
 
     w_l.emplace_back(lo_sublimbs[0]);
     w_r.emplace_back(lo_sublimbs[1]);
@@ -1716,31 +1752,43 @@ void UltraComposer::range_constrain_two_limbs(const uint32_t lo_idx, const uint3
     apply_aux_selectors(AUX_SELECTORS::NONE);
     n += 3;
 
-    create_new_range_constraint(lo_sublimbs[0], SUBLIMB_SIZE - 1);
-    create_new_range_constraint(lo_sublimbs[1], SUBLIMB_SIZE - 1);
-    create_new_range_constraint(lo_sublimbs[2], SUBLIMB_SIZE - 1);
-    create_new_range_constraint(lo_sublimbs[3], SUBLIMB_SIZE - 1);
-    create_new_range_constraint(lo_sublimbs[4], SUBLIMB_SIZE - 1);
-
-    create_new_range_constraint(hi_sublimbs[0], SUBLIMB_SIZE - 1);
-    create_new_range_constraint(hi_sublimbs[1], SUBLIMB_SIZE - 1);
-    create_new_range_constraint(hi_sublimbs[2], SUBLIMB_SIZE - 1);
-    create_new_range_constraint(hi_sublimbs[3], SUBLIMB_SIZE - 1);
-    create_new_range_constraint(hi_sublimbs[4], SUBLIMB_SIZE - 1);
+    for (size_t i = 0; i < 5; i++) {
+        if (lo_masks[i] != 0) {
+            create_new_range_constraint(lo_sublimbs[i], lo_masks[i]);
+        }
+        if (hi_masks[i] != 0) {
+            create_new_range_constraint(hi_sublimbs[i], hi_masks[i]);
+        }
+    }
 };
 
-std::array<uint32_t, 2> UltraComposer::decompose_non_native_field_double_width_limb(const uint32_t limb_idx)
+/**
+ * @brief Decompose a single witness into two, where the lowest is DEFAULT_NON_NATIVE_FIELD_LIMB_BITS (68) range
+ * constrained and the highest is num_limb_bits - DEFAULT_NON_NATIVE_FIELD_LIMB_BITS range constrained.
+ *
+ * @details Doesn't create gates constraining the limbs to each other.
+ *
+ * @param limb_idx The index of the limb that will be decomposed
+ * @param num_limb_bits The range we want to constrain the original limb to
+ * @return std::array<uint32_t, 2> The indices of new limbs.
+ */
+std::array<uint32_t, 2> UltraComposer::decompose_non_native_field_double_width_limb(const uint32_t limb_idx,
+                                                                                    const size_t num_limb_bits)
 {
-    constexpr barretenberg::fr LIMB_MASK = (uint256_t(1) << 68) - 1;
+    ASSERT(uint256_t(get_variable_reference(limb_idx)) < (uint256_t(1) << num_limb_bits));
+    constexpr barretenberg::fr LIMB_MASK = (uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS) - 1;
     const uint256_t value = get_variable(limb_idx);
     const uint256_t low = value & LIMB_MASK;
-    const uint256_t hi = value >> 68;
-    ASSERT(low + (hi << 68) == value);
+    const uint256_t hi = value >> DEFAULT_NON_NATIVE_FIELD_LIMB_BITS;
+    ASSERT(low + (hi << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS) == value);
 
     const uint32_t low_idx = add_variable(low);
     const uint32_t hi_idx = add_variable(hi);
 
-    range_constrain_two_limbs(low_idx, hi_idx);
+    ASSERT(num_limb_bits > DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
+    const size_t lo_bits = DEFAULT_NON_NATIVE_FIELD_LIMB_BITS;
+    const size_t hi_bits = num_limb_bits - DEFAULT_NON_NATIVE_FIELD_LIMB_BITS;
+    range_constrain_two_limbs(low_idx, hi_idx, lo_bits, hi_bits);
 
     return std::array<uint32_t, 2>{ low_idx, hi_idx };
 }
@@ -1758,9 +1806,7 @@ std::array<uint32_t, 2> UltraComposer::decompose_non_native_field_double_width_l
  * N.B. this method does NOT evaluate the prime field component of non-native field multiplications
  **/
 std::array<uint32_t, 2> UltraComposer::evaluate_non_native_field_multiplication(
-    const non_native_field_witnesses& input,
-    const bool range_constrain_quotient_and_remainder,
-    const bool range_constrain_outputs)
+    const non_native_field_witnesses& input, const bool range_constrain_quotient_and_remainder)
 {
 
     std::array<fr, 4> a{
@@ -1788,11 +1834,13 @@ std::array<uint32_t, 2> UltraComposer::evaluate_non_native_field_multiplication(
         get_variable(input.r[3]),
     };
 
-    constexpr barretenberg::fr LIMB_SHIFT = uint256_t(1) << 68;
-    constexpr barretenberg::fr LIMB_SHIFT_2 = uint256_t(1) << 136;
-    constexpr barretenberg::fr LIMB_SHIFT_3 = uint256_t(1) << 204;
-    constexpr barretenberg::fr LIMB_RSHIFT = barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << 68);
-    constexpr barretenberg::fr LIMB_RSHIFT_2 = barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << 136);
+    constexpr barretenberg::fr LIMB_SHIFT = uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS;
+    constexpr barretenberg::fr LIMB_SHIFT_2 = uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
+    constexpr barretenberg::fr LIMB_SHIFT_3 = uint256_t(1) << (3 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
+    constexpr barretenberg::fr LIMB_RSHIFT =
+        barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
+    constexpr barretenberg::fr LIMB_RSHIFT_2 =
+        barretenberg::fr(1) / barretenberg::fr(uint256_t(1) << (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS));
 
     barretenberg::fr lo_0 = a[0] * b[0] - r[0] + (a[1] * b[0] + a[0] * b[1]) * LIMB_SHIFT;
     barretenberg::fr lo_1 = (lo_0 + q[0] * input.neg_modulus[0] +
@@ -1914,20 +1962,14 @@ std::array<uint32_t, 2> UltraComposer::evaluate_non_native_field_multiplication(
         0,
     });
 
-    // Sometimes we may want to apply this step separately, after adding additional terms into lo_1 and hi_2
-    // For example, if we want to evaluate a field multiplication combined with several field additions.
-    if (range_constrain_outputs) {
-        range_constrain_two_limbs(hi_3_idx, lo_1_idx);
-    }
-
     return std::array<uint32_t, 2>{ lo_1_idx, hi_3_idx };
 }
 
 /**
  * Compute the limb-multiplication part of a non native field mul
  *
- * i.e. compute the low 204 and high 204 bit components of `a * b` where `a, b` are nnf elements composed of 4 68-bit
- *limbs
+ * i.e. compute the low 204 and high 204 bit components of `a * b` where `a, b` are nnf elements composed of 4
+ * limbs with size DEFAULT_NON_NATIVE_FIELD_LIMB_BITS
  *
  **/
 std::array<uint32_t, 2> UltraComposer::evaluate_partial_non_native_field_multiplication(
@@ -1947,7 +1989,7 @@ std::array<uint32_t, 2> UltraComposer::evaluate_partial_non_native_field_multipl
         get_variable(input.b[3]),
     };
 
-    constexpr barretenberg::fr LIMB_SHIFT = uint256_t(1) << 68;
+    constexpr barretenberg::fr LIMB_SHIFT = uint256_t(1) << DEFAULT_NON_NATIVE_FIELD_LIMB_BITS;
 
     barretenberg::fr lo_0 = a[0] * b[0] + (a[1] * b[0] + a[0] * b[1]) * LIMB_SHIFT;
 
diff --git a/cpp/src/aztec/plonk/composer/ultra_composer.hpp b/cpp/src/aztec/plonk/composer/ultra_composer.hpp
index b899e6b91c..98530b8870 100644
--- a/cpp/src/aztec/plonk/composer/ultra_composer.hpp
+++ b/cpp/src/aztec/plonk/composer/ultra_composer.hpp
@@ -17,9 +17,10 @@ class UltraComposer : public ComposerBase {
     // The plookup range proof requires work linear in range size, thus cannot be used directly for
     // large ranges such as 2^64. For such ranges the element will be decomposed into smaller
     // chuncks according to the parameter below
-    static constexpr size_t DEFAULT_PLOOKUP_RANGE_BITNUM = 9;
+    static constexpr size_t DEFAULT_PLOOKUP_RANGE_BITNUM = 14;
     static constexpr size_t DEFAULT_PLOOKUP_RANGE_STEP_SIZE = 3;
     static constexpr size_t DEFAULT_PLOOKUP_RANGE_SIZE = (1 << DEFAULT_PLOOKUP_RANGE_BITNUM) - 1;
+    static constexpr size_t DEFAULT_NON_NATIVE_FIELD_LIMB_BITS = 68;
     static constexpr uint32_t UNINITIALIZED_MEMORY_RECORD = UINT32_MAX;
 
     struct non_native_field_witnesses {
@@ -313,12 +314,14 @@ class UltraComposer : public ComposerBase {
     /**
      * Non Native Field Arithmetic
      **/
-    void range_constrain_two_limbs(const uint32_t lo_idx, const uint32_t hi_idx);
-    std::array<uint32_t, 2> decompose_non_native_field_double_width_limb(const uint32_t limb_idx);
+    void range_constrain_two_limbs(const uint32_t lo_idx,
+                                   const uint32_t hi_idx,
+                                   const size_t lo_limb_bits = DEFAULT_NON_NATIVE_FIELD_LIMB_BITS,
+                                   const size_t hi_limb_bits = DEFAULT_NON_NATIVE_FIELD_LIMB_BITS);
+    std::array<uint32_t, 2> decompose_non_native_field_double_width_limb(
+        const uint32_t limb_idx, const size_t num_limb_bits = (2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS));
     std::array<uint32_t, 2> evaluate_non_native_field_multiplication(
-        const non_native_field_witnesses& input,
-        const bool range_constrain_quotient_and_remainder = true,
-        const bool range_constrain_remainders = true);
+        const non_native_field_witnesses& input, const bool range_constrain_quotient_and_remainder = true);
     std::array<uint32_t, 2> evaluate_partial_non_native_field_multiplication(const non_native_field_witnesses& input);
     typedef std::pair<uint32_t, barretenberg::fr> scaled_witness;
     typedef std::tuple<scaled_witness, scaled_witness, barretenberg::fr> add_simple;
diff --git a/cpp/src/aztec/plonk/composer/ultra_composer.test.cpp b/cpp/src/aztec/plonk/composer/ultra_composer.test.cpp
index c7618744bf..ce04995c98 100644
--- a/cpp/src/aztec/plonk/composer/ultra_composer.test.cpp
+++ b/cpp/src/aztec/plonk/composer/ultra_composer.test.cpp
@@ -704,7 +704,8 @@ TEST(ultra_composer, non_native_field_multiplication)
     waffle::UltraComposer::non_native_field_witnesses inputs{
         a_indices, b_indices, q_indices, r_indices, modulus_limbs, fr(uint256_t(modulus)),
     };
-    composer.evaluate_non_native_field_multiplication(inputs);
+    const auto [lo_1_idx, hi_1_idx] = composer.evaluate_non_native_field_multiplication(inputs);
+    composer.range_constrain_two_limbs(lo_1_idx, hi_1_idx, 70, 70);
 
     auto prover = composer.create_prover();
     auto verifier = composer.create_verifier();
diff --git a/cpp/src/aztec/stdlib/encryption/ecdsa/ecdsa.test.cpp b/cpp/src/aztec/stdlib/encryption/ecdsa/ecdsa.test.cpp
index a84d954e77..1e1c2c8f19 100644
--- a/cpp/src/aztec/stdlib/encryption/ecdsa/ecdsa.test.cpp
+++ b/cpp/src/aztec/stdlib/encryption/ecdsa/ecdsa.test.cpp
@@ -13,7 +13,7 @@ namespace test_stdlib_ecdsa {
 using Composer = waffle::UltraComposer;
 using curve = stdlib::secp256k1<Composer>;
 
-HEAVY_TEST(stdlib_ecdsa, verify_signature)
+TEST(stdlib_ecdsa, verify_signature)
 {
     Composer composer = Composer();
 
diff --git a/cpp/src/aztec/stdlib/primitives/bigfield/bigfield.hpp b/cpp/src/aztec/stdlib/primitives/bigfield/bigfield.hpp
index 69fafc867f..1e03582c34 100644
--- a/cpp/src/aztec/stdlib/primitives/bigfield/bigfield.hpp
+++ b/cpp/src/aztec/stdlib/primitives/bigfield/bigfield.hpp
@@ -99,7 +99,10 @@ template <typename Composer, typename T> class bigfield {
     bigfield(const bigfield& other);
     bigfield(bigfield&& other);
 
-    static bigfield create_from_u512_as_witness(Composer* ctx, const uint512_t& value, const bool can_overflow = false);
+    static bigfield create_from_u512_as_witness(Composer* ctx,
+                                                const uint512_t& value,
+                                                const bool can_overflow = false,
+                                                const size_t maximum_bitlength = 0);
 
     static bigfield from_witness(Composer* ctx, const barretenberg::field<T>& input)
     {
@@ -166,6 +169,7 @@ template <typename Composer, typename T> class bigfield {
     uint512_t get_value() const;
     uint512_t get_maximum_value() const;
 
+    bigfield add_to_lower_limb(const field_t<Composer>& other, uint256_t other_maximum_value) const;
     bigfield operator+(const bigfield& other) const;
     bigfield operator-(const bigfield& other) const;
     bigfield operator*(const bigfield& other) const;
diff --git a/cpp/src/aztec/stdlib/primitives/bigfield/bigfield.test.cpp b/cpp/src/aztec/stdlib/primitives/bigfield/bigfield.test.cpp
index 9325a51e77..378ea9012f 100644
--- a/cpp/src/aztec/stdlib/primitives/bigfield/bigfield.test.cpp
+++ b/cpp/src/aztec/stdlib/primitives/bigfield/bigfield.test.cpp
@@ -19,9 +19,9 @@
 #include <polynomials/polynomial_arithmetic.hpp>
 
 #define GET_COMPOSER_NAME_STRING(composer)                                                                             \
-    (typeid(composer) == typeid(waffle::StandardComposer) ? "StandardPlonk"                                            \
-     : typeid(composer) == typeid(waffle::TurboComposer)  ? "TurboPlonk"                                               \
-                                                          : "NULLPlonk")
+    (typeid(composer) == typeid(waffle::StandardComposer)                                                              \
+         ? "StandardPlonk"                                                                                             \
+         : typeid(composer) == typeid(waffle::TurboComposer) ? "TurboPlonk" : "NULLPlonk")
 
 namespace test_stdlib_bigfield {
 using namespace barretenberg;
@@ -103,7 +103,7 @@ template <typename Composer> class stdlib_bigfield : public testing::Test {
     static void test_mul()
     {
         auto composer = Composer();
-        size_t num_repetitions = 1;
+        size_t num_repetitions = 4;
         for (size_t i = 0; i < num_repetitions; ++i) {
             fq inputs[3]{ fq::random_element(), fq::random_element(), fq::random_element() };
             fq_ct a(witness_ct(&composer, fr(uint256_t(inputs[0]).slice(0, fq_ct::NUM_LIMB_BITS * 2))),
@@ -115,7 +115,8 @@ template <typename Composer> class stdlib_bigfield : public testing::Test {
             uint64_t before = composer.get_num_gates();
             fq_ct c = a * b;
             uint64_t after = composer.get_num_gates();
-            if (i == num_repetitions - 1) {
+            // Don't profile 1st repetition. It sets up a lookup table, cost is not representative of a typical mul
+            if (i == num_repetitions - 2) {
                 std::cerr << "num gates per mul = " << after - before << std::endl;
                 benchmark_info(GET_COMPOSER_NAME_STRING(Composer), "Bigfield", "MUL", "Gate Count", after - before);
             }
diff --git a/cpp/src/aztec/stdlib/primitives/bigfield/bigfield_impl.hpp b/cpp/src/aztec/stdlib/primitives/bigfield/bigfield_impl.hpp
index d32dbdd0df..105c2a7d3e 100644
--- a/cpp/src/aztec/stdlib/primitives/bigfield/bigfield_impl.hpp
+++ b/cpp/src/aztec/stdlib/primitives/bigfield/bigfield_impl.hpp
@@ -42,6 +42,13 @@ bigfield<C, T>::bigfield(const field_t<C>& low_bits_in,
                          const bool can_overflow,
                          const size_t maximum_bitlength)
 {
+    ASSERT((can_overflow == true && maximum_bitlength == 0) ||
+           (can_overflow == false && (maximum_bitlength == 0 || maximum_bitlength > (3 * NUM_LIMB_BITS))));
+
+    // Check that the values of two parts are within specified bounds
+    ASSERT(uint256_t(low_bits_in.get_value()) < (uint256_t(1) << (NUM_LIMB_BITS * 2)));
+    ASSERT(uint256_t(high_bits_in.get_value()) < (uint256_t(1) << (NUM_LIMB_BITS * 2)));
+
     context = low_bits_in.context == nullptr ? high_bits_in.context : low_bits_in.context;
     field_t<C> limb_0(context);
     field_t<C> limb_1(context);
@@ -101,8 +108,8 @@ bigfield<C, T>::bigfield(const field_t<C>& low_bits_in,
 
         std::vector<uint32_t> high_accumulator;
         if constexpr (C::type == waffle::PLOOKUP) {
-            const auto limb_witnesses =
-                context->decompose_non_native_field_double_width_limb(high_bits_in.normalize().witness_index);
+            const auto limb_witnesses = context->decompose_non_native_field_double_width_limb(
+                high_bits_in.normalize().witness_index, (size_t)num_high_limb_bits);
             limb_2.witness_index = limb_witnesses[0];
             limb_3.witness_index = limb_witnesses[1];
             field_t<C>::evaluate_linear_identity(high_bits_in, -limb_2, -limb_3 * shift_1, field_t<C>(0));
@@ -123,7 +130,13 @@ bigfield<C, T>::bigfield(const field_t<C>& low_bits_in,
     binary_basis_limbs[0] = Limb(limb_0, DEFAULT_MAXIMUM_LIMB);
     binary_basis_limbs[1] = Limb(limb_1, DEFAULT_MAXIMUM_LIMB);
     binary_basis_limbs[2] = Limb(limb_2, DEFAULT_MAXIMUM_LIMB);
-    binary_basis_limbs[3] = Limb(limb_3, can_overflow ? DEFAULT_MAXIMUM_LIMB : DEFAULT_MAXIMUM_MOST_SIGNIFICANT_LIMB);
+    if (maximum_bitlength > 0) {
+        uint256_t max_limb_value = (uint256_t(1) << (maximum_bitlength - (3 * NUM_LIMB_BITS))) - 1;
+        binary_basis_limbs[3] = Limb(limb_3, max_limb_value);
+    } else {
+        binary_basis_limbs[3] =
+            Limb(limb_3, can_overflow ? DEFAULT_MAXIMUM_LIMB : DEFAULT_MAXIMUM_MOST_SIGNIFICANT_LIMB);
+    }
     prime_basis_limb = low_bits_in + (high_bits_in * shift_2);
 }
 
@@ -147,9 +160,27 @@ bigfield<C, T>::bigfield(bigfield&& other)
     , prime_basis_limb(other.prime_basis_limb)
 {}
 
+/**
+ * @brief Creates a bigfield element from a uint512_t.
+ * Bigfield element is constructed as a witness and not a circuit constant
+ *
+ * @param ctx
+ * @param value
+ * @param can_overflow Can the input value have more than log2(modulus) bits?
+ * @param maximum_bitlength Provide the explicit maximum bitlength if known. Otherwise bigfield max value will be either
+ * log2(modulus) bits iff can_overflow = false, or (4 * NUM_LIMB_BITS) iff can_overflow = true
+ * @return bigfield<C, T>
+ *
+ * @details This method is 1 gate more efficient than constructing from 2 field_ct elements.
+ */
 template <typename C, typename T>
-bigfield<C, T> bigfield<C, T>::create_from_u512_as_witness(C* ctx, const uint512_t& value, const bool can_overflow)
+bigfield<C, T> bigfield<C, T>::create_from_u512_as_witness(C* ctx,
+                                                           const uint512_t& value,
+                                                           const bool can_overflow,
+                                                           const size_t maximum_bitlength)
 {
+    ASSERT((can_overflow == true && maximum_bitlength == 0) ||
+           (can_overflow == false && (maximum_bitlength == 0 || maximum_bitlength > (3 * NUM_LIMB_BITS))));
     std::array<uint256_t, 4> limbs;
     limbs[0] = value.slice(0, NUM_LIMB_BITS).lo;
     limbs[1] = value.slice(NUM_LIMB_BITS, NUM_LIMB_BITS * 2).lo;
@@ -179,8 +210,8 @@ bigfield<C, T> bigfield<C, T>::create_from_u512_as_witness(C* ctx, const uint512
                                    -1,
                                    0 },
                                  true);
-        ctx->range_constrain_two_limbs(limb_0.witness_index, limb_1.witness_index);
-        ctx->range_constrain_two_limbs(limb_2.witness_index, limb_3.witness_index);
+
+        uint64_t num_last_limb_bits = (can_overflow) ? NUM_LIMB_BITS : NUM_LAST_LIMB_BITS;
 
         bigfield result(ctx);
         result.binary_basis_limbs[0] = Limb(limb_0, DEFAULT_MAXIMUM_LIMB);
@@ -188,12 +219,26 @@ bigfield<C, T> bigfield<C, T>::create_from_u512_as_witness(C* ctx, const uint512
         result.binary_basis_limbs[2] = Limb(limb_2, DEFAULT_MAXIMUM_LIMB);
         result.binary_basis_limbs[3] =
             Limb(limb_3, can_overflow ? DEFAULT_MAXIMUM_LIMB : DEFAULT_MAXIMUM_MOST_SIGNIFICANT_LIMB);
+
+        // if maximum_bitlength is set, this supersedes can_overflow
+        if (maximum_bitlength > 0) {
+            ASSERT(maximum_bitlength > 3 * NUM_LIMB_BITS);
+            num_last_limb_bits = maximum_bitlength - (3 * NUM_LIMB_BITS);
+            uint256_t max_limb_value = (uint256_t(1) << num_last_limb_bits) - 1;
+            result.binary_basis_limbs[3].maximum_value = max_limb_value;
+        }
         result.prime_basis_limb = prime_limb;
+        ctx->range_constrain_two_limbs(
+            limb_0.witness_index, limb_1.witness_index, (size_t)NUM_LIMB_BITS, (size_t)NUM_LIMB_BITS);
+        ctx->range_constrain_two_limbs(
+            limb_2.witness_index, limb_3.witness_index, (size_t)NUM_LIMB_BITS, (size_t)num_last_limb_bits);
+
         return result;
     } else {
         return bigfield(witness_t(ctx, fr(limbs[0] + limbs[1] * shift_1)),
                         witness_t(ctx, fr(limbs[2] + limbs[3] * shift_1)),
-                        can_overflow);
+                        can_overflow,
+                        maximum_bitlength);
     }
 }
 
@@ -283,6 +328,40 @@ template <typename C, typename T> uint512_t bigfield<C, T>::get_maximum_value()
     return t0 + t1 + t2 + t3;
 }
 
+/**
+ * @brief Add a field element to the lower limb. CAUTION (the element has to be constrained before using this function)
+ *
+ * @details Sometimes we need to add a small constrained value to a bigfield element (for example, a boolean value), but
+ * we don't want to construct a full bigfield element for that as it would take too many gates. If the maximum value of
+ * the field element being added is small enough, we can simply add it to the lowest limb and increase its maximum
+ * value. That will create 2 additional constraints instead of 5/3 needed to add 2 bigfield elements and several needed
+ * to construct a bigfield element.
+ *
+ * @tparam C Composer
+ * @tparam T Field Parameters
+ * @param other Field element that will be added to the lowest limb
+ * @param other_maximum_value The maximum value of other
+ * @return bigfield<C, T> Result
+ */
+template <typename C, typename T>
+bigfield<C, T> bigfield<C, T>::add_to_lower_limb(const field_t<C>& other, uint256_t other_maximum_value) const
+{
+    reduction_check();
+    ASSERT((other_maximum_value + binary_basis_limbs[0].maximum_value) <= get_maximum_unreduced_limb_value());
+    // needed cause a constant doesn't have a valid context
+    C* ctx = context ? context : other.context;
+
+    if (is_constant() && other.is_constant()) {
+        return bigfield(ctx, uint256_t((get_value() + uint256_t(other.get_value())) % modulus_u512));
+    }
+    bigfield result = *this;
+    result.binary_basis_limbs[0].maximum_value = binary_basis_limbs[0].maximum_value + other_maximum_value;
+
+    result.binary_basis_limbs[0].element = binary_basis_limbs[0].element + other;
+    result.prime_basis_limb = prime_basis_limb + other;
+    return result;
+}
+
 template <typename C, typename T> bigfield<C, T> bigfield<C, T>::operator+(const bigfield& other) const
 {
     reduction_check();
@@ -621,13 +700,8 @@ template <typename C, typename T> bigfield<C, T> bigfield<C, T>::operator*(const
             }
             return (*this).operator*(other);
         }
-        quotient = bigfield(witness_t(ctx, fr(quotient_value.slice(0, NUM_LIMB_BITS * 2).lo)),
-                            witness_t(ctx, fr(quotient_value.slice(NUM_LIMB_BITS * 2, NUM_LIMB_BITS * 4).lo)),
-                            false,
-                            num_quotient_bits);
-        remainder = bigfield(
-            witness_t(ctx, fr(remainder_value.slice(0, NUM_LIMB_BITS * 2).lo)),
-            witness_t(ctx, fr(remainder_value.slice(NUM_LIMB_BITS * 2, NUM_LIMB_BITS * 3 + NUM_LAST_LIMB_BITS).lo)));
+        quotient = create_from_u512_as_witness(ctx, quotient_value, false, num_quotient_bits);
+        remainder = create_from_u512_as_witness(ctx, remainder_value);
     };
 
     // Call `evaluate_multiply_add` to validate the correctness of our computed quotient and remainder
@@ -739,13 +813,9 @@ bigfield<C, T> bigfield<C, T>::internal_div(const std::vector<bigfield>& numerat
         if (check_for_zero) {
             denominator.assert_is_not_equal(zero());
         }
-        quotient = bigfield(witness_t(ctx, fr(quotient_value.slice(0, NUM_LIMB_BITS * 2).lo)),
-                            witness_t(ctx, fr(quotient_value.slice(NUM_LIMB_BITS * 2, NUM_LIMB_BITS * 4).lo)),
-                            false,
-                            num_quotient_bits);
-        inverse = bigfield(
-            witness_t(ctx, fr(inverse_value.slice(0, NUM_LIMB_BITS * 2).lo)),
-            witness_t(ctx, fr(inverse_value.slice(NUM_LIMB_BITS * 2, NUM_LIMB_BITS * 3 + NUM_LAST_LIMB_BITS).lo)));
+
+        quotient = create_from_u512_as_witness(ctx, quotient_value, false, num_quotient_bits);
+        inverse = create_from_u512_as_witness(ctx, inverse_value);
     }
 
     unsafe_evaluate_multiply_add(denominator, inverse, { unreduced_zero() }, quotient, numerators);
@@ -804,13 +874,8 @@ template <typename C, typename T> bigfield<C, T> bigfield<C, T>::sqr() const
             return sqr();
         }
 
-        quotient = bigfield(witness_t(ctx, fr(quotient_value.slice(0, NUM_LIMB_BITS * 2).lo)),
-                            witness_t(ctx, fr(quotient_value.slice(NUM_LIMB_BITS * 2, NUM_LIMB_BITS * 4).lo)),
-                            false,
-                            num_quotient_bits);
-        remainder = bigfield(
-            witness_t(ctx, fr(remainder_value.slice(0, NUM_LIMB_BITS * 2).lo)),
-            witness_t(ctx, fr(remainder_value.slice(NUM_LIMB_BITS * 2, NUM_LIMB_BITS * 3 + NUM_LAST_LIMB_BITS).lo)));
+        quotient = create_from_u512_as_witness(ctx, quotient_value, false, num_quotient_bits);
+        remainder = create_from_u512_as_witness(ctx, remainder_value);
     };
 
     unsafe_evaluate_square_add(*this, {}, quotient, remainder);
@@ -877,13 +942,8 @@ template <typename C, typename T> bigfield<C, T> bigfield<C, T>::sqradd(const st
         uint512_t quotient_value = quotient_1024.lo;
         uint256_t remainder_value = remainder_1024.lo.lo;
 
-        quotient = bigfield(witness_t(ctx, fr(quotient_value.slice(0, NUM_LIMB_BITS * 2).lo)),
-                            witness_t(ctx, fr(quotient_value.slice(NUM_LIMB_BITS * 2, NUM_LIMB_BITS * 4).lo)),
-                            false,
-                            num_quotient_bits);
-        remainder = bigfield(
-            witness_t(ctx, fr(remainder_value.slice(0, NUM_LIMB_BITS * 2))),
-            witness_t(ctx, fr(remainder_value.slice(NUM_LIMB_BITS * 2, NUM_LIMB_BITS * 3 + NUM_LAST_LIMB_BITS))));
+        quotient = create_from_u512_as_witness(ctx, quotient_value, false, num_quotient_bits);
+        remainder = create_from_u512_as_witness(ctx, remainder_value);
     };
     unsafe_evaluate_square_add(*this, to_add, quotient, remainder);
     return remainder;
@@ -940,13 +1000,8 @@ bigfield<C, T> bigfield<C, T>::madd(const bigfield& to_mul, const std::vector<bi
             }
             return (*this).madd(to_mul, to_add);
         }
-        quotient = bigfield(witness_t(ctx, fr(quotient_value.slice(0, NUM_LIMB_BITS * 2).lo)),
-                            witness_t(ctx, fr(quotient_value.slice(NUM_LIMB_BITS * 2, NUM_LIMB_BITS * 4).lo)),
-                            false,
-                            num_quotient_bits);
-        remainder = bigfield(
-            witness_t(ctx, fr(remainder_value.slice(0, NUM_LIMB_BITS * 2).lo)),
-            witness_t(ctx, fr(remainder_value.slice(NUM_LIMB_BITS * 2, NUM_LIMB_BITS * 3 + NUM_LAST_LIMB_BITS).lo)));
+        quotient = create_from_u512_as_witness(ctx, quotient_value, false, num_quotient_bits);
+        remainder = create_from_u512_as_witness(ctx, remainder_value);
     };
     unsafe_evaluate_multiply_add(*this, to_mul, to_add, quotient, { remainder });
     return remainder;
@@ -1246,19 +1301,15 @@ bigfield<C, T> bigfield<C, T>::mult_madd(const std::vector<bigfield>& mul_left,
     bigfield remainder;
     bigfield quotient;
     // Constrain quotient to mitigate CRT overflow attacks
-    quotient = bigfield(witness_t(ctx, fr(quotient_value.slice(0, NUM_LIMB_BITS * 2).lo)),
-                        witness_t(ctx, fr(quotient_value.slice(NUM_LIMB_BITS * 2, NUM_LIMB_BITS * 4).lo)),
-                        false,
-                        num_quotient_bits);
+    quotient = create_from_u512_as_witness(ctx, quotient_value, false, num_quotient_bits);
+
     if (fix_remainder_to_zero) {
         remainder = zero();
         // remainder needs to be defined as wire value and not selector values to satisfy
         // UltraPlonk's bigfield custom gates
         remainder.convert_constant_to_witness(ctx);
     } else {
-        remainder = bigfield(
-            witness_t(ctx, fr(remainder_value.slice(0, NUM_LIMB_BITS * 2).lo)),
-            witness_t(ctx, fr(remainder_value.slice(NUM_LIMB_BITS * 2, NUM_LIMB_BITS * 3 + NUM_LAST_LIMB_BITS).lo)));
+        remainder = create_from_u512_as_witness(ctx, remainder_value);
     }
 
     unsafe_evaluate_multiple_multiply_add(new_input_left, new_input_right, new_to_add, quotient, { remainder });
@@ -1929,7 +1980,7 @@ void bigfield<C, T>::unsafe_evaluate_multiply_add(const bigfield& input_left,
             modulus,
         };
         // N.B. this method also evaluates the prime field component of the non-native field mul
-        const auto [lo_idx, hi_idx] = ctx->evaluate_non_native_field_multiplication(witnesses, false, false);
+        const auto [lo_idx, hi_idx] = ctx->evaluate_non_native_field_multiplication(witnesses, false);
 
         barretenberg::fr neg_prime = -barretenberg::fr(uint256_t(target_basis.modulus));
         field_t<C>::evaluate_polynomial_identity(left.prime_basis_limb,
@@ -1945,7 +1996,8 @@ void bigfield<C, T>::unsafe_evaluate_multiply_add(const bigfield& input_left,
         // if both the hi and lo output limbs have less than 70 bits, we can use our custom
         // limb accumulation gate (accumulates 2 field elements, each composed of 5 14-bit limbs, in 3 gates)
         if (carry_lo_msb <= 70 && carry_hi_msb <= 70) {
-            ctx->range_constrain_two_limbs(hi.witness_index, lo.witness_index);
+            ctx->range_constrain_two_limbs(
+                hi.witness_index, lo.witness_index, size_t(carry_lo_msb), size_t(carry_hi_msb));
         } else {
             ctx->decompose_into_default_range(hi.normalize().witness_index, carry_hi_msb);
             ctx->decompose_into_default_range(lo.normalize().witness_index, carry_lo_msb);
@@ -2365,7 +2417,7 @@ void bigfield<C, T>::unsafe_evaluate_multiple_multiply_add(const std::vector<big
             modulus,
         };
 
-        const auto [lo_1_idx, hi_1_idx] = ctx->evaluate_non_native_field_multiplication(witnesses, false, false);
+        const auto [lo_1_idx, hi_1_idx] = ctx->evaluate_non_native_field_multiplication(witnesses, false);
 
         barretenberg::fr neg_prime = -barretenberg::fr(uint256_t(target_basis.modulus));
 
@@ -2383,7 +2435,8 @@ void bigfield<C, T>::unsafe_evaluate_multiple_multiply_add(const std::vector<big
         // if both the hi and lo output limbs have less than 70 bits, we can use our custom
         // limb accumulation gate (accumulates 2 field elements, each composed of 5 14-bit limbs, in 3 gates)
         if (carry_lo_msb <= 70 && carry_hi_msb <= 70) {
-            ctx->range_constrain_two_limbs(hi.witness_index, lo.witness_index);
+            ctx->range_constrain_two_limbs(
+                hi.witness_index, lo.witness_index, (size_t)carry_lo_msb, (size_t)carry_hi_msb);
         } else {
             ctx->decompose_into_default_range(hi.normalize().witness_index, carry_hi_msb);
             ctx->decompose_into_default_range(lo.normalize().witness_index, carry_lo_msb);
diff --git a/cpp/src/aztec/stdlib/primitives/biggroup/biggroup_nafs.hpp b/cpp/src/aztec/stdlib/primitives/biggroup/biggroup_nafs.hpp
index c6c38cf045..b92f6750c5 100644
--- a/cpp/src/aztec/stdlib/primitives/biggroup/biggroup_nafs.hpp
+++ b/cpp/src/aztec/stdlib/primitives/biggroup/biggroup_nafs.hpp
@@ -125,143 +125,190 @@ typename element<C, Fq, Fr, G>::secp256k1_wnaf_pair element<C, Fq, Fr, G>::compu
 
     constexpr size_t num_bits = 129;
 
-    const auto compute_single_wnaf =
-        [ctx](const secp256k1::fr& k, const auto stagger, const bool is_negative, const bool is_lo = false) {
-            constexpr size_t num_rounds = ((num_bits + wnaf_size - 1) / wnaf_size);
-            const uint64_t stagger_mask = (1ULL << stagger) - 1;
-            const uint64_t stagger_scalar = k.data[0] & stagger_mask;
-
-            uint64_t wnaf_values[num_rounds] = { 0 };
-            bool skew_without_stagger;
-            uint256_t k_u256{ k.data[0], k.data[1], k.data[2], k.data[3] };
-            k_u256 = k_u256 >> stagger;
-            if (is_lo) {
-                barretenberg::wnaf::fixed_wnaf<num_bits - lo_stagger, 1, wnaf_size>(
-                    &k_u256.data[0], &wnaf_values[0], skew_without_stagger, 0);
-            } else {
-                barretenberg::wnaf::fixed_wnaf<num_bits - hi_stagger, 1, wnaf_size>(
-                    &k_u256.data[0], &wnaf_values[0], skew_without_stagger, 0);
-            }
-            const size_t num_rounds_adjusted = ((num_bits + wnaf_size - 1 - stagger) / wnaf_size);
-
-            const auto compute_staggered_wnaf_fragment =
-                [](const uint64_t fragment_u64, const uint64_t stagger, bool is_negative, bool wnaf_skew) {
-                    if (stagger == 0) {
-                        return std::make_pair<uint64_t, bool>((uint64_t)0, (bool)wnaf_skew);
-                    }
-                    int fragment = static_cast<int>(fragment_u64);
-
-                    if (is_negative) {
-                        fragment = -fragment;
-                    }
-                    if (!is_negative && wnaf_skew) {
-                        fragment -= (1 << stagger);
-                    } else if (is_negative && wnaf_skew) {
-                        fragment += (1 << stagger);
-                    }
-                    bool output_skew = (fragment_u64 % 2) == 0;
-                    if (!is_negative && output_skew) {
-                        fragment += 1;
-                    } else if (is_negative && output_skew) {
-                        fragment -= 1;
-                    }
-
-                    uint64_t output_fragment;
-                    if (fragment < 0) {
-                        output_fragment = static_cast<uint64_t>((int)((1ULL << (wnaf_size - 1))) + (fragment / 2 - 1));
-                    } else {
-                        output_fragment = static_cast<uint64_t>((1ULL << (wnaf_size - 1)) - 1ULL +
-                                                                (uint64_t)((uint64_t)fragment / 2 + 1));
-                    }
-
-                    return std::make_pair<uint64_t, bool>((uint64_t)output_fragment, (bool)output_skew);
-                };
-
-            const auto [first_fragment, skew] =
-                compute_staggered_wnaf_fragment(stagger_scalar, stagger, is_negative, skew_without_stagger);
-
-            constexpr uint64_t wnaf_window_size = (1ULL << (wnaf_size - 1));
-            const auto get_wnaf_wires = [ctx](uint64_t* wnaf_values, bool is_negative, size_t rounds) {
-                std::vector<field_t<C>> wnaf_entries;
-                for (size_t i = 0; i < rounds; ++i) {
-                    bool predicate = bool((wnaf_values[i] >> 31U) & 1U);
-                    uint64_t offset_entry;
-                    if ((!predicate && !is_negative) || (predicate && is_negative)) {
-                        offset_entry = wnaf_window_size + (wnaf_values[i] & 0xffffff);
-                    } else {
-                        offset_entry = wnaf_window_size - 1 - (wnaf_values[i] & 0xffffff);
-                    }
-                    field_t<C> entry(witness_t<C>(ctx, offset_entry));
-
-                    // TODO: Do these need to be range constrained? we use these witnesses
-                    // to index a size-16 ROM lookup table, which performs an implicit range constraint
-                    entry.create_range_constraint(wnaf_size);
-                    wnaf_entries.emplace_back(entry);
+    /**
+     * @brief Compute WNAF of a single 129-bit scalar
+     *
+     * @param k Scalar
+     * @param stagger The number of bits that are used in "staggering"
+     * @param is_negative If it should be subtracted
+     * @param is_lo True if it's the low scalar
+     */
+    const auto compute_single_wnaf = [ctx](const secp256k1::fr& k,
+                                           const auto stagger,
+                                           const bool is_negative,
+                                           const bool is_lo = false) {
+        // The number of rounds is the minimal required to cover the whole scalar with wnaf_size windows
+        constexpr size_t num_rounds = ((num_bits + wnaf_size - 1) / wnaf_size);
+        // Stagger mask is needed to retrieve the lowest bits that will not be used in montgomery ladder directly
+        const uint64_t stagger_mask = (1ULL << stagger) - 1;
+        // Stagger scalar represents the lower "staggered" bits that are not used in the ladder
+        const uint64_t stagger_scalar = k.data[0] & stagger_mask;
+
+        uint64_t wnaf_values[num_rounds] = { 0 };
+        bool skew_without_stagger;
+        uint256_t k_u256{ k.data[0], k.data[1], k.data[2], k.data[3] };
+        k_u256 = k_u256 >> stagger;
+        if (is_lo) {
+            barretenberg::wnaf::fixed_wnaf<num_bits - lo_stagger, 1, wnaf_size>(
+                &k_u256.data[0], &wnaf_values[0], skew_without_stagger, 0);
+        } else {
+            barretenberg::wnaf::fixed_wnaf<num_bits - hi_stagger, 1, wnaf_size>(
+                &k_u256.data[0], &wnaf_values[0], skew_without_stagger, 0);
+        }
+
+        // Number of rounds that are needed to reconstruct the scalar without staggered bits
+        const size_t num_rounds_excluding_stagger_bits = ((num_bits + wnaf_size - 1 - stagger) / wnaf_size);
+
+        /**
+         * @brief Compute the stagger-related part of WNAF and the final skew
+         *
+         * @param fragment_u64 Stagger-masked lower bits of the scalar
+         * @param stagger The number of staggering bits
+         * @param is_negative If the initial scalar is supposed to be subtracted
+         * @param wnaf_skew The skew of the stagger-right-shifted part of the scalar
+         *
+         */
+        const auto compute_staggered_wnaf_fragment =
+            [](const uint64_t fragment_u64, const uint64_t stagger, bool is_negative, bool wnaf_skew) {
+                // If there is no stagger then there is no need to change anything
+                if (stagger == 0) {
+                    return std::make_pair<uint64_t, bool>((uint64_t)0, (bool)wnaf_skew);
+                }
+                int fragment = static_cast<int>(fragment_u64);
+                // Negate the fragment if it's negative
+                if (is_negative) {
+                    fragment = -fragment;
+                }
+                // If the value is positive and there is a skew in wnaf, subtract 2ˢᵗᵃᵍᵍᵉʳ. If negative and there is
+                // skew, then add
+                if (!is_negative && wnaf_skew) {
+                    fragment -= (1 << stagger);
+                } else if (is_negative && wnaf_skew) {
+                    fragment += (1 << stagger);
+                }
+                // If the lowest bit is zero, then set final skew to 1 and add 1 to the absolute value of the fragment
+                bool output_skew = (fragment_u64 % 2) == 0;
+                if (!is_negative && output_skew) {
+                    fragment += 1;
+                } else if (is_negative && output_skew) {
+                    fragment -= 1;
                 }
-                return wnaf_entries;
-            };
 
-            std::vector<field_t<C>> wnaf = get_wnaf_wires(&wnaf_values[0], is_negative, num_rounds_adjusted);
-            field_t<C> negative_skew = witness_t<C>(ctx, is_negative ? 0 : skew);
-            field_t<C> positive_skew = witness_t<C>(ctx, is_negative ? skew : 0);
-            negative_skew.create_range_constraint(1);
-            positive_skew.create_range_constraint(1);
-            (negative_skew + positive_skew).create_range_constraint(1);
-
-            const auto reconstruct_bigfield_from_wnaf = [ctx](const std::vector<field_t<C>>& wnaf,
-                                                              const field_t<C>& positive_skew,
-                                                              const field_t<C>& stagger_fragment,
-                                                              const size_t stagger,
-                                                              const size_t rounds) {
-                std::vector<field_t<C>> accumulator;
-                for (size_t i = 0; i < rounds; ++i) {
-                    field_t<C> entry = wnaf[rounds - 1 - i];
-                    entry *= 2;
-                    entry *= static_cast<field_t<C>>(uint256_t(1) << (i * wnaf_size));
-                    accumulator.emplace_back(entry);
+                uint64_t output_fragment;
+                if (fragment < 0) {
+                    output_fragment = static_cast<uint64_t>((int)((1ULL << (wnaf_size - 1))) + (fragment / 2 - 1));
+                } else {
+                    output_fragment = static_cast<uint64_t>((1ULL << (wnaf_size - 1)) - 1ULL +
+                                                            (uint64_t)((uint64_t)fragment / 2 + 1));
                 }
-                field_t<C> sum = field_t<C>::accumulate(accumulator);
-                sum = sum * field_t<C>(barretenberg::fr(1ULL << stagger));
-                sum += (stagger_fragment * 2);
-                sum += positive_skew;
-                sum = sum.normalize();
-                // TODO: improve efficiency by creating a constructor that does NOT require us to range constrain
-                //       limbs (we already know (sum < 2^{130}))
-                Fr reconstructed = Fr(sum, field_t<C>::from_witness_index(ctx, ctx->zero_idx), false);
-                return reconstructed;
-            };
 
-            field_t<C> stagger_fragment = witness_t<C>(ctx, first_fragment);
-            Fr wnaf_sum =
-                reconstruct_bigfield_from_wnaf(wnaf, positive_skew, stagger_fragment, stagger, num_rounds_adjusted);
+                return std::make_pair<uint64_t, bool>((uint64_t)output_fragment, (bool)output_skew);
+            };
 
-            uint256_t negative_constant_wnaf_offset(0);
+        // Compute the lowest fragment and final skew
+        const auto [first_fragment, skew] =
+            compute_staggered_wnaf_fragment(stagger_scalar, stagger, is_negative, skew_without_stagger);
+
+        constexpr uint64_t wnaf_window_size = (1ULL << (wnaf_size - 1));
+        /**
+         * @brief Compute wnaf values, convert them into witness field elements and range constrain them
+         *
+         */
+        const auto get_wnaf_wires = [ctx](uint64_t* wnaf_values, bool is_negative, size_t rounds) {
+            std::vector<field_t<C>> wnaf_entries;
+            for (size_t i = 0; i < rounds; ++i) {
+                // Predicate == sign of current wnaf value
+                bool predicate = bool((wnaf_values[i] >> 31U) & 1U);
+                uint64_t offset_entry;
+                // If the signs of current entry and the whole scalar are the same, then add the lowest bits of current
+                // wnaf value to the window size to form an entry. Otherwise, subtract the lowest bits along with 1
+                if ((!predicate && !is_negative) || (predicate && is_negative)) {
+                    // TODO: Why is this mask fixed?
+                    offset_entry = wnaf_window_size + (wnaf_values[i] & 0xffffff);
+                } else {
+                    offset_entry = wnaf_window_size - 1 - (wnaf_values[i] & 0xffffff);
+                }
+                field_t<C> entry(witness_t<C>(ctx, offset_entry));
 
-            for (size_t i = 0; i < num_rounds_adjusted; ++i) {
-                negative_constant_wnaf_offset +=
-                    uint256_t(wnaf_window_size * 2 - 1) * (uint256_t(1) << (i * wnaf_size));
+                // TODO: Do these need to be range constrained? we use these witnesses
+                // to index a size-16 ROM lookup table, which performs an implicit range constraint
+                entry.create_range_constraint(wnaf_size);
+                wnaf_entries.emplace_back(entry);
             }
-            negative_constant_wnaf_offset = negative_constant_wnaf_offset << stagger;
-            if (stagger > 0) {
-                negative_constant_wnaf_offset += ((1ULL << wnaf_size) - 1ULL); // FROM STAGGER FRAMGENT
+            return wnaf_entries;
+        };
+
+        // Get wnaf witnesses
+        std::vector<field_t<C>> wnaf = get_wnaf_wires(&wnaf_values[0], is_negative, num_rounds_excluding_stagger_bits);
+        // Compute and constrain skews
+        field_t<C> negative_skew = witness_t<C>(ctx, is_negative ? 0 : skew);
+        field_t<C> positive_skew = witness_t<C>(ctx, is_negative ? skew : 0);
+        negative_skew.create_range_constraint(1);
+        positive_skew.create_range_constraint(1);
+        (negative_skew + positive_skew).create_range_constraint(1);
+
+        const auto reconstruct_bigfield_from_wnaf = [ctx](const std::vector<field_t<C>>& wnaf,
+                                                          const field_t<C>& positive_skew,
+                                                          const field_t<C>& stagger_fragment,
+                                                          const size_t stagger,
+                                                          const size_t rounds) {
+            std::vector<field_t<C>> accumulator;
+            // Collect positive wnaf entries for accumulation
+            for (size_t i = 0; i < rounds; ++i) {
+                field_t<C> entry = wnaf[rounds - 1 - i];
+                entry *= static_cast<field_t<C>>(uint256_t(1) << (i * wnaf_size));
+                accumulator.emplace_back(entry);
             }
-            field_t<C> skew_offset =
-                (negative_skew + field_t<C>(barretenberg::fr(negative_constant_wnaf_offset))).normalize();
+            // Accumulate entries, shift by stagger and add the stagger fragment
+            field_t<C> sum = field_t<C>::accumulate(accumulator);
+            sum = sum * field_t<C>(barretenberg::fr(1ULL << stagger));
+            sum += (stagger_fragment);
+            sum = sum.normalize();
+            // TODO: improve efficiency by creating a constructor that does NOT require us to range constrain
+            //       limbs (we already know (sum < 2^{130}))
+            // Convert this value to bigfield element
+            Fr reconstructed = Fr(sum, field_t<C>::from_witness_index(ctx, ctx->zero_idx), false);
+            // Double the final value and add the skew
+            reconstructed = (reconstructed + reconstructed).add_to_lower_limb(positive_skew, uint256_t(1));
+            return reconstructed;
+        };
 
-            // TODO: improve efficiency by removing range constraint on lo_offset and hi_offset (we already know are
-            // boolean)
-            Fr offset = Fr(skew_offset, field_t<C>(ctx, barretenberg::fr(0)), false);
+        // Initialize stagger witness
+        field_t<C> stagger_fragment = witness_t<C>(ctx, first_fragment);
 
-            Fr reconstructed = wnaf_sum - offset;
+        // Reconstruct bigfield x_pos
+        Fr wnaf_sum = reconstruct_bigfield_from_wnaf(
+            wnaf, positive_skew, stagger_fragment, stagger, num_rounds_excluding_stagger_bits);
 
-            secp256k1_wnaf wnaf_out{ .wnaf = wnaf,
-                                     .positive_skew = positive_skew,
-                                     .negative_skew = negative_skew,
-                                     .least_significant_wnaf_fragment = stagger_fragment,
-                                     .has_wnaf_fragment = (stagger > 0) };
+        // Start reconstructing x_neg
+        uint256_t negative_constant_wnaf_offset(0);
 
-            return std::make_pair<Fr, secp256k1_wnaf>((Fr)reconstructed, (secp256k1_wnaf)wnaf_out);
-        };
+        // Construct 0xF..F
+        for (size_t i = 0; i < num_rounds_excluding_stagger_bits; ++i) {
+            negative_constant_wnaf_offset += uint256_t(wnaf_window_size * 2 - 1) * (uint256_t(1) << (i * wnaf_size));
+        }
+        // Shift by stagger
+        negative_constant_wnaf_offset = negative_constant_wnaf_offset << stagger;
+        // Add the constant offset contributed by the stagger fragment
+        if (stagger > 0) {
+            negative_constant_wnaf_offset += ((1ULL << wnaf_size) - 1ULL); // FROM STAGGER FRAMGENT
+        }
+
+        // TODO: improve efficiency by removing range constraint on lo_offset and hi_offset (we already know are
+        // boolean)
+        // Add the skew to the bigfield constant
+        Fr offset = Fr(nullptr, negative_constant_wnaf_offset).add_to_lower_limb(negative_skew, uint256_t(1));
+        // x_pos - x_neg
+        Fr reconstructed = wnaf_sum - offset;
+
+        secp256k1_wnaf wnaf_out{ .wnaf = wnaf,
+                                 .positive_skew = positive_skew,
+                                 .negative_skew = negative_skew,
+                                 .least_significant_wnaf_fragment = stagger_fragment,
+                                 .has_wnaf_fragment = (stagger > 0) };
+
+        return std::make_pair<Fr, secp256k1_wnaf>((Fr)reconstructed, (secp256k1_wnaf)wnaf_out);
+    };
 
     secp256k1::fr k(scalar.get_value().lo);
     secp256k1::fr klo(0);
@@ -364,7 +411,7 @@ std::vector<field_t<C>> element<C, Fq, Fr, G>::compute_wnaf(const Fr& scalar)
         // If Fr is a non-native field element, we can't just accumulate the wnaf entries into a single value,
         // as we could overflow the circuit modulus
         //
-        // We add the first 34 wnaf entries into a 'low' 138-bit accumulator (138 = 2 68 bit limbs)
+        // We add the first 34 wnaf entries into a 'low' 136-bit accumulator (136 = 2 68 bit limbs)
         // We add the remaining wnaf entries into a 'high' accumulator
         // We can then directly construct a Fr element from the accumulators.
         // However we cannot underflow our accumulators, and our wnafs represent negative and positive values
@@ -372,8 +419,10 @@ std::vector<field_t<C>> element<C, Fq, Fr, G>::compute_wnaf(const Fr& scalar)
         // [-15, -13, -11, ..., 13, 15]
         //
         // To map from the raw value to the actual value, we must compute `value * 2 - 15`
-        // However, we do not subtract off the -15 term when constructing our low and high accumulators (instead just
-        // multiplying by 2) This ensures the low accumulator will not underflow
+        // However, we do not subtract off the -15 term when constructing our low and high accumulators. Instead of
+        // multiplying by two when accumulating we simply add the accumulated value to itself. This way it automatically
+        // updates multiplicative constants without computing new witnesses. This ensures the low accumulator will not
+        // underflow
         //
         // Once we hvae reconstructed an Fr element out of our accumulators,
         // we ALSO construct an Fr element from the constant offset terms we left out
@@ -383,7 +432,6 @@ std::vector<field_t<C>> element<C, Fq, Fr, G>::compute_wnaf(const Fr& scalar)
             std::vector<field_t<C>> half_accumulators;
             for (size_t i = 0; i < half_round_length; ++i) {
                 field_t<C> entry = wnafs[half_round_length - 1 - i];
-                entry *= 2;
                 entry *= static_cast<field_t<C>>(uint256_t(1) << (i * 4));
                 half_accumulators.emplace_back(entry);
             }
@@ -392,7 +440,6 @@ std::vector<field_t<C>> element<C, Fq, Fr, G>::compute_wnaf(const Fr& scalar)
         const size_t midpoint = num_rounds - (Fr::NUM_LIMB_BITS * 2) / WNAF_SIZE;
         auto hi_accumulators = reconstruct_half_wnaf(&wnaf_entries[0], midpoint);
         auto lo_accumulators = reconstruct_half_wnaf(&wnaf_entries[midpoint], num_rounds - midpoint);
-
         uint256_t negative_lo(0);
         uint256_t negative_hi(0);
         for (size_t i = 0; i < midpoint; ++i) {
@@ -401,12 +448,16 @@ std::vector<field_t<C>> element<C, Fq, Fr, G>::compute_wnaf(const Fr& scalar)
         for (size_t i = 0; i < (num_rounds - midpoint); ++i) {
             negative_lo += uint256_t(15) * (uint256_t(1) << (i * 4));
         }
-
+        ASSERT((num_rounds - midpoint) * 4 == 136);
+        // If skew == 1 lo_offset = 0, else = 0xf...f
         field_t<C> lo_offset =
-            (wnaf_entries[wnaf_entries.size() - 1] + field_t<C>(barretenberg::fr(negative_lo))).normalize();
-        Fr offset = Fr(lo_offset, field_t<C>(barretenberg::fr(negative_hi)), true);
+            (-field_t<C>(barretenberg::fr(negative_lo)))
+                .madd(wnaf_entries[wnaf_entries.size() - 1], field_t<C>(barretenberg::fr(negative_lo)))
+                .normalize();
+        Fr offset =
+            Fr(lo_offset, field_t<C>(barretenberg::fr(negative_hi)) + wnaf_entries[wnaf_entries.size() - 1], true);
         Fr reconstructed = Fr(lo_accumulators, hi_accumulators, true);
-        reconstructed = reconstructed - offset;
+        reconstructed = (reconstructed + reconstructed) - offset;
         reconstructed.assert_is_in_field();
         reconstructed.assert_equal(scalar);
     }