Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "barretenberg/common/op_count.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/ecc/curves/bn254/bn254.hpp"
#include "barretenberg/numeric/random/engine.hpp"
#include "barretenberg/polynomials/polynomial.hpp"
#include "barretenberg/srs/global_crs.hpp"
#include <benchmark/benchmark.h>
Expand Down Expand Up @@ -466,6 +467,19 @@ void pippenger(State& state)
benchmark::DoNotOptimize(ck->commit(pol));
}
}

void bn254fr_random(State& state)
{
numeric::RNG& engine = numeric::get_randomness();
for (auto _ : state) {
state.PauseTiming();
size_t num_cycles = 1UL << static_cast<size_t>(state.range(0));
state.ResumeTiming();
for (size_t i = 0; i < num_cycles; i++) {
benchmark::DoNotOptimize(fr::random_element(&engine));
}
}
}
} // namespace

BENCHMARK(parallel_for_field_element_addition)->Unit(kMicrosecond)->DenseRange(0, MAX_REPETITION_LOG);
Expand All @@ -485,4 +499,5 @@ BENCHMARK(sequential_copy)->Unit(kMicrosecond)->DenseRange(20, 25);
BENCHMARK(uint_multiplication)->Unit(kMicrosecond)->DenseRange(12, 27);
BENCHMARK(uint_extended_multiplication)->Unit(kMicrosecond)->DenseRange(12, 27);
BENCHMARK(pippenger)->Unit(kMicrosecond)->DenseRange(16, 20)->Setup(DoPippengerSetup)->Iterations(5);
BENCHMARK(bn254fr_random)->Unit(kMicrosecond)->DenseRange(10, 20);
BENCHMARK_MAIN();
15 changes: 15 additions & 0 deletions barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fq.test.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include "fq.hpp"
#include "barretenberg/numeric/random/engine.hpp"
#include "barretenberg/numeric/uint256/uint256.hpp"
#include "barretenberg/serialize/test_helper.hpp"
#include <gtest/gtest.h>

Expand Down Expand Up @@ -526,4 +528,17 @@ TEST(fq, NegAndSelfNeg0CmpRegression)
a_neg = 0;
a_neg.self_neg();
EXPECT_EQ((a == a_neg), true);
}

// This test shows that ((lo|hi)% modulus) in uint512_t is equivalent to (lo + 2^256 * hi) in field elements so we
// don't have to use the slow API (uint512_t' modulo operation)
TEST(fq, EquivalentRandomness)
{
auto& engine = numeric::get_debug_randomness();
uint512_t random_uint512 = engine.get_random_uint512();
auto random_lo = fq(random_uint512.lo);
auto random_hi = fq(random_uint512.hi);
uint512_t q(fq::modulus);
constexpr auto pow_2_256 = fq(uint256_t(1) << 128).sqr();
EXPECT_EQ(random_lo + pow_2_256 * random_hi, fq((random_uint512 % q).lo));
}
12 changes: 12 additions & 0 deletions barretenberg/cpp/src/barretenberg/ecc/curves/bn254/fr.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -378,4 +378,16 @@ TEST(fr, Uint256Conversions)

static_assert(a == c);
EXPECT_EQ(a, c);
}
// This test shows that ((lo|hi)% modulus) in uint512_t is equivalent to (lo + 2^256 * hi) in field elements so we
// don't have to use the slow API (uint512_t's modulo operation)
TEST(fr, EquivalentRandomness)
{
auto& engine = numeric::get_debug_randomness();
uint512_t random_uint512 = engine.get_random_uint512();
auto random_lo = fr(random_uint512.lo);
auto random_hi = fr(random_uint512.hi);
uint512_t r(fr::modulus);
constexpr auto pow_2_256 = fr(uint256_t(1) << 128).sqr();
EXPECT_EQ(random_lo + pow_2_256 * random_hi, fr((random_uint512 % r).lo));
}
18 changes: 13 additions & 5 deletions barretenberg/cpp/src/barretenberg/ecc/fields/field_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <vector>

#include "./field_declarations.hpp"
#include "barretenberg/numeric/uint256/uint256.hpp"

namespace bb {

Expand Down Expand Up @@ -687,11 +688,18 @@ template <class T> field<T> field<T>::random_element(numeric::RNG* engine) noexc
if (engine == nullptr) {
engine = &numeric::get_randomness();
}

uint512_t source = engine->get_random_uint512();
uint512_t q(modulus);
uint512_t reduced = source % q;
return field(reduced.lo);
constexpr field pow_2_256 = field(uint256_t(1) << 128).sqr();

@zac-williamson zac-williamson Oct 31, 2024

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How sensitive are we to performance here? If we wanted to maximize performance, I think the following algorithm would be fastest:

  1. generate random u512 t
  2. feed t directly into a Barrett reduction algorithm, producing output t*2^{-256} mod p
  3. multiply the output by 2^{512} mod p to return t * 2^{256} mod p (which is t mod p converted into Montgomery form, which is what the underlying field type expects)

The above requires 2 modular reductions. The current method performs 3, due to converting two random u256 values into field elements (along with additional arithmetic operations)

Very minor point, I don't know how performance sensitive we are to this algorithm

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Our old modulo algorithm was super slow, so this is a significant improvement. I tried checking what kind of an improvement would not converting to montgomery for 256-bit chunks make and it was minimal. The problem is that now that the division algorithm is not extremely slow, the bottleneck is sampling the uint512_t.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also we don't do 2^256 montgomery in wasm. It's different(

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for the context about the wasm. Replace 2^{256} with whatever the montgomery R parameter is and I think the above all applies.

But yes I appreciate it doesn't affect performance too much, it's the difference between 3 reductions and 2. It makes sense that generating the random bytes would be the limiting factor after your changes.

field lo;
field hi;
*(uint256_t*)lo.data = engine->get_random_uint256();
*(uint256_t*)hi.data = engine->get_random_uint256();
lo.self_reduce_once();
lo.self_reduce_once();
lo.self_reduce_once();
hi.self_reduce_once();
hi.self_reduce_once();
hi.self_reduce_once();
return lo + (pow_2_256 * hi);
}

template <class T> constexpr size_t field<T>::primitive_root_log_size() noexcept
Expand Down