From dfefd6f3b27c125aec012f81ef14ba61e58cee75 Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Wed, 5 Jul 2023 17:04:19 -0400 Subject: [PATCH 01/12] add flamegraph to benches --- benches/Cargo.toml | 1 + benches/src/argon2.rs | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/benches/Cargo.toml b/benches/Cargo.toml index 0a53e3ff..e36cbdba 100644 --- a/benches/Cargo.toml +++ b/benches/Cargo.toml @@ -9,6 +9,7 @@ publish = false [dev-dependencies] argon2 = { path = "../argon2" } criterion = { version = "0.4", features = ["html_reports"] } +pprof = { version = "0.11", features = ["flamegraph", "criterion"] } [[bench]] name = "argon2" diff --git a/benches/src/argon2.rs b/benches/src/argon2.rs index afa3849b..0007ef2c 100644 --- a/benches/src/argon2.rs +++ b/benches/src/argon2.rs @@ -1,5 +1,6 @@ use argon2::*; use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use pprof::criterion::{Output, PProfProfiler}; const BENCH_PASSWORD: &[u8] = b"hunter2"; const BENCH_SALT: &[u8] = b"pepper42"; @@ -80,7 +81,9 @@ fn bench_vary_p(c: &mut Criterion) { } criterion_group!( - benches, + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_default_params, bench_vary_m, bench_vary_t, From f0c0f0325acbb2623f711fc06b6cbe521e94fc13 Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Wed, 5 Jul 2023 17:05:51 -0400 Subject: [PATCH 02/12] argon2: WIP: optimize with AVX2 SIMD --- Cargo.lock | 5 +- argon2/Cargo.toml | 1 + argon2/src/block.rs | 283 +++++++++++++++++++++++++++++++++++++++++- benches/src/argon2.rs | 2 +- 4 files changed, 287 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0b47cfb6..3c24ac24 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,7 @@ version = "0.5.0" dependencies = [ "base64ct", "blake2", + "cpufeatures", "hex-literal", "password-hash", "zeroize", @@ -107,9 +108,9 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03e69e28e9f7f77debdedbaafa2866e1de9ba56df55a8bd7cfc724c25a09987c" +checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" dependencies = [ "libc", ] diff --git a/argon2/Cargo.toml b/argon2/Cargo.toml index ed8c1c9f..db759ae0 100644 --- a/argon2/Cargo.toml +++ b/argon2/Cargo.toml @@ -18,6 +18,7 @@ rust-version = "1.65" [dependencies] base64ct = "1" blake2 = { version = "0.10.6", default-features = false } +cpufeatures = "0.2.9" # optional dependencies password-hash = { version = "0.5", optional = true } diff --git a/argon2/src/block.rs b/argon2/src/block.rs index 6586f25b..e4c0f33a 100644 --- a/argon2/src/block.rs +++ b/argon2/src/block.rs @@ -44,6 +44,174 @@ macro_rules! permute { }; } +const fn _MM_SHUFFLE2(z: i32, y: i32, x: i32, w: i32) -> i32 { + (z << 6) | (y << 4) | (x << 2) | w +} + +macro_rules! rotr32 { + ($x:expr) => { + _mm256_shuffle_epi32($x, _MM_SHUFFLE2(2, 3, 0, 1)) + }; +} + +macro_rules! rotr24 { + ($x:expr) => { + _mm256_shuffle_epi8( + $x, + _mm256_setr_epi8( + 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, + 12, 13, 14, 15, 8, 9, 10, + ), + ) + }; +} + +macro_rules! rotr16 { + ($x:expr) => { + _mm256_shuffle_epi8( + $x, + _mm256_setr_epi8( + 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, + 11, 12, 13, 14, 15, 8, 9, + ), + ) + }; +} + +macro_rules! rotr63 { + ($x:expr) => { + _mm256_xor_si256(_mm256_srli_epi64($x, 63), _mm256_add_epi64($x, $x)) + }; +} + +macro_rules! G1_AVX2 { + ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ + let ml = _mm256_mul_epu32($A0, $B0); + let ml = _mm256_add_epi64(ml, ml); + $A0 = _mm256_add_epi64($A0, _mm256_add_epi64($B0, ml)); + $D0 = _mm256_xor_si256($D0, $A0); + $D0 = rotr32!($D0); + let ml = _mm256_mul_epu32($C0, $D0); + let ml = _mm256_add_epi64(ml, ml); + $C0 = _mm256_add_epi64($C0, _mm256_add_epi64($D0, ml)); + $B0 = _mm256_xor_si256($B0, $C0); + $B0 = rotr24!($B0); + let ml = _mm256_mul_epu32($A1, $B1); + let ml = _mm256_add_epi64(ml, ml); + $A1 = _mm256_add_epi64($A1, _mm256_add_epi64($B1, ml)); + $D1 = _mm256_xor_si256($D1, $A1); + $D1 = rotr32!($D1); + let ml = _mm256_mul_epu32($C1, $D1); + let ml = _mm256_add_epi64(ml, ml); + $C1 = _mm256_add_epi64($C1, _mm256_add_epi64($D1, ml)); + $B1 = _mm256_xor_si256($B1, $C1); + $B1 = rotr24!($B1); + }}; +} + +macro_rules! G2_AVX2 { + ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ + let ml = _mm256_mul_epu32($A0, $B0); + let ml = _mm256_add_epi64(ml, ml); + $A0 = _mm256_add_epi64($A0, _mm256_add_epi64($B0, ml)); + $D0 = _mm256_xor_si256($D0, $A0); + $D0 = rotr16!($D0); + let ml = _mm256_mul_epu32($C0, $D0); + let ml = _mm256_add_epi64(ml, ml); + $C0 = _mm256_add_epi64($C0, _mm256_add_epi64($D0, ml)); + $B0 = _mm256_xor_si256($B0, $C0); + $B0 = rotr63!($B0); + let ml = _mm256_mul_epu32($A1, $B1); + let ml = _mm256_add_epi64(ml, ml); + $A1 = _mm256_add_epi64($A1, _mm256_add_epi64($B1, ml)); + $D1 = _mm256_xor_si256($D1, $A1); + $D1 = rotr16!($D1); + let ml = _mm256_mul_epu32($C1, $D1); + let ml = _mm256_add_epi64(ml, ml); + $C1 = _mm256_add_epi64($C1, _mm256_add_epi64($D1, ml)); + $B1 = _mm256_xor_si256($B1, $C1); + $B1 = rotr63!($B1); + }}; +} + +macro_rules! DIAGONALIZE_1 { + ($A0:expr, $B0:expr, $C0:expr, $D0:expr, $A1:expr, $B1:expr, $C1:expr, $D1:expr) => {{ + $B0 = _mm256_permute4x64_epi64($B0, _MM_SHUFFLE2(0, 3, 2, 1)); + $C0 = _mm256_permute4x64_epi64($C0, _MM_SHUFFLE2(1, 0, 3, 2)); + $D0 = _mm256_permute4x64_epi64($D0, _MM_SHUFFLE2(2, 1, 0, 3)); + $B1 = _mm256_permute4x64_epi64($B1, _MM_SHUFFLE2(0, 3, 2, 1)); + $C1 = _mm256_permute4x64_epi64($C1, _MM_SHUFFLE2(1, 0, 3, 2)); + $D1 = _mm256_permute4x64_epi64($D1, _MM_SHUFFLE2(2, 1, 0, 3)); + }}; +} + +macro_rules! DIAGONALIZE_2 { + ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ + let tmp1 = _mm256_blend_epi32($B0, $B1, 0xCC); + let tmp2 = _mm256_blend_epi32($B0, $B1, 0x33); + $B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE2(2, 3, 0, 1)); + $B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE2(2, 3, 0, 1)); + let tmp1 = $C0; + $C0 = $C1; + $C1 = tmp1; + let tmp1 = _mm256_blend_epi32($D0, $D1, 0xCC); + let tmp2 = _mm256_blend_epi32($D0, $D1, 0x33); + $D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE2(2, 3, 0, 1)); + $D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE2(2, 3, 0, 1)); + }}; +} + +macro_rules! UNDIAGONALIZE_1 { + ($A0:expr, $B0:expr, $C0:expr, $D0:expr, $A1:expr, $B1:expr, $C1:expr, $D1:expr) => {{ + $B0 = _mm256_permute4x64_epi64($B0, _MM_SHUFFLE2(2, 1, 0, 3)); + $C0 = _mm256_permute4x64_epi64($C0, _MM_SHUFFLE2(1, 0, 3, 2)); + $D0 = _mm256_permute4x64_epi64($D0, _MM_SHUFFLE2(0, 3, 2, 1)); + $B1 = _mm256_permute4x64_epi64($B1, _MM_SHUFFLE2(2, 1, 0, 3)); + $C1 = _mm256_permute4x64_epi64($C1, _MM_SHUFFLE2(1, 0, 3, 2)); + $D1 = _mm256_permute4x64_epi64($D1, _MM_SHUFFLE2(0, 3, 2, 1)); + }}; +} + +macro_rules! UNDIAGONALIZE_2 { + ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ + let tmp1 = _mm256_blend_epi32($B0, $B1, 0xCC); + let tmp2 = _mm256_blend_epi32($B0, $B1, 0x33); + $B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE2(2, 3, 0, 1)); + $B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE2(2, 3, 0, 1)); + let tmp1 = $C0; + $C0 = $C1; + $C1 = tmp1; + let tmp1 = _mm256_blend_epi32($D0, $D1, 0x33); + let tmp2 = _mm256_blend_epi32($D0, $D1, 0xCC); + $D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE2(2, 3, 0, 1)); + $D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE2(2, 3, 0, 1)); + }}; +} + +macro_rules! BLAKE2_ROUND_1 { + ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ + G1_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); + G2_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); + DIAGONALIZE_1!($A0, $B0, $C0, $D0, $A1, $B1, $C1, $D1); + G1_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); + G2_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); + UNDIAGONALIZE_1!($A0, $B0, $C0, $D0, $A1, $B1, $C1, $D1); + }}; +} + +macro_rules! BLAKE2_ROUND_2 { + ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ + G1_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); + G2_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); + DIAGONALIZE_2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); + G1_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); + G2_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); + UNDIAGONALIZE_2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); + }}; +} + +cpufeatures::new!(avx2_cpuid, "avx2"); + /// Structure for the (1 KiB) memory block implemented as 128 64-bit words. #[derive(Copy, Clone, Debug)] #[repr(align(64))] @@ -67,6 +235,17 @@ impl Block { } pub(crate) fn compress(rhs: &Self, lhs: &Self) -> Self { + #[cfg(any(target_arch = "x86_64"))] + { + let (_, avx2) = avx2_cpuid::init_get(); + if avx2 { + return unsafe { Self::compress_avx2(rhs, lhs) }; + } + } + Self::compress_safe(rhs, lhs) + } + + fn compress_safe(rhs: &Self, lhs: &Self) -> Self { let r = *rhs ^ lhs; // Apply permutations rowwise @@ -101,6 +280,108 @@ impl Block { q ^= &r; q } + + #[cfg(any(target_arch = "x86_64"))] + unsafe fn compress_avx2(rhs: &Self, lhs: &Self) -> Self { + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + // extract the data into registers + let mut state = [ + _mm256_loadu_si256(rhs.0.as_ptr().offset(0 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(1 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(2 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(3 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(4 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(5 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(6 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(7 * 4) as *const __m256i), + ]; + + let mut block_xy = [ + _mm256_loadu_si256(lhs.0.as_ptr().offset(0 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(1 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(2 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(3 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(4 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(5 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(6 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(7 * 4) as *const __m256i), + ]; + + // xor registers + for i in 0..8 { + state[i] = _mm256_xor_si256(state[i], block_xy[i]); + } + + // for i in 0..4 { + // #[rustfmt::skip] + // BLAKE2_ROUND_1!( + // state[(i + 0) % 8], state[(i + 4) % 8], + // state[(i + 1) % 8], state[(i + 5) % 8], + // state[(i + 2) % 8], state[(i + 6) % 8], + // state[(i + 3) % 8], state[(i + 7) % 8] + // ); + // } + + // for i in 0..4 { + // BLAKE2_ROUND_2!( + // state[0 + i], + // state[1 + i], + // state[2 + i], + // state[3 + i], + // state[(4 + i) % 8], + // state[(5 + i) % 8], + // state[(6 + i) % 8], + // state[(7 + i) % 8] + // ); + // } + + // reapply registers + let mut r = Self::new(); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(0 * 4) as *mut __m256i, state[0]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(1 * 4) as *mut __m256i, state[1]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(2 * 4) as *mut __m256i, state[2]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(3 * 4) as *mut __m256i, state[3]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(4 * 4) as *mut __m256i, state[4]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(5 * 4) as *mut __m256i, state[5]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(6 * 4) as *mut __m256i, state[6]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(7 * 4) as *mut __m256i, state[7]); + + // let r = *rhs ^ lhs; + + // Apply permutations rowwise + let mut q = r; + for chunk in q.0.chunks_exact_mut(16) { + #[rustfmt::skip] + permute!( + chunk[0], chunk[1], chunk[2], chunk[3], + chunk[4], chunk[5], chunk[6], chunk[7], + chunk[8], chunk[9], chunk[10], chunk[11], + chunk[12], chunk[13], chunk[14], chunk[15], + ); + } + + // Apply permutations columnwise + for i in 0..8 { + let b = i * 2; + + #[rustfmt::skip] + permute!( + q.0[b], q.0[b + 1], + q.0[b + 16], q.0[b + 17], + q.0[b + 32], q.0[b + 33], + q.0[b + 48], q.0[b + 49], + q.0[b + 64], q.0[b + 65], + q.0[b + 80], q.0[b + 81], + q.0[b + 96], q.0[b + 97], + q.0[b + 112], q.0[b + 113], + ); + } + + q ^= &r; + q + } } impl Default for Block { @@ -132,7 +413,7 @@ impl BitXor<&Block> for Block { impl BitXorAssign<&Block> for Block { fn bitxor_assign(&mut self, rhs: &Block) { - for (dst, src) in self.0.iter_mut().zip(rhs.0.iter().copied()) { + for (dst, src) in self.0.iter_mut().zip(rhs.0.iter()) { *dst ^= src; } } diff --git a/benches/src/argon2.rs b/benches/src/argon2.rs index 0007ef2c..b26b6371 100644 --- a/benches/src/argon2.rs +++ b/benches/src/argon2.rs @@ -82,7 +82,7 @@ fn bench_vary_p(c: &mut Criterion) { criterion_group!( name = benches; - config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + config = Criterion::default().with_profiler(PProfProfiler::new(300, Output::Flamegraph(None))); targets = bench_default_params, bench_vary_m, From 87320dd546e17618c3cd46a8a1d5822294afaa06 Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Sun, 9 Jul 2023 10:38:01 -0400 Subject: [PATCH 03/12] add a unit test for comparing compress_safe and compress_av2 --- argon2/src/block.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/argon2/src/block.rs b/argon2/src/block.rs index e4c0f33a..7fa02e5c 100644 --- a/argon2/src/block.rs +++ b/argon2/src/block.rs @@ -214,6 +214,7 @@ cpufeatures::new!(avx2_cpuid, "avx2"); /// Structure for the (1 KiB) memory block implemented as 128 64-bit words. #[derive(Copy, Clone, Debug)] +#[cfg_attr(test, derive(PartialEq))] #[repr(align(64))] pub struct Block([u64; Self::SIZE / 8]); @@ -425,3 +426,32 @@ impl Zeroize for Block { self.0.zeroize(); } } + +#[cfg(test)] +mod test { + use super::*; + + #[cfg(target_arch = "x86_64")] + #[test] + fn compress_avx2() { + let lhs = Block([ + 0, 0, 0, 2048, 4, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]); + let rhs = Block([ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]); + + let result = Block::compress_safe(&rhs, &lhs); + let result_av2 = unsafe { Block::compress_avx2(&rhs, &lhs) }; + + assert_eq!(result, result_av2); + } +} From 04f91874bf48833e495e7199f1ae945ba005dd50 Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Sun, 9 Jul 2023 11:28:07 -0400 Subject: [PATCH 04/12] misc clean up --- argon2/src/block.rs | 151 +++++++++++++++++++++++--------------------- 1 file changed, 78 insertions(+), 73 deletions(-) diff --git a/argon2/src/block.rs b/argon2/src/block.rs index 7fa02e5c..cecfaaf7 100644 --- a/argon2/src/block.rs +++ b/argon2/src/block.rs @@ -44,13 +44,13 @@ macro_rules! permute { }; } -const fn _MM_SHUFFLE2(z: i32, y: i32, x: i32, w: i32) -> i32 { +const fn _mm_shuffle2(z: i32, y: i32, x: i32, w: i32) -> i32 { (z << 6) | (y << 4) | (x << 2) | w } macro_rules! rotr32 { ($x:expr) => { - _mm256_shuffle_epi32($x, _MM_SHUFFLE2(2, 3, 0, 1)) + _mm256_shuffle_epi32($x, _mm_shuffle2(2, 3, 0, 1)) }; } @@ -136,12 +136,12 @@ macro_rules! G2_AVX2 { macro_rules! DIAGONALIZE_1 { ($A0:expr, $B0:expr, $C0:expr, $D0:expr, $A1:expr, $B1:expr, $C1:expr, $D1:expr) => {{ - $B0 = _mm256_permute4x64_epi64($B0, _MM_SHUFFLE2(0, 3, 2, 1)); - $C0 = _mm256_permute4x64_epi64($C0, _MM_SHUFFLE2(1, 0, 3, 2)); - $D0 = _mm256_permute4x64_epi64($D0, _MM_SHUFFLE2(2, 1, 0, 3)); - $B1 = _mm256_permute4x64_epi64($B1, _MM_SHUFFLE2(0, 3, 2, 1)); - $C1 = _mm256_permute4x64_epi64($C1, _MM_SHUFFLE2(1, 0, 3, 2)); - $D1 = _mm256_permute4x64_epi64($D1, _MM_SHUFFLE2(2, 1, 0, 3)); + $B0 = _mm256_permute4x64_epi64($B0, _mm_shuffle2(0, 3, 2, 1)); + $C0 = _mm256_permute4x64_epi64($C0, _mm_shuffle2(1, 0, 3, 2)); + $D0 = _mm256_permute4x64_epi64($D0, _mm_shuffle2(2, 1, 0, 3)); + $B1 = _mm256_permute4x64_epi64($B1, _mm_shuffle2(0, 3, 2, 1)); + $C1 = _mm256_permute4x64_epi64($C1, _mm_shuffle2(1, 0, 3, 2)); + $D1 = _mm256_permute4x64_epi64($D1, _mm_shuffle2(2, 1, 0, 3)); }}; } @@ -149,26 +149,26 @@ macro_rules! DIAGONALIZE_2 { ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ let tmp1 = _mm256_blend_epi32($B0, $B1, 0xCC); let tmp2 = _mm256_blend_epi32($B0, $B1, 0x33); - $B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE2(2, 3, 0, 1)); - $B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE2(2, 3, 0, 1)); + $B1 = _mm256_permute4x64_epi64(tmp1, _mm_shuffle2(2, 3, 0, 1)); + $B0 = _mm256_permute4x64_epi64(tmp2, _mm_shuffle2(2, 3, 0, 1)); let tmp1 = $C0; $C0 = $C1; $C1 = tmp1; let tmp1 = _mm256_blend_epi32($D0, $D1, 0xCC); let tmp2 = _mm256_blend_epi32($D0, $D1, 0x33); - $D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE2(2, 3, 0, 1)); - $D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE2(2, 3, 0, 1)); + $D0 = _mm256_permute4x64_epi64(tmp1, _mm_shuffle2(2, 3, 0, 1)); + $D1 = _mm256_permute4x64_epi64(tmp2, _mm_shuffle2(2, 3, 0, 1)); }}; } macro_rules! UNDIAGONALIZE_1 { ($A0:expr, $B0:expr, $C0:expr, $D0:expr, $A1:expr, $B1:expr, $C1:expr, $D1:expr) => {{ - $B0 = _mm256_permute4x64_epi64($B0, _MM_SHUFFLE2(2, 1, 0, 3)); - $C0 = _mm256_permute4x64_epi64($C0, _MM_SHUFFLE2(1, 0, 3, 2)); - $D0 = _mm256_permute4x64_epi64($D0, _MM_SHUFFLE2(0, 3, 2, 1)); - $B1 = _mm256_permute4x64_epi64($B1, _MM_SHUFFLE2(2, 1, 0, 3)); - $C1 = _mm256_permute4x64_epi64($C1, _MM_SHUFFLE2(1, 0, 3, 2)); - $D1 = _mm256_permute4x64_epi64($D1, _MM_SHUFFLE2(0, 3, 2, 1)); + $B0 = _mm256_permute4x64_epi64($B0, _mm_shuffle2(2, 1, 0, 3)); + $C0 = _mm256_permute4x64_epi64($C0, _mm_shuffle2(1, 0, 3, 2)); + $D0 = _mm256_permute4x64_epi64($D0, _mm_shuffle2(0, 3, 2, 1)); + $B1 = _mm256_permute4x64_epi64($B1, _mm_shuffle2(2, 1, 0, 3)); + $C1 = _mm256_permute4x64_epi64($C1, _mm_shuffle2(1, 0, 3, 2)); + $D1 = _mm256_permute4x64_epi64($D1, _mm_shuffle2(0, 3, 2, 1)); }}; } @@ -176,15 +176,15 @@ macro_rules! UNDIAGONALIZE_2 { ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ let tmp1 = _mm256_blend_epi32($B0, $B1, 0xCC); let tmp2 = _mm256_blend_epi32($B0, $B1, 0x33); - $B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE2(2, 3, 0, 1)); - $B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE2(2, 3, 0, 1)); + $B0 = _mm256_permute4x64_epi64(tmp1, _mm_shuffle2(2, 3, 0, 1)); + $B1 = _mm256_permute4x64_epi64(tmp2, _mm_shuffle2(2, 3, 0, 1)); let tmp1 = $C0; $C0 = $C1; $C1 = tmp1; let tmp1 = _mm256_blend_epi32($D0, $D1, 0x33); let tmp2 = _mm256_blend_epi32($D0, $D1, 0xCC); - $D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE2(2, 3, 0, 1)); - $D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE2(2, 3, 0, 1)); + $D0 = _mm256_permute4x64_epi64(tmp1, _mm_shuffle2(2, 3, 0, 1)); + $D1 = _mm256_permute4x64_epi64(tmp2, _mm_shuffle2(2, 3, 0, 1)); }}; } @@ -299,7 +299,7 @@ impl Block { _mm256_loadu_si256(rhs.0.as_ptr().offset(7 * 4) as *const __m256i), ]; - let mut block_xy = [ + let block_xy = [ _mm256_loadu_si256(lhs.0.as_ptr().offset(0 * 4) as *const __m256i), _mm256_loadu_si256(lhs.0.as_ptr().offset(1 * 4) as *const __m256i), _mm256_loadu_si256(lhs.0.as_ptr().offset(2 * 4) as *const __m256i), @@ -315,28 +315,33 @@ impl Block { state[i] = _mm256_xor_si256(state[i], block_xy[i]); } - // for i in 0..4 { - // #[rustfmt::skip] - // BLAKE2_ROUND_1!( - // state[(i + 0) % 8], state[(i + 4) % 8], - // state[(i + 1) % 8], state[(i + 5) % 8], - // state[(i + 2) % 8], state[(i + 6) % 8], - // state[(i + 3) % 8], state[(i + 7) % 8] - // ); - // } + for i in 0..4 { + #[rustfmt::skip] + BLAKE2_ROUND_1!( + state[(i + 0) % 8], state[(i + 4) % 8], + state[(i + 1) % 8], state[(i + 5) % 8], + state[(i + 2) % 8], state[(i + 6) % 8], + state[(i + 3) % 8], state[(i + 7) % 8] + ); + } - // for i in 0..4 { - // BLAKE2_ROUND_2!( - // state[0 + i], - // state[1 + i], - // state[2 + i], - // state[3 + i], - // state[(4 + i) % 8], - // state[(5 + i) % 8], - // state[(6 + i) % 8], - // state[(7 + i) % 8] - // ); - // } + for i in 0..4 { + BLAKE2_ROUND_2!( + state[0 + i], + state[1 + i], + state[2 + i], + state[3 + i], + state[(4 + i) % 8], + state[(5 + i) % 8], + state[(6 + i) % 8], + state[(7 + i) % 8] + ); + } + + // xor registers + for i in 0..8 { + state[i] = _mm256_xor_si256(state[i], block_xy[i]); + } // reapply registers let mut r = Self::new(); @@ -349,39 +354,39 @@ impl Block { _mm256_storeu_si256(r.0.as_mut_ptr().offset(6 * 4) as *mut __m256i, state[6]); _mm256_storeu_si256(r.0.as_mut_ptr().offset(7 * 4) as *mut __m256i, state[7]); - // let r = *rhs ^ lhs; + // // Apply permutations rowwise + // let mut q = r; + // for chunk in q.0.chunks_exact_mut(16) { + // #[rustfmt::skip] + // permute!( + // chunk[0], chunk[1], chunk[2], chunk[3], + // chunk[4], chunk[5], chunk[6], chunk[7], + // chunk[8], chunk[9], chunk[10], chunk[11], + // chunk[12], chunk[13], chunk[14], chunk[15], + // ); + // } - // Apply permutations rowwise - let mut q = r; - for chunk in q.0.chunks_exact_mut(16) { - #[rustfmt::skip] - permute!( - chunk[0], chunk[1], chunk[2], chunk[3], - chunk[4], chunk[5], chunk[6], chunk[7], - chunk[8], chunk[9], chunk[10], chunk[11], - chunk[12], chunk[13], chunk[14], chunk[15], - ); - } + // // Apply permutations columnwise + // for i in 0..8 { + // let b = i * 2; - // Apply permutations columnwise - for i in 0..8 { - let b = i * 2; + // #[rustfmt::skip] + // permute!( + // q.0[b], q.0[b + 1], + // q.0[b + 16], q.0[b + 17], + // q.0[b + 32], q.0[b + 33], + // q.0[b + 48], q.0[b + 49], + // q.0[b + 64], q.0[b + 65], + // q.0[b + 80], q.0[b + 81], + // q.0[b + 96], q.0[b + 97], + // q.0[b + 112], q.0[b + 113], + // ); + // } - #[rustfmt::skip] - permute!( - q.0[b], q.0[b + 1], - q.0[b + 16], q.0[b + 17], - q.0[b + 32], q.0[b + 33], - q.0[b + 48], q.0[b + 49], - q.0[b + 64], q.0[b + 65], - q.0[b + 80], q.0[b + 81], - q.0[b + 96], q.0[b + 97], - q.0[b + 112], q.0[b + 113], - ); - } + // q ^= &r; + // q - q ^= &r; - q + r } } From 3a681ef75f5a257eabed99ae44831b84d2552aa6 Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Sun, 9 Jul 2023 12:09:00 -0400 Subject: [PATCH 05/12] fix some blatently wrong math, whoops --- argon2/src/block.rs | 134 ++++++++++++++++++++++++++++---------------- 1 file changed, 87 insertions(+), 47 deletions(-) diff --git a/argon2/src/block.rs b/argon2/src/block.rs index cecfaaf7..d46cd00b 100644 --- a/argon2/src/block.rs +++ b/argon2/src/block.rs @@ -287,7 +287,10 @@ impl Block { #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; - // extract the data into registers + // one u64 is 64 bits, so 4 u64s is 256 bits + // 256 bits * 32 = 8192 bits = 1024 bytes + + // extract the data into 32 256-bit registers let mut state = [ _mm256_loadu_si256(rhs.0.as_ptr().offset(0 * 4) as *const __m256i), _mm256_loadu_si256(rhs.0.as_ptr().offset(1 * 4) as *const __m256i), @@ -297,6 +300,30 @@ impl Block { _mm256_loadu_si256(rhs.0.as_ptr().offset(5 * 4) as *const __m256i), _mm256_loadu_si256(rhs.0.as_ptr().offset(6 * 4) as *const __m256i), _mm256_loadu_si256(rhs.0.as_ptr().offset(7 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(8 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(9 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(10 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(11 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(12 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(13 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(14 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(15 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(16 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(17 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(18 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(19 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(20 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(21 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(22 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(23 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(24 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(25 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(26 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(27 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(28 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(29 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(30 * 4) as *const __m256i), + _mm256_loadu_si256(rhs.0.as_ptr().offset(31 * 4) as *const __m256i), ]; let block_xy = [ @@ -308,38 +335,59 @@ impl Block { _mm256_loadu_si256(lhs.0.as_ptr().offset(5 * 4) as *const __m256i), _mm256_loadu_si256(lhs.0.as_ptr().offset(6 * 4) as *const __m256i), _mm256_loadu_si256(lhs.0.as_ptr().offset(7 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(8 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(9 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(10 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(11 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(12 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(13 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(14 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(15 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(16 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(17 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(18 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(19 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(20 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(21 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(22 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(23 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(24 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(25 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(26 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(27 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(28 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(29 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(30 * 4) as *const __m256i), + _mm256_loadu_si256(lhs.0.as_ptr().offset(31 * 4) as *const __m256i), ]; // xor registers - for i in 0..8 { + for i in 0..state.len() { state[i] = _mm256_xor_si256(state[i], block_xy[i]); } for i in 0..4 { #[rustfmt::skip] BLAKE2_ROUND_1!( - state[(i + 0) % 8], state[(i + 4) % 8], - state[(i + 1) % 8], state[(i + 5) % 8], - state[(i + 2) % 8], state[(i + 6) % 8], - state[(i + 3) % 8], state[(i + 7) % 8] + state[i + 0], state[i + 4], + state[i + 1], state[i + 5], + state[i + 2], state[i + 6], + state[i + 3], state[i + 7] ); } for i in 0..4 { + #[rustfmt::skip] BLAKE2_ROUND_2!( - state[0 + i], - state[1 + i], - state[2 + i], - state[3 + i], - state[(4 + i) % 8], - state[(5 + i) % 8], - state[(6 + i) % 8], - state[(7 + i) % 8] + state[0 + i], state[1 + i], + state[2 + i], state[3 + i], + state[4 + i], state[5 + i], + state[6 + i], state[7 + i] ); } // xor registers - for i in 0..8 { + for i in 0..state.len() { state[i] = _mm256_xor_si256(state[i], block_xy[i]); } @@ -353,38 +401,30 @@ impl Block { _mm256_storeu_si256(r.0.as_mut_ptr().offset(5 * 4) as *mut __m256i, state[5]); _mm256_storeu_si256(r.0.as_mut_ptr().offset(6 * 4) as *mut __m256i, state[6]); _mm256_storeu_si256(r.0.as_mut_ptr().offset(7 * 4) as *mut __m256i, state[7]); - - // // Apply permutations rowwise - // let mut q = r; - // for chunk in q.0.chunks_exact_mut(16) { - // #[rustfmt::skip] - // permute!( - // chunk[0], chunk[1], chunk[2], chunk[3], - // chunk[4], chunk[5], chunk[6], chunk[7], - // chunk[8], chunk[9], chunk[10], chunk[11], - // chunk[12], chunk[13], chunk[14], chunk[15], - // ); - // } - - // // Apply permutations columnwise - // for i in 0..8 { - // let b = i * 2; - - // #[rustfmt::skip] - // permute!( - // q.0[b], q.0[b + 1], - // q.0[b + 16], q.0[b + 17], - // q.0[b + 32], q.0[b + 33], - // q.0[b + 48], q.0[b + 49], - // q.0[b + 64], q.0[b + 65], - // q.0[b + 80], q.0[b + 81], - // q.0[b + 96], q.0[b + 97], - // q.0[b + 112], q.0[b + 113], - // ); - // } - - // q ^= &r; - // q + _mm256_storeu_si256(r.0.as_mut_ptr().offset(8 * 4) as *mut __m256i, state[8]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(9 * 4) as *mut __m256i, state[9]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(10 * 4) as *mut __m256i, state[10]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(11 * 4) as *mut __m256i, state[11]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(12 * 4) as *mut __m256i, state[12]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(13 * 4) as *mut __m256i, state[13]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(14 * 4) as *mut __m256i, state[14]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(15 * 4) as *mut __m256i, state[15]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(16 * 4) as *mut __m256i, state[16]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(17 * 4) as *mut __m256i, state[17]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(18 * 4) as *mut __m256i, state[18]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(19 * 4) as *mut __m256i, state[19]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(20 * 4) as *mut __m256i, state[20]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(21 * 4) as *mut __m256i, state[21]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(22 * 4) as *mut __m256i, state[22]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(23 * 4) as *mut __m256i, state[23]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(24 * 4) as *mut __m256i, state[24]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(25 * 4) as *mut __m256i, state[25]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(26 * 4) as *mut __m256i, state[26]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(27 * 4) as *mut __m256i, state[27]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(28 * 4) as *mut __m256i, state[28]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(29 * 4) as *mut __m256i, state[29]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(30 * 4) as *mut __m256i, state[30]); + _mm256_storeu_si256(r.0.as_mut_ptr().offset(31 * 4) as *mut __m256i, state[31]); r } From 814a367992b6bb881b52460933d6050fa7ee5986 Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Sun, 9 Jul 2023 12:29:55 -0400 Subject: [PATCH 06/12] roll up a for loop --- argon2/src/block.rs | 38 ++++++-------------------------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/argon2/src/block.rs b/argon2/src/block.rs index d46cd00b..227b7eb6 100644 --- a/argon2/src/block.rs +++ b/argon2/src/block.rs @@ -393,38 +393,12 @@ impl Block { // reapply registers let mut r = Self::new(); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(0 * 4) as *mut __m256i, state[0]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(1 * 4) as *mut __m256i, state[1]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(2 * 4) as *mut __m256i, state[2]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(3 * 4) as *mut __m256i, state[3]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(4 * 4) as *mut __m256i, state[4]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(5 * 4) as *mut __m256i, state[5]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(6 * 4) as *mut __m256i, state[6]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(7 * 4) as *mut __m256i, state[7]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(8 * 4) as *mut __m256i, state[8]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(9 * 4) as *mut __m256i, state[9]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(10 * 4) as *mut __m256i, state[10]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(11 * 4) as *mut __m256i, state[11]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(12 * 4) as *mut __m256i, state[12]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(13 * 4) as *mut __m256i, state[13]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(14 * 4) as *mut __m256i, state[14]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(15 * 4) as *mut __m256i, state[15]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(16 * 4) as *mut __m256i, state[16]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(17 * 4) as *mut __m256i, state[17]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(18 * 4) as *mut __m256i, state[18]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(19 * 4) as *mut __m256i, state[19]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(20 * 4) as *mut __m256i, state[20]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(21 * 4) as *mut __m256i, state[21]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(22 * 4) as *mut __m256i, state[22]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(23 * 4) as *mut __m256i, state[23]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(24 * 4) as *mut __m256i, state[24]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(25 * 4) as *mut __m256i, state[25]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(26 * 4) as *mut __m256i, state[26]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(27 * 4) as *mut __m256i, state[27]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(28 * 4) as *mut __m256i, state[28]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(29 * 4) as *mut __m256i, state[29]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(30 * 4) as *mut __m256i, state[30]); - _mm256_storeu_si256(r.0.as_mut_ptr().offset(31 * 4) as *mut __m256i, state[31]); + for i in 0..state.len() { + _mm256_storeu_si256( + r.0.as_mut_ptr().offset(i as isize * 4) as *mut __m256i, + state[i], + ); + } r } From a7a6321c8fddb25e132d6eb185b232ec578ec5bb Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Sun, 9 Jul 2023 13:56:53 -0400 Subject: [PATCH 07/12] fix more wrong math --- argon2/src/block.rs | 63 +++++++++++++-------------------------------- 1 file changed, 18 insertions(+), 45 deletions(-) diff --git a/argon2/src/block.rs b/argon2/src/block.rs index 227b7eb6..79eb9cc2 100644 --- a/argon2/src/block.rs +++ b/argon2/src/block.rs @@ -291,6 +291,7 @@ impl Block { // 256 bits * 32 = 8192 bits = 1024 bytes // extract the data into 32 256-bit registers + let mut state = [ _mm256_loadu_si256(rhs.0.as_ptr().offset(0 * 4) as *const __m256i), _mm256_loadu_si256(rhs.0.as_ptr().offset(1 * 4) as *const __m256i), @@ -326,69 +327,41 @@ impl Block { _mm256_loadu_si256(rhs.0.as_ptr().offset(31 * 4) as *const __m256i), ]; - let block_xy = [ - _mm256_loadu_si256(lhs.0.as_ptr().offset(0 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(1 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(2 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(3 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(4 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(5 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(6 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(7 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(8 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(9 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(10 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(11 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(12 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(13 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(14 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(15 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(16 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(17 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(18 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(19 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(20 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(21 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(22 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(23 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(24 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(25 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(26 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(27 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(28 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(29 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(30 * 4) as *const __m256i), - _mm256_loadu_si256(lhs.0.as_ptr().offset(31 * 4) as *const __m256i), - ]; + // because there are only 32 YMM registers, we need to do the xor immedately after loading to get the compiler to emit ymmword ptr - // xor registers for i in 0..state.len() { - state[i] = _mm256_xor_si256(state[i], block_xy[i]); + state[i] = _mm256_xor_si256( + state[i], + _mm256_loadu_si256(lhs.0.as_ptr().offset(i as isize * 4) as *const __m256i), + ); } for i in 0..4 { #[rustfmt::skip] BLAKE2_ROUND_1!( - state[i + 0], state[i + 4], - state[i + 1], state[i + 5], - state[i + 2], state[i + 6], - state[i + 3], state[i + 7] + state[8 * i + 0], state[8 * i + 4], + state[8 * i + 1], state[8 * i + 5], + state[8 * i + 2], state[8 * i + 6], + state[8 * i + 3], state[8 * i + 7] ); } for i in 0..4 { #[rustfmt::skip] BLAKE2_ROUND_2!( - state[0 + i], state[1 + i], - state[2 + i], state[3 + i], - state[4 + i], state[5 + i], - state[6 + i], state[7 + i] + state[0 + i], state[4 + i], + state[8 + i], state[12 + i], + state[16 + i], state[20 + i], + state[24 + i], state[28 + i] ); } // xor registers for i in 0..state.len() { - state[i] = _mm256_xor_si256(state[i], block_xy[i]); + state[i] = _mm256_xor_si256( + state[i], + _mm256_loadu_si256(lhs.0.as_ptr().offset(i as isize * 4) as *const __m256i), + ); } // reapply registers From 70e34def273b35a6d95f3e12c26cf9e700b54d57 Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Mon, 10 Jul 2023 07:22:25 -0400 Subject: [PATCH 08/12] use an alternate method to get the compiler to emit simd code --- argon2/src/block.rs | 272 ++++---------------------------------------- 1 file changed, 25 insertions(+), 247 deletions(-) diff --git a/argon2/src/block.rs b/argon2/src/block.rs index 79eb9cc2..d5dd4c7c 100644 --- a/argon2/src/block.rs +++ b/argon2/src/block.rs @@ -44,172 +44,6 @@ macro_rules! permute { }; } -const fn _mm_shuffle2(z: i32, y: i32, x: i32, w: i32) -> i32 { - (z << 6) | (y << 4) | (x << 2) | w -} - -macro_rules! rotr32 { - ($x:expr) => { - _mm256_shuffle_epi32($x, _mm_shuffle2(2, 3, 0, 1)) - }; -} - -macro_rules! rotr24 { - ($x:expr) => { - _mm256_shuffle_epi8( - $x, - _mm256_setr_epi8( - 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, - 12, 13, 14, 15, 8, 9, 10, - ), - ) - }; -} - -macro_rules! rotr16 { - ($x:expr) => { - _mm256_shuffle_epi8( - $x, - _mm256_setr_epi8( - 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, - 11, 12, 13, 14, 15, 8, 9, - ), - ) - }; -} - -macro_rules! rotr63 { - ($x:expr) => { - _mm256_xor_si256(_mm256_srli_epi64($x, 63), _mm256_add_epi64($x, $x)) - }; -} - -macro_rules! G1_AVX2 { - ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ - let ml = _mm256_mul_epu32($A0, $B0); - let ml = _mm256_add_epi64(ml, ml); - $A0 = _mm256_add_epi64($A0, _mm256_add_epi64($B0, ml)); - $D0 = _mm256_xor_si256($D0, $A0); - $D0 = rotr32!($D0); - let ml = _mm256_mul_epu32($C0, $D0); - let ml = _mm256_add_epi64(ml, ml); - $C0 = _mm256_add_epi64($C0, _mm256_add_epi64($D0, ml)); - $B0 = _mm256_xor_si256($B0, $C0); - $B0 = rotr24!($B0); - let ml = _mm256_mul_epu32($A1, $B1); - let ml = _mm256_add_epi64(ml, ml); - $A1 = _mm256_add_epi64($A1, _mm256_add_epi64($B1, ml)); - $D1 = _mm256_xor_si256($D1, $A1); - $D1 = rotr32!($D1); - let ml = _mm256_mul_epu32($C1, $D1); - let ml = _mm256_add_epi64(ml, ml); - $C1 = _mm256_add_epi64($C1, _mm256_add_epi64($D1, ml)); - $B1 = _mm256_xor_si256($B1, $C1); - $B1 = rotr24!($B1); - }}; -} - -macro_rules! G2_AVX2 { - ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ - let ml = _mm256_mul_epu32($A0, $B0); - let ml = _mm256_add_epi64(ml, ml); - $A0 = _mm256_add_epi64($A0, _mm256_add_epi64($B0, ml)); - $D0 = _mm256_xor_si256($D0, $A0); - $D0 = rotr16!($D0); - let ml = _mm256_mul_epu32($C0, $D0); - let ml = _mm256_add_epi64(ml, ml); - $C0 = _mm256_add_epi64($C0, _mm256_add_epi64($D0, ml)); - $B0 = _mm256_xor_si256($B0, $C0); - $B0 = rotr63!($B0); - let ml = _mm256_mul_epu32($A1, $B1); - let ml = _mm256_add_epi64(ml, ml); - $A1 = _mm256_add_epi64($A1, _mm256_add_epi64($B1, ml)); - $D1 = _mm256_xor_si256($D1, $A1); - $D1 = rotr16!($D1); - let ml = _mm256_mul_epu32($C1, $D1); - let ml = _mm256_add_epi64(ml, ml); - $C1 = _mm256_add_epi64($C1, _mm256_add_epi64($D1, ml)); - $B1 = _mm256_xor_si256($B1, $C1); - $B1 = rotr63!($B1); - }}; -} - -macro_rules! DIAGONALIZE_1 { - ($A0:expr, $B0:expr, $C0:expr, $D0:expr, $A1:expr, $B1:expr, $C1:expr, $D1:expr) => {{ - $B0 = _mm256_permute4x64_epi64($B0, _mm_shuffle2(0, 3, 2, 1)); - $C0 = _mm256_permute4x64_epi64($C0, _mm_shuffle2(1, 0, 3, 2)); - $D0 = _mm256_permute4x64_epi64($D0, _mm_shuffle2(2, 1, 0, 3)); - $B1 = _mm256_permute4x64_epi64($B1, _mm_shuffle2(0, 3, 2, 1)); - $C1 = _mm256_permute4x64_epi64($C1, _mm_shuffle2(1, 0, 3, 2)); - $D1 = _mm256_permute4x64_epi64($D1, _mm_shuffle2(2, 1, 0, 3)); - }}; -} - -macro_rules! DIAGONALIZE_2 { - ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ - let tmp1 = _mm256_blend_epi32($B0, $B1, 0xCC); - let tmp2 = _mm256_blend_epi32($B0, $B1, 0x33); - $B1 = _mm256_permute4x64_epi64(tmp1, _mm_shuffle2(2, 3, 0, 1)); - $B0 = _mm256_permute4x64_epi64(tmp2, _mm_shuffle2(2, 3, 0, 1)); - let tmp1 = $C0; - $C0 = $C1; - $C1 = tmp1; - let tmp1 = _mm256_blend_epi32($D0, $D1, 0xCC); - let tmp2 = _mm256_blend_epi32($D0, $D1, 0x33); - $D0 = _mm256_permute4x64_epi64(tmp1, _mm_shuffle2(2, 3, 0, 1)); - $D1 = _mm256_permute4x64_epi64(tmp2, _mm_shuffle2(2, 3, 0, 1)); - }}; -} - -macro_rules! UNDIAGONALIZE_1 { - ($A0:expr, $B0:expr, $C0:expr, $D0:expr, $A1:expr, $B1:expr, $C1:expr, $D1:expr) => {{ - $B0 = _mm256_permute4x64_epi64($B0, _mm_shuffle2(2, 1, 0, 3)); - $C0 = _mm256_permute4x64_epi64($C0, _mm_shuffle2(1, 0, 3, 2)); - $D0 = _mm256_permute4x64_epi64($D0, _mm_shuffle2(0, 3, 2, 1)); - $B1 = _mm256_permute4x64_epi64($B1, _mm_shuffle2(2, 1, 0, 3)); - $C1 = _mm256_permute4x64_epi64($C1, _mm_shuffle2(1, 0, 3, 2)); - $D1 = _mm256_permute4x64_epi64($D1, _mm_shuffle2(0, 3, 2, 1)); - }}; -} - -macro_rules! UNDIAGONALIZE_2 { - ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ - let tmp1 = _mm256_blend_epi32($B0, $B1, 0xCC); - let tmp2 = _mm256_blend_epi32($B0, $B1, 0x33); - $B0 = _mm256_permute4x64_epi64(tmp1, _mm_shuffle2(2, 3, 0, 1)); - $B1 = _mm256_permute4x64_epi64(tmp2, _mm_shuffle2(2, 3, 0, 1)); - let tmp1 = $C0; - $C0 = $C1; - $C1 = tmp1; - let tmp1 = _mm256_blend_epi32($D0, $D1, 0x33); - let tmp2 = _mm256_blend_epi32($D0, $D1, 0xCC); - $D0 = _mm256_permute4x64_epi64(tmp1, _mm_shuffle2(2, 3, 0, 1)); - $D1 = _mm256_permute4x64_epi64(tmp2, _mm_shuffle2(2, 3, 0, 1)); - }}; -} - -macro_rules! BLAKE2_ROUND_1 { - ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ - G1_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); - G2_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); - DIAGONALIZE_1!($A0, $B0, $C0, $D0, $A1, $B1, $C1, $D1); - G1_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); - G2_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); - UNDIAGONALIZE_1!($A0, $B0, $C0, $D0, $A1, $B1, $C1, $D1); - }}; -} - -macro_rules! BLAKE2_ROUND_2 { - ($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{ - G1_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); - G2_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); - DIAGONALIZE_2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); - G1_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); - G2_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); - UNDIAGONALIZE_2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1); - }}; -} - cpufeatures::new!(avx2_cpuid, "avx2"); /// Structure for the (1 KiB) memory block implemented as 128 64-bit words. @@ -283,97 +117,41 @@ impl Block { } #[cfg(any(target_arch = "x86_64"))] + #[target_feature(enable = "avx2")] unsafe fn compress_avx2(rhs: &Self, lhs: &Self) -> Self { - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - // one u64 is 64 bits, so 4 u64s is 256 bits - // 256 bits * 32 = 8192 bits = 1024 bytes - - // extract the data into 32 256-bit registers - - let mut state = [ - _mm256_loadu_si256(rhs.0.as_ptr().offset(0 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(1 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(2 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(3 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(4 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(5 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(6 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(7 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(8 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(9 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(10 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(11 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(12 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(13 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(14 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(15 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(16 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(17 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(18 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(19 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(20 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(21 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(22 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(23 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(24 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(25 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(26 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(27 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(28 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(29 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(30 * 4) as *const __m256i), - _mm256_loadu_si256(rhs.0.as_ptr().offset(31 * 4) as *const __m256i), - ]; - - // because there are only 32 YMM registers, we need to do the xor immedately after loading to get the compiler to emit ymmword ptr - - for i in 0..state.len() { - state[i] = _mm256_xor_si256( - state[i], - _mm256_loadu_si256(lhs.0.as_ptr().offset(i as isize * 4) as *const __m256i), - ); - } - - for i in 0..4 { - #[rustfmt::skip] - BLAKE2_ROUND_1!( - state[8 * i + 0], state[8 * i + 4], - state[8 * i + 1], state[8 * i + 5], - state[8 * i + 2], state[8 * i + 6], - state[8 * i + 3], state[8 * i + 7] - ); - } + let r = *rhs ^ lhs; - for i in 0..4 { + // Apply permutations rowwise + let mut q = r; + for chunk in q.0.chunks_exact_mut(16) { #[rustfmt::skip] - BLAKE2_ROUND_2!( - state[0 + i], state[4 + i], - state[8 + i], state[12 + i], - state[16 + i], state[20 + i], - state[24 + i], state[28 + i] + permute!( + chunk[0], chunk[1], chunk[2], chunk[3], + chunk[4], chunk[5], chunk[6], chunk[7], + chunk[8], chunk[9], chunk[10], chunk[11], + chunk[12], chunk[13], chunk[14], chunk[15], ); } - // xor registers - for i in 0..state.len() { - state[i] = _mm256_xor_si256( - state[i], - _mm256_loadu_si256(lhs.0.as_ptr().offset(i as isize * 4) as *const __m256i), - ); - } + // Apply permutations columnwise + for i in 0..8 { + let b = i * 2; - // reapply registers - let mut r = Self::new(); - for i in 0..state.len() { - _mm256_storeu_si256( - r.0.as_mut_ptr().offset(i as isize * 4) as *mut __m256i, - state[i], + #[rustfmt::skip] + permute!( + q.0[b], q.0[b + 1], + q.0[b + 16], q.0[b + 17], + q.0[b + 32], q.0[b + 33], + q.0[b + 48], q.0[b + 49], + q.0[b + 64], q.0[b + 65], + q.0[b + 80], q.0[b + 81], + q.0[b + 96], q.0[b + 97], + q.0[b + 112], q.0[b + 113], ); } - r + q ^= &r; + q } } From 7f5515a7e1d02ea56d759d2aed855e048417d63c Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Mon, 10 Jul 2023 08:09:02 -0400 Subject: [PATCH 09/12] argon2: adjust which target_arch uses cpufeatures --- argon2/Cargo.toml | 4 +++- argon2/src/block.rs | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/argon2/Cargo.toml b/argon2/Cargo.toml index db759ae0..32ae1f9c 100644 --- a/argon2/Cargo.toml +++ b/argon2/Cargo.toml @@ -18,12 +18,14 @@ rust-version = "1.65" [dependencies] base64ct = "1" blake2 = { version = "0.10.6", default-features = false } -cpufeatures = "0.2.9" # optional dependencies password-hash = { version = "0.5", optional = true } zeroize = { version = "1", default-features = false, optional = true } +[target.'cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))'.dependencies] +cpufeatures = "0.2.9" + [dev-dependencies] hex-literal = "0.4" password-hash = { version = "0.5", features = ["rand_core"] } diff --git a/argon2/src/block.rs b/argon2/src/block.rs index d5dd4c7c..881d096d 100644 --- a/argon2/src/block.rs +++ b/argon2/src/block.rs @@ -44,6 +44,7 @@ macro_rules! permute { }; } +#[cfg(any(target_arch = "x86_64"))] cpufeatures::new!(avx2_cpuid, "avx2"); /// Structure for the (1 KiB) memory block implemented as 128 64-bit words. From e204222261d44f51e95e5cd8195eecd9963baaa3 Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Mon, 10 Jul 2023 12:48:21 -0400 Subject: [PATCH 10/12] rename `compress_safe` to `compress_soft` --- argon2/src/block.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/argon2/src/block.rs b/argon2/src/block.rs index 881d096d..6bb663e2 100644 --- a/argon2/src/block.rs +++ b/argon2/src/block.rs @@ -78,10 +78,10 @@ impl Block { return unsafe { Self::compress_avx2(rhs, lhs) }; } } - Self::compress_safe(rhs, lhs) + Self::compress_soft(rhs, lhs) } - fn compress_safe(rhs: &Self, lhs: &Self) -> Self { + fn compress_soft(rhs: &Self, lhs: &Self) -> Self { let r = *rhs ^ lhs; // Apply permutations rowwise @@ -220,7 +220,7 @@ mod test { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); - let result = Block::compress_safe(&rhs, &lhs); + let result = Block::compress_soft(&rhs, &lhs); let result_av2 = unsafe { Block::compress_avx2(&rhs, &lhs) }; assert_eq!(result, result_av2); From 3faaff1e170d8c8a0169c3f52cc65d5caab18751 Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Mon, 10 Jul 2023 13:03:10 -0400 Subject: [PATCH 11/12] fix minor requested code changes --- argon2/Cargo.toml | 2 +- argon2/src/block.rs | 66 ++++++++------------------------------------- 2 files changed, 12 insertions(+), 56 deletions(-) diff --git a/argon2/Cargo.toml b/argon2/Cargo.toml index 32ae1f9c..ba406625 100644 --- a/argon2/Cargo.toml +++ b/argon2/Cargo.toml @@ -23,7 +23,7 @@ blake2 = { version = "0.10.6", default-features = false } password-hash = { version = "0.5", optional = true } zeroize = { version = "1", default-features = false, optional = true } -[target.'cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))'.dependencies] +[target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] cpufeatures = "0.2.9" [dev-dependencies] diff --git a/argon2/src/block.rs b/argon2/src/block.rs index 6bb663e2..bec3bb83 100644 --- a/argon2/src/block.rs +++ b/argon2/src/block.rs @@ -44,12 +44,11 @@ macro_rules! permute { }; } -#[cfg(any(target_arch = "x86_64"))] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] cpufeatures::new!(avx2_cpuid, "avx2"); /// Structure for the (1 KiB) memory block implemented as 128 64-bit words. #[derive(Copy, Clone, Debug)] -#[cfg_attr(test, derive(PartialEq))] #[repr(align(64))] pub struct Block([u64; Self::SIZE / 8]); @@ -71,16 +70,16 @@ impl Block { } pub(crate) fn compress(rhs: &Self, lhs: &Self) -> Self { - #[cfg(any(target_arch = "x86_64"))] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - let (_, avx2) = avx2_cpuid::init_get(); - if avx2 { + if avx2_cpuid::get() { return unsafe { Self::compress_avx2(rhs, lhs) }; } } Self::compress_soft(rhs, lhs) } + #[inline(always)] fn compress_soft(rhs: &Self, lhs: &Self) -> Self { let r = *rhs ^ lhs; @@ -117,42 +116,10 @@ impl Block { q } - #[cfg(any(target_arch = "x86_64"))] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx2")] unsafe fn compress_avx2(rhs: &Self, lhs: &Self) -> Self { - let r = *rhs ^ lhs; - - // Apply permutations rowwise - let mut q = r; - for chunk in q.0.chunks_exact_mut(16) { - #[rustfmt::skip] - permute!( - chunk[0], chunk[1], chunk[2], chunk[3], - chunk[4], chunk[5], chunk[6], chunk[7], - chunk[8], chunk[9], chunk[10], chunk[11], - chunk[12], chunk[13], chunk[14], chunk[15], - ); - } - - // Apply permutations columnwise - for i in 0..8 { - let b = i * 2; - - #[rustfmt::skip] - permute!( - q.0[b], q.0[b + 1], - q.0[b + 16], q.0[b + 17], - q.0[b + 32], q.0[b + 33], - q.0[b + 48], q.0[b + 49], - q.0[b + 64], q.0[b + 65], - q.0[b + 80], q.0[b + 81], - q.0[b + 96], q.0[b + 97], - q.0[b + 112], q.0[b + 113], - ); - } - - q ^= &r; - q + Self::compress_soft(rhs, lhs) } } @@ -205,24 +172,13 @@ mod test { #[cfg(target_arch = "x86_64")] #[test] fn compress_avx2() { - let lhs = Block([ - 0, 0, 0, 2048, 4, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]); - let rhs = Block([ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]); + let mut lhs = Block([0; 128]); + lhs.0[0..7].copy_from_slice(&[0, 0, 0, 2048, 4, 2, 1]); + let rhs = Block([0; 128]); let result = Block::compress_soft(&rhs, &lhs); - let result_av2 = unsafe { Block::compress_avx2(&rhs, &lhs) }; + let result_avx2 = unsafe { Block::compress_avx2(&rhs, &lhs) }; - assert_eq!(result, result_av2); + assert_eq!(result.0, result_avx2.0); } } From 48f3c0ac985e727a47f2e7f590461bfb1f57c24b Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Mon, 10 Jul 2023 14:16:42 -0400 Subject: [PATCH 12/12] refactor to have the argon2 struct hold InitToken --- argon2/src/block.rs | 17 ++--------------- argon2/src/lib.rs | 33 ++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/argon2/src/block.rs b/argon2/src/block.rs index bec3bb83..7b471c9d 100644 --- a/argon2/src/block.rs +++ b/argon2/src/block.rs @@ -44,9 +44,6 @@ macro_rules! permute { }; } -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -cpufeatures::new!(avx2_cpuid, "avx2"); - /// Structure for the (1 KiB) memory block implemented as 128 64-bit words. #[derive(Copy, Clone, Debug)] #[repr(align(64))] @@ -69,18 +66,8 @@ impl Block { unsafe { &mut *(self.0.as_mut_ptr() as *mut [u8; Self::SIZE]) } } - pub(crate) fn compress(rhs: &Self, lhs: &Self) -> Self { - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - if avx2_cpuid::get() { - return unsafe { Self::compress_avx2(rhs, lhs) }; - } - } - Self::compress_soft(rhs, lhs) - } - #[inline(always)] - fn compress_soft(rhs: &Self, lhs: &Self) -> Self { + pub(crate) fn compress_soft(rhs: &Self, lhs: &Self) -> Self { let r = *rhs ^ lhs; // Apply permutations rowwise @@ -118,7 +105,7 @@ impl Block { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx2")] - unsafe fn compress_avx2(rhs: &Self, lhs: &Self) -> Self { + pub(crate) unsafe fn compress_avx2(rhs: &Self, lhs: &Self) -> Self { Self::compress_soft(rhs, lhs) } } diff --git a/argon2/src/lib.rs b/argon2/src/lib.rs index 41b9f8c7..eead9b0d 100644 --- a/argon2/src/lib.rs +++ b/argon2/src/lib.rs @@ -144,6 +144,9 @@ pub(crate) const SYNC_POINTS: usize = 4; /// To generate reference block positions const ADDRESSES_IN_BLOCK: usize = 128; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +cpufeatures::new!(avx2_cpuid, "avx2"); + /// Argon2 context. /// /// This is the primary type of this crate's API, and contains the following: @@ -165,6 +168,9 @@ pub struct Argon2<'key> { /// Key array secret: Option<&'key [u8]>, + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + cpu_feat_avx2: avx2_cpuid::InitToken, } impl Default for Argon2<'_> { @@ -191,6 +197,8 @@ impl<'key> Argon2<'key> { version, params, secret: None, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + cpu_feat_avx2: avx2_cpuid::init(), } } @@ -210,6 +218,8 @@ impl<'key> Argon2<'key> { version, params, secret: Some(secret), + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + cpu_feat_avx2: avx2_cpuid::init(), }) } @@ -335,7 +345,7 @@ impl<'key> Argon2<'key> { let first_block = if pass == 0 && slice == 0 { if data_independent_addressing { // Generate first set of addresses - Self::update_address_block( + self.update_address_block( &mut address_block, &mut input_block, &zero_block, @@ -364,7 +374,7 @@ impl<'key> Argon2<'key> { let addres_index = block % ADDRESSES_IN_BLOCK; if addres_index == 0 { - Self::update_address_block( + self.update_address_block( &mut address_block, &mut input_block, &zero_block, @@ -424,7 +434,7 @@ impl<'key> Argon2<'key> { // Calculate new block let result = - Block::compress(&memory_blocks[prev_index], &memory_blocks[ref_index]); + self.compress(&memory_blocks[prev_index], &memory_blocks[ref_index]); if self.version == Version::V0x10 || pass == 0 { memory_blocks[cur_index] = result; @@ -442,6 +452,16 @@ impl<'key> Argon2<'key> { Ok(()) } + fn compress(&self, rhs: &Block, lhs: &Block) -> Block { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if self.cpu_feat_avx2.get() { + return unsafe { Block::compress_avx2(rhs, lhs) }; + } + } + Block::compress_soft(rhs, lhs) + } + /// Get default configured [`Params`]. pub fn params(&self) -> &Params { &self.params @@ -467,13 +487,14 @@ impl<'key> Argon2<'key> { } fn update_address_block( + &self, address_block: &mut Block, input_block: &mut Block, zero_block: &Block, ) { input_block.as_mut()[6] += 1; - *address_block = Block::compress(zero_block, input_block); - *address_block = Block::compress(zero_block, address_block); + *address_block = self.compress(zero_block, input_block); + *address_block = self.compress(zero_block, address_block); } /// Hashes all the inputs into `blockhash[PREHASH_DIGEST_LEN]`. @@ -579,6 +600,8 @@ impl PasswordHasher for Argon2<'_> { algorithm, version, params, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + cpu_feat_avx2: self.cpu_feat_avx2, } .hash_password(password, salt) }