From e1cb82109cd69826e6e89ef4d9c1a568bd1535fd Mon Sep 17 00:00:00 2001 From: Tony Arcieri Date: Mon, 20 Jan 2020 12:13:37 -0800 Subject: [PATCH] [WIP] poly1305: AVX2 based on Bhattacharyya/Sarkar paper Mechanical translation originally based on Corrode. The original source code seems to be incomplete, but still amenable to a mechanical translation: https://github.com/Sreyosi/Improved-SIMD-Implementation-of-Poly1305 Notably its internal buffering in the original source code is commented out, but not part of the `UniversalHash`-trait API. We could potentially throw away all of the optimizations for larger-than-block-size buffers and the incomplete code for buffering non-block-aligned messages to get a baseline AVX2-optimized implementation of Poly1305. That said, this code is failing the basic test vectors, probably owing to its incompleteness, and it probably makes more sense to base an AVX2 implementation on well-tested code which may perform slower but is known to work and be generally robust. --- poly1305/src/avx2.rs | 1305 ++++++++++++++++++++++++++++++++++++++++++ poly1305/src/lib.rs | 348 +---------- poly1305/src/soft.rs | 326 +++++++++++ 3 files changed, 1656 insertions(+), 323 deletions(-) create mode 100644 poly1305/src/avx2.rs create mode 100644 poly1305/src/soft.rs diff --git a/poly1305/src/avx2.rs b/poly1305/src/avx2.rs new file mode 100644 index 0000000..220b129 --- /dev/null +++ b/poly1305/src/avx2.rs @@ -0,0 +1,1305 @@ +//! The Poly1305 universal hash function (AVX2 optimized). +//! +//! Adapted from: "Improved SIMD Implementation of Poly1305" which is based +//! on the SIMD Poly1305 developed by Shay Gueron and Martin Goll. +//! Copyright (c) 2019, Sreyosi Bhattacharyya, Palash Sarkar +//! + +#![allow(non_camel_case_types, unused_parens, unused_variables, unused_assignments, non_snake_case)] + +use super::Tag; +use core::{mem, ptr}; +use universal_hash::{ + generic_array::{ + typenum::{U16, U32}, + GenericArray, + }, + UniversalHash, +}; + +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +type vec128 = __m128i; +type vec256 = __m256i; + +#[derive(Copy, Clone)] +struct vec256x2 { + v0: vec256, + v1: vec256, +} + +#[derive(Clone)] +struct vec256x3 { + v0: vec256, + v1: vec256, + v2: vec256, +} + +#[derive(Clone)] +struct vec256x5 { + v0: vec256, + v1: vec256, + v2: vec256, + v3: vec256, + v4: vec256, +} + +//enum Buffer { +// B([u8; 64]), +// V(vec256x2), +// VA([vec128; 4]), +//} + +/// Size of the internal buffer +const BUFFER_SIZE: usize = 64; + +/// Internal buffer +type Buffer = [u8; 64]; + +/// The Poly1305 universal hash function (AVX2 optimized) +#[derive(Clone)] +pub struct Poly1305 { + k: vec256, + r1: vec256, + r2: vec256, + r4: vec256, + r15: vec256, + r25: vec256, + r45: vec256, + m: vec256x2, + p: vec256x3, + buffer: Buffer, + p_init: u32, + leftover: u32, +} + +impl UniversalHash for Poly1305 { + type KeySize = U32; + type BlockSize = U16; + + /// Initialize Poly1305 with the given key + fn new(key: &GenericArray) -> Poly1305 { + let mut state: Poly1305 = unsafe { mem::zeroed() }; + unsafe { + init(&mut state, key); + } + state + } + + /// Input data into the Poly1305 universal hash function + fn update_block(&mut self, block: &GenericArray) { + self.update(block.as_slice()); + } + + /// Reset internal state + fn reset(&mut self) { + // TODO(tarcieri): call `init` again but without key + unimplemented!(); + } + + /// Get the hashed output + fn result(mut self) -> Tag { + let mut tag = GenericArray::default(); + unsafe { + finish(&mut self, tag.as_mut_slice()); + } + Tag::new(tag) + } +} + +impl Poly1305 { + /// Input data into the Poly1305 universal hash function + pub fn update(&mut self, data: &[u8]) { + unsafe { process(self, data); } + } + + /// Process input messages in a chained manner + pub fn chain(mut self, data: &[u8]) -> Self { + self.update(data); + self + } +} + +#[inline] +#[target_feature(enable = "avx2")] +pub unsafe fn init(state: &mut Poly1305, key: &GenericArray) { + let k = _mm256_loadu_si256(key.as_ptr() as *const __m256i); + state.k = _mm256_and_si256( + (_mm256_permutevar8x32_epi32((k), (_mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)))), + (_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)), + ); + + let mut r1 = _mm256_and_si256( + (_mm256_or_si256( + (_mm256_sllv_epi32( + (_mm256_and_si256( + (k), + (_mm256_set_epi32(0, 0, 0, 0, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc, 0x0fffffff)), + )), + (_mm256_set_epi32(32, 32, 32, 24, 18, 12, 6, 0)), + )), + (_mm256_permutevar8x32_epi32( + (_mm256_srlv_epi32( + (_mm256_and_si256( + (k), + (_mm256_set_epi32( + 0, 0, 0, 0, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc, 0x0fffffff, + )), + )), + (_mm256_set_epi32(32, 32, 32, 2, 8, 14, 20, 26)), + )), + (_mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7)), + )), + )), + (_mm256_set_epi32( + 0, 0, 0, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, + )), + ); + let mut r15 = _mm256_permutevar8x32_epi32( + (_mm256_add_epi32((r1), (_mm256_slli_epi32((r1), (2))))), + (_mm256_set_epi32(4, 3, 2, 1, 1, 1, 1, 1)), + ); + r1 = _mm256_blend_epi32((r1), (r15), (0xE0)); + r15 = _mm256_permute2x128_si256((r15), (r15), (0)); + + let mut r2 = red5x64(mul130(r1, r1, r15)); + let mut r25 = _mm256_permutevar8x32_epi32( + (_mm256_add_epi32((r2), (_mm256_slli_epi32((r2), (2))))), + (_mm256_set_epi32(4, 3, 2, 1, 1, 1, 1, 1)), + ); + r2 = _mm256_blend_epi32((r2), (r25), (0xE0)); + r25 = _mm256_permute2x128_si256((r25), (r25), (0)); + + state.r1 = r1; + state.r2 = r2; + state.r15 = r15; + state.r25 = r25; + state.p_init = 0; + state.leftover = 0; +} + +#[inline] +#[target_feature(enable = "avx2")] +pub unsafe fn process(state: &mut Poly1305, input: &[u8]) { + let mut in_len = input.len(); + let mut ip: *const u32 = input.as_ptr() as *const u32; + let mut p: vec256x3; + + let r1 = state.r1; + let r15 = state.r15; + let r2 = state.r2; + let r25 = state.r25; + let r3 = red5x64(mul130(r2, r1, r15)); + let r4 = red5x64(mul130(r2, r2, r25)); + + state.m.v0 = _mm256_blend_epi32( + r3, + _mm256_permutevar8x32_epi32(r2, _mm256_set_epi32(4, 3, 1, 0, 0, 0, 0, 0)), + 0xE0, + ); + + state.m.v1 = _mm256_blend_epi32( + r4, + _mm256_permutevar8x32_epi32(r2, _mm256_set_epi32(4, 2, 0, 0, 0, 0, 0, 0)), + 0xE0, + ); + + let r45 = _mm256_permutevar8x32_epi32( + _mm256_add_epi32(r4, _mm256_slli_epi32(r4, 2)), + _mm256_set_epi32(4, 3, 2, 1, 1, 1, 1, 1), + ); + state.r4 = _mm256_blend_epi32(r4, r45, 0xE0); + state.r45 = _mm256_permute2x128_si256(r45, r45, 0); + p = align4x128(load4x128(ip)); + in_len -= 64; + ip = ip.add(16); + + state.p_init = 1; + + if in_len >= 768 { + let mut r8 = red5x64(mul130(state.r4, state.r4, state.r45)); + let mut r12 = red5x64(mul130(r8, state.r4, state.r45)); + let mut r85 = _mm256_permutevar8x32_epi32( + (_mm256_add_epi32((r8), (_mm256_slli_epi32((r8), (2))))), + (_mm256_set_epi32(4, 3, 2, 1, 1, 1, 1, 1)), + ); + let mut r125 = _mm256_permutevar8x32_epi32( + (_mm256_add_epi32((r12), (_mm256_slli_epi32((r12), (2))))), + (_mm256_set_epi32(4, 3, 2, 1, 1, 1, 1, 1)), + ); + r8 = _mm256_blend_epi32((r8), (r85), (0xE0)); + r12 = _mm256_blend_epi32((r12), (r125), (0xE0)); + r85 = _mm256_permute2x128_si256((r85), (r85), (0)); + r125 = _mm256_permute2x128_si256((r125), (r125), (0)); + + loop { + p = add4x130( + red4x130(muladd4x130P( + muladd4x130P(mul4x130P(p, r12, r125), align4x128(load4x128(ip)), r8, r85), + align4x128(load4x128(ip.add(16))), + state.r4, + state.r45, + )), + align4x128(load4x128(ip.add(32))), + ); + in_len -= 192; + ip = ip.add(48); + if in_len < 192 { + break; + } + } + + if in_len >= 128 { + p = add4x130( + red4x130(muladd4x130P( + mul4x130P(p, r8, r85), + align4x128(load4x128(ip)), + state.r4, + state.r45, + )), + align4x128(load4x128(ip.add(16))), + ); + in_len -= 128; + ip = ip.add(32); + } else if in_len >= 64 { + p = add4x130( + red4x130(mul4x130P(p, state.r4, state.r45)), + align4x128(load4x128(ip)), + ); + in_len -= 64; + ip = ip.add(16); + } + } else if in_len >= 64 { + loop { + p = add4x130( + red4x130(mul4x130R(p, state.r4, state.r45)), + align4x128(load4x128(ip)), + ); + in_len -= 64; + ip = ip.add(16); + if in_len < 64 { + break; + } + } + } +} + +#[inline] +#[target_feature(enable = "avx2")] +pub unsafe fn finish(state: &mut Poly1305, output: &mut [u8]) { + let mut p_init: u32 = state.p_init; + let mut p = _mm256_set_epi64x(0, 0, 0, 0); + let mut buf_len = state.leftover; + let mut idx = 0; + + if buf_len >= 32 { + let c = vec256x2 { + v0: _mm256_and_si256( + _mm256_or_si256( + _mm256_sllv_epi32( + _mm256_or_si256( + _mm256_castsi128_si256(state.buffer.va()[0]), + _mm256_set_epi64x(0, 1, 0, 0), + ), + _mm256_set_epi32(32, 32, 32, 24, 18, 12, 6, 0), + ), + _mm256_permutevar8x32_epi32( + _mm256_srlv_epi32( + _mm256_or_si256( + _mm256_castsi128_si256(state.buffer.va()[0]), + _mm256_set_epi64x(0, 1, 0, 0), + ), + _mm256_set_epi32(32, 32, 32, 2, 8, 14, 20, 26), + ), + _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7), + ), + ), + _mm256_set_epi32( + 0, 0, 0, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, + ), + ), + v1: _mm256_and_si256( + _mm256_or_si256( + _mm256_sllv_epi32( + _mm256_or_si256( + _mm256_castsi128_si256(state.buffer.va()[1]), + _mm256_set_epi64x(0, 1, 0, 0), + ), + _mm256_set_epi32(32, 32, 32, 24, 18, 12, 6, 0), + ), + _mm256_permutevar8x32_epi32( + _mm256_srlv_epi32( + _mm256_or_si256( + _mm256_castsi128_si256(state.buffer.va()[1]), + _mm256_set_epi64x(0, 1, 0, 0), + ), + _mm256_set_epi32(32, 32, 32, 2, 8, 14, 20, 26), + ), + _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7), + ), + ), + _mm256_set_epi32( + 0, 0, 0, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, + ), + ), + }; + + let p = red5x64(mul2x130(c, state.r1, state.r2, state.r15, state.r25)); + idx += 2; + buf_len -= 32; + p_init += 1; + } + + if buf_len >= 16 { + let mut c: vec256 = _mm256_and_si256( + _mm256_or_si256( + _mm256_sllv_epi32( + _mm256_or_si256( + _mm256_castsi128_si256(state.buffer.va()[idx]), + _mm256_set_epi64x(0, 1, 0, 0), + ), + _mm256_set_epi32(32, 32, 32, 24, 18, 12, 6, 0), + ), + _mm256_permutevar8x32_epi32( + _mm256_srlv_epi32( + _mm256_or_si256( + _mm256_castsi128_si256(state.buffer.va()[idx]), + _mm256_set_epi64x(0, 1, 0, 0), + ), + _mm256_set_epi32(32, 32, 32, 2, 8, 14, 20, 26), + ), + _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7), + ), + ), + _mm256_set_epi32( + 0, 0, 0, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, + ), + ); + + if p_init != 0 { + c = _mm256_add_epi32(p, c); + } + + let p = red5x64(mul130(c, state.r1, state.r15)); + idx += 1; + buf_len -= 16; + p_init += 1; + } + + if buf_len != 0 { + state.buffer.b()[state.leftover as usize] = 1; + if buf_len < 15 { + memzero15( + state + .buffer + .b() + .as_mut_ptr() + .add(state.leftover.checked_add(1).unwrap() as usize), + 15 - buf_len as usize, + ); + } + + let mut c = _mm256_and_si256( + _mm256_or_si256( + _mm256_sllv_epi32( + _mm256_castsi128_si256(state.buffer.va()[idx]), + _mm256_set_epi32(32, 32, 32, 24, 18, 12, 6, 0), + ), + _mm256_permutevar8x32_epi32( + _mm256_srlv_epi32( + _mm256_castsi128_si256(state.buffer.va()[idx]), + _mm256_set_epi32(32, 32, 32, 2, 8, 14, 20, 26), + ), + _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7), + ), + ), + _mm256_set_epi32( + 0, 0, 0, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, + ), + ); + + if p_init != 0 { + c = _mm256_add_epi32(p, c); + } + + p = red5x64(mul130(c, state.r1, state.r15)); + p_init += 1; + } + + if p_init != 0 { + _mm_storeu_si128(output.as_mut_ptr() as *mut __m128i, addkey(p, state.k)); + } else { + _mm_storeu_si128( + output.as_mut_ptr() as *mut __m128i, + _mm256_castsi256_si128(_mm256_permutevar8x32_epi32( + state.k, + _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0), + )), + ); + } +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn addkey(mut x: vec256, k: vec256) -> vec128 { + let t = _mm256_permutevar8x32_epi32( + (_mm256_srli_epi32((x), (26))), + (_mm256_set_epi32(7, 7, 7, 3, 2, 1, 0, 4)), + ); + x = _mm256_add_epi32( + (_mm256_add_epi32( + (_mm256_and_si256( + (x), + (_mm256_set_epi32( + 0, 0, 0, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, + )), + )), + (t), + )), + (_mm256_permutevar8x32_epi32( + (_mm256_slli_epi32((t), (2))), + (_mm256_set_epi32(7, 7, 7, 7, 7, 7, 7, 0)), + )), + ); + x = _mm256_or_si256( + (_mm256_srlv_epi32((x), (_mm256_set_epi32(32, 32, 32, 32, 18, 12, 6, 0)))), + (_mm256_permutevar8x32_epi32( + (_mm256_sllv_epi32((x), (_mm256_set_epi32(32, 32, 32, 8, 14, 20, 26, 32)))), + (_mm256_set_epi32(7, 7, 7, 7, 4, 3, 2, 1)), + )), + ); + x = _mm256_add_epi64( + (_mm256_permutevar8x32_epi32((x), (_mm256_set_epi32(7, 3, 7, 2, 7, 1, 7, 0)))), + (k), + ); + x = _mm256_add_epi64( + (_mm256_permutevar8x32_epi32((x), (_mm256_set_epi32(7, 7, 7, 7, 6, 4, 2, 0)))), + (_mm256_permutevar8x32_epi32( + (_mm256_srli_epi64((x), (32))), + (_mm256_set_epi32(7, 7, 7, 7, 4, 2, 0, 7)), + )), + ); + _mm256_castsi256_si128((x)) +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn mul2x130(x: vec256x2, r1: vec256, r2: vec256, r15: vec256, r25: vec256) -> vec256x2 { + let mut ret: vec256x2 = mem::zeroed(); + ret.v0 = _mm256_mul_epu32( + (_mm256_permutevar8x32_epi32((x.v0), (_mm256_set_epi64x(4, 3, 2, 1)))), + (_mm256_permutevar8x32_epi32((r2), (_mm256_set_epi64x(7, 7, 7, 7)))), + ); + ret.v1 = _mm256_mul_epu32( + (_mm256_permutevar8x32_epi32((x.v1), (_mm256_set_epi64x(4, 3, 2, 1)))), + (_mm256_permutevar8x32_epi32((r1), (_mm256_set_epi64x(7, 7, 7, 7)))), + ); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (_mm256_permute4x64_epi64((x.v0), (((0) << 6) | ((2) << 4) | ((2) << 2) | (1)))), + (_mm256_permutevar8x32_epi32((r2), (_mm256_set_epi64x(3, 6, 5, 6)))), + )), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (_mm256_permute4x64_epi64((x.v1), (((0) << 6) | ((2) << 4) | ((2) << 2) | (1)))), + (_mm256_permutevar8x32_epi32((r1), (_mm256_set_epi64x(3, 6, 5, 6)))), + )), + ); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (_mm256_permutevar8x32_epi32((x.v0), (_mm256_set_epi64x(1, 1, 3, 3)))), + (_mm256_permutevar8x32_epi32((r2), (_mm256_set_epi64x(2, 1, 6, 5)))), + )), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (_mm256_permutevar8x32_epi32((x.v1), (_mm256_set_epi64x(1, 1, 3, 3)))), + (_mm256_permutevar8x32_epi32((r1), (_mm256_set_epi64x(2, 1, 6, 5)))), + )), + ); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (_mm256_permutevar8x32_epi32((x.v0), (_mm256_set_epi64x(3, 2, 1, 0)))), + (_mm256_broadcastd_epi32((_mm256_castsi256_si128((r2))))), + )), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (_mm256_permutevar8x32_epi32((x.v1), (_mm256_set_epi64x(3, 2, 1, 0)))), + (_mm256_broadcastd_epi32((_mm256_castsi256_si128((r1))))), + )), + ); + + let mut t0 = _mm256_permute4x64_epi64((x.v0), (((1) << 6) | ((0) << 4) | ((0) << 2) | (2))); + let mut t1 = _mm256_permute4x64_epi64((x.v1), (((1) << 6) | ((0) << 4) | ((0) << 2) | (2))); + + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (t0), + (_mm256_blend_epi32( + (_mm256_permutevar8x32_epi32((r2), (_mm256_set_epi64x(1, 2, 1, 1)))), + (r25), + (0x03), + )), + )), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (t1), + (_mm256_blend_epi32( + (_mm256_permutevar8x32_epi32((r1), (_mm256_set_epi64x(1, 2, 1, 1)))), + (r15), + (0x03), + )), + )), + ); + ret.v0 = _mm256_add_epi64((ret.v0), (ret.v1)); + t0 = _mm256_mul_epu32((t0), (r2)); + t1 = _mm256_mul_epu32((t1), (r1)); + ret.v1 = _mm256_add_epi64((t0), (t1)); + t0 = _mm256_mul_epu32( + (_mm256_permutevar8x32_epi32((x.v0), (_mm256_set_epi64x(3, 2, 1, 0)))), + (_mm256_permutevar8x32_epi32((r2), (_mm256_set_epi64x(1, 2, 3, 4)))), + ); + t1 = _mm256_mul_epu32( + (_mm256_permutevar8x32_epi32((x.v1), (_mm256_set_epi64x(3, 2, 1, 0)))), + (_mm256_permutevar8x32_epi32((r1), (_mm256_set_epi64x(1, 2, 3, 4)))), + ); + t0 = _mm256_add_epi64((t0), (t1)); + t0 = _mm256_add_epi64( + (t0), + (_mm256_permute4x64_epi64((t0), (((1) << 6) | ((0) << 4) | ((3) << 2) | (2)))), + ); + t0 = _mm256_add_epi64( + (t0), + (_mm256_permute4x64_epi64((t0), (((2) << 6) | ((3) << 4) | ((0) << 2) | (1)))), + ); + ret.v1 = _mm256_add_epi64((ret.v1), (t0)); + ret +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn red5x64(mut x: vec256x2) -> vec256 { + x.v0 = _mm256_add_epi64( + (_mm256_and_si256( + (x.v0), + (_mm256_set_epi64x(-1, 0x3ffffff, 0x3ffffff, 0x3ffffff)), + )), + (_mm256_permute4x64_epi64( + (_mm256_srlv_epi64((x.v0), (_mm256_set_epi64x(64, 26, 26, 26)))), + (((2) << 6) | ((1) << 4) | ((0) << 2) | (3)), + )), + ); + x.v1 = _mm256_add_epi64( + (x.v1), + (_mm256_permute4x64_epi64( + (_mm256_srli_epi64((x.v0), (26))), + (((2) << 6) | ((1) << 4) | ((0) << 2) | (3)), + )), + ); + x.v0 = _mm256_and_si256((x.v0), (_mm256_set_epi64x(0x3ffffff, -1, -1, -1))); + + let t = _mm256_srlv_epi64((x.v1), (_mm256_set_epi64x(64, 64, 64, 26))); + x.v0 = _mm256_add_epi64( + (_mm256_add_epi64((x.v0), (t))), + (_mm256_slli_epi32((t), (2))), + ); + x.v1 = _mm256_and_si256((x.v1), (_mm256_set_epi64x(0, 0, 0, 0x3ffffff))); + x.v0 = _mm256_add_epi64( + (_mm256_and_si256( + (x.v0), + (_mm256_set_epi64x(-1, 0x3ffffff, 0x3ffffff, 0x3ffffff)), + )), + (_mm256_permute4x64_epi64( + (_mm256_srlv_epi64((x.v0), (_mm256_set_epi64x(64, 26, 26, 26)))), + (((2) << 6) | ((1) << 4) | ((0) << 2) | (3)), + )), + ); + x.v1 = _mm256_add_epi64( + (x.v1), + (_mm256_permute4x64_epi64( + (_mm256_srli_epi64((x.v0), (26))), + (((2) << 6) | ((1) << 4) | ((0) << 2) | (3)), + )), + ); + x.v0 = _mm256_and_si256((x.v0), (_mm256_set_epi64x(0x3ffffff, -1, -1, -1))); + + _mm256_blend_epi32( + (_mm256_permutevar8x32_epi32((x.v0), (_mm256_set_epi32(0, 6, 4, 0, 6, 4, 2, 0)))), + (_mm256_permutevar8x32_epi32((x.v1), (_mm256_set_epi32(0, 6, 4, 0, 6, 4, 2, 0)))), + (0x90), + ) +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn mul130(x: vec256, y: vec256, z: vec256) -> vec256x2 { + let mut ret: vec256x2 = mem::zeroed(); + + ret.v0 = _mm256_mul_epu32( + (_mm256_permutevar8x32_epi32((x), (_mm256_set_epi64x(4, 3, 2, 1)))), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(7, 7, 7, 7)))), + ); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (_mm256_permutevar8x32_epi32((x), (_mm256_set_epi64x(3, 2, 1, 0)))), + (_mm256_broadcastd_epi32((_mm256_castsi256_si128((y))))), + )), + ); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (_mm256_permutevar8x32_epi32((x), (_mm256_set_epi64x(1, 1, 3, 3)))), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(2, 1, 6, 5)))), + )), + ); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (_mm256_permute4x64_epi64((x), (((1) << 6) | ((0) << 4) | ((0) << 2) | (2)))), + (_mm256_blend_epi32( + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(1, 2, 1, 1)))), + (z), + (0x03), + )), + )), + ); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (_mm256_permute4x64_epi64((x), (((0) << 6) | ((2) << 4) | ((2) << 2) | (1)))), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(3, 6, 5, 6)))), + )), + ); + + ret.v1 = _mm256_mul_epu32( + (_mm256_permutevar8x32_epi32((x), (_mm256_set_epi64x(3, 2, 1, 0)))), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(1, 2, 3, 4)))), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_permute4x64_epi64((ret.v1), (((1) << 6) | ((0) << 4) | ((3) << 2) | (2)))), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_permute4x64_epi64((ret.v1), (((0) << 6) | ((0) << 4) | ((0) << 2) | (1)))), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (_mm256_permute4x64_epi64((x), (((0) << 6) | ((0) << 4) | ((0) << 2) | (2)))), + (y), + )), + ); + + ret +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn align4x128(mut x: vec256x2) -> vec256x3 { + let msk: vec256 = _mm256_setr_epi32( + 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, + ); + let pad: vec256 = _mm256_setr_epi32( + 1 << 24, + 1 << 24, + 1 << 24, + 1 << 24, + 1 << 24, + 1 << 24, + 1 << 24, + 1 << 24, + ); + + let mut ret: vec256x3 = mem::zeroed(); + ret.v0 = _mm256_permute4x64_epi64( + (_mm256_unpackhi_epi64((x.v0), (x.v1))), + (((3) << 6) | ((1) << 4) | ((2) << 2) | (0)), + ); + x.v0 = _mm256_permute4x64_epi64( + (_mm256_unpacklo_epi64((x.v0), (x.v1))), + (((3) << 6) | ((1) << 4) | ((2) << 2) | (0)), + ); + ret.v2 = _mm256_or_si256((_mm256_srli_epi64((ret.v0), (40))), (pad)); + x.v1 = _mm256_or_si256( + (_mm256_srli_epi64((x.v0), (46))), + (_mm256_slli_epi64((ret.v0), (18))), + ); + ret.v1 = _mm256_and_si256( + (_mm256_blend_epi32((_mm256_srli_epi64((x.v0), (26))), (x.v1), (0xAA))), + (msk), + ); + ret.v0 = _mm256_and_si256( + (_mm256_blend_epi32((x.v0), (_mm256_slli_epi64((x.v1), (26))), (0xAA))), + (msk), + ); + ret +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn load4x128(ip: *const u32) -> vec256x2 { + vec256x2 { + v0: _mm256_loadu_si256(ip as *const __m256i), + v1: _mm256_loadu_si256(ip.add(8) as *const __m256i), + } +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn add4x130(x: vec256x3, y: vec256x3) -> vec256x3 { + vec256x3 { + v0: _mm256_add_epi32((x.v0), (y.v0)), + v1: _mm256_add_epi32((x.v1), (y.v1)), + v2: _mm256_add_epi32((x.v2), (y.v2)), + } +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn hadd4x130(x: vec256x5) -> vec256x2 { + let mut ret: vec256x2 = mem::zeroed(); + ret.v0 = _mm256_add_epi64( + (_mm256_unpackhi_epi64((x.v0), (x.v1))), + (_mm256_unpacklo_epi64((x.v0), (x.v1))), + ); + ret.v1 = _mm256_add_epi64( + (_mm256_unpackhi_epi64((x.v2), (x.v3))), + (_mm256_unpacklo_epi64((x.v2), (x.v3))), + ); + ret.v0 = _mm256_add_epi64( + (_mm256_inserti128_si256((ret.v0), (_mm256_castsi256_si128((ret.v1))), (1))), + (_mm256_inserti128_si256((ret.v1), (_mm256_extractf128_si256((ret.v0), (1))), (0))), + ); + ret.v1 = _mm256_add_epi64( + (x.v4), + (_mm256_permute4x64_epi64((x.v4), (((1) << 6) | ((0) << 4) | ((3) << 2) | (2)))), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_permute4x64_epi64((ret.v1), (((0) << 6) | ((0) << 4) | ((0) << 2) | (1)))), + ); + ret +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn red4x130(mut x: vec256x5) -> vec256x3 { + let msk: vec256 = _mm256_setr_epi32(0x3ffffff, 0, 0x3ffffff, 0, 0x3ffffff, 0, 0x3ffffff, 0); + + x.v1 = _mm256_add_epi64((x.v1), (_mm256_srli_epi64((x.v0), (26)))); + x.v0 = _mm256_and_si256((x.v0), (msk)); + x.v4 = _mm256_add_epi64((x.v4), (_mm256_srli_epi64((x.v3), (26)))); + x.v3 = _mm256_and_si256((x.v3), (msk)); + x.v2 = _mm256_add_epi64((x.v2), (_mm256_srli_epi64((x.v1), (26)))); + x.v1 = _mm256_and_si256((x.v1), (msk)); + x.v0 = _mm256_add_epi64( + (x.v0), + (_mm256_mul_epu32( + (_mm256_srli_epi64((x.v4), (26))), + _mm256_setr_epi32(5, 0, 5, 0, 5, 0, 5, 0), + )), + ); + x.v4 = _mm256_and_si256((x.v4), (msk)); + x.v3 = _mm256_add_epi64((x.v3), (_mm256_srli_epi64((x.v2), (26)))); + x.v2 = _mm256_and_si256((x.v2), (msk)); + x.v1 = _mm256_add_epi64((x.v1), (_mm256_srli_epi64((x.v0), (26)))); + x.v0 = _mm256_and_si256((x.v0), (msk)); + x.v4 = _mm256_add_epi64((x.v4), (_mm256_srli_epi64((x.v3), (26)))); + x.v3 = _mm256_and_si256((x.v3), (msk)); + x.v0 = _mm256_blend_epi32((x.v0), (_mm256_slli_epi64((x.v2), (32))), (0xAA)); + x.v1 = _mm256_blend_epi32((x.v1), (_mm256_slli_epi64((x.v3), (32))), (0xAA)); + + vec256x3 { + v0: x.v0, + v1: x.v1, + v2: x.v4, + } +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn mul4x130M(mut x: vec256x3, mut m: vec256x2, r1: vec256) -> vec256x5 { + let mut ret: vec256x5 = mem::zeroed(); + + ret.v0 = _mm256_unpacklo_epi32((m.v0), (m.v1)); + ret.v1 = _mm256_unpackhi_epi32((m.v0), (m.v1)); + + let mut t: vec256x3 = mem::zeroed(); + + let mut ord = _mm256_set_epi32(1, 0, 6, 7, 2, 0, 3, 1); + t.v0 = _mm256_blend_epi32( + (_mm256_permutevar8x32_epi32((r1), (ord))), + (_mm256_permutevar8x32_epi32((ret.v0), (ord))), + (0x3F), + ); + + ord = _mm256_set_epi32(3, 2, 4, 5, 2, 0, 3, 1); + t.v1 = _mm256_blend_epi32( + (_mm256_permutevar8x32_epi32((r1), (ord))), + (_mm256_permutevar8x32_epi32((ret.v1), (ord))), + (0x3F), + ); + + ord = _mm256_set_epi32(1, 4, 6, 6, 2, 4, 3, 5); + t.v2 = _mm256_blend_epi32( + (_mm256_blend_epi32( + (_mm256_permutevar8x32_epi32((r1), (ord))), + (_mm256_permutevar8x32_epi32((ret.v1), (ord))), + (0x10), + )), + (_mm256_permutevar8x32_epi32((ret.v0), (ord))), + (0x2F), + ); + ret.v0 = _mm256_mul_epu32((x.v0), (t.v0)); + ret.v1 = _mm256_mul_epu32((x.v1), (t.v0)); + ret.v2 = _mm256_mul_epu32((x.v0), (t.v1)); + ret.v3 = _mm256_mul_epu32((x.v1), (t.v1)); + ret.v4 = _mm256_mul_epu32((x.v0), (t.v2)); + ord = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1); + m.v0 = _mm256_permutevar8x32_epi32((t.v0), (ord)); + m.v1 = _mm256_permutevar8x32_epi32((t.v1), (ord)); + ret.v4 = _mm256_add_epi64((ret.v4), (_mm256_mul_epu32((x.v2), (t.v0)))); + ret.v4 = _mm256_add_epi64((ret.v4), (_mm256_mul_epu32((x.v1), (m.v1)))); + ret.v1 = _mm256_add_epi64((ret.v1), (_mm256_mul_epu32((x.v0), (m.v0)))); + ret.v3 = _mm256_add_epi64((ret.v3), (_mm256_mul_epu32((x.v0), (m.v1)))); + ret.v2 = _mm256_add_epi64((ret.v2), (_mm256_mul_epu32((x.v1), (m.v0)))); + x.v0 = _mm256_permutevar8x32_epi32((x.v0), (ord)); + ret.v2 = _mm256_add_epi64((ret.v2), (_mm256_mul_epu32((x.v0), (t.v0)))); + ret.v3 = _mm256_add_epi64((ret.v3), (_mm256_mul_epu32((x.v0), (m.v0)))); + ret.v4 = _mm256_add_epi64((ret.v4), (_mm256_mul_epu32((x.v0), (t.v1)))); + t.v1 = _mm256_add_epi32((t.v2), (_mm256_slli_epi32((t.v2), (2)))); + m.v1 = _mm256_add_epi32((m.v1), (_mm256_slli_epi32((m.v1), (2)))); + ret.v0 = _mm256_add_epi64((ret.v0), (_mm256_mul_epu32((x.v0), (m.v1)))); + ret.v0 = _mm256_add_epi64((ret.v0), (_mm256_mul_epu32((x.v1), (t.v1)))); + ret.v1 = _mm256_add_epi64((ret.v1), (_mm256_mul_epu32((x.v0), (t.v1)))); + ret.v2 = _mm256_add_epi64((ret.v2), (_mm256_mul_epu32((x.v2), (m.v1)))); + ret.v3 = _mm256_add_epi64((ret.v3), (_mm256_mul_epu32((x.v2), (t.v1)))); + x.v1 = _mm256_permutevar8x32_epi32((x.v1), (ord)); + ret.v1 = _mm256_add_epi64((ret.v1), (_mm256_mul_epu32((x.v1), (m.v1)))); + ret.v2 = _mm256_add_epi64((ret.v2), (_mm256_mul_epu32((x.v1), (t.v1)))); + ret.v3 = _mm256_add_epi64((ret.v3), (_mm256_mul_epu32((x.v1), (t.v0)))); + ret.v4 = _mm256_add_epi64((ret.v4), (_mm256_mul_epu32((x.v1), (m.v0)))); + m.v1 = _mm256_permutevar8x32_epi32((m.v1), (ord)); + t.v1 = _mm256_permutevar8x32_epi32((t.v1), (ord)); + ret.v0 = _mm256_add_epi64((ret.v0), (_mm256_mul_epu32((x.v1), (m.v1)))); + ret.v0 = _mm256_add_epi64((ret.v0), (_mm256_mul_epu32((x.v2), (t.v1)))); + ret.v1 = _mm256_add_epi64((ret.v1), (_mm256_mul_epu32((x.v2), (m.v1)))); + + ret +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn mul4x130R(mut x: vec256x3, y: vec256, z: vec256) -> vec256x5 { + let mut ret: vec256x5 = mem::zeroed(); + let ord = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1); + let mut t0 = _mm256_permute4x64_epi64((y), (((0) << 6) | ((0) << 4) | ((0) << 2) | (0))); + let mut t1 = _mm256_permute4x64_epi64((y), (((1) << 6) | ((1) << 4) | ((1) << 2) | (1))); + ret.v0 = _mm256_mul_epu32((x.v0), (t0)); + ret.v1 = _mm256_mul_epu32((x.v1), (t0)); + ret.v4 = _mm256_mul_epu32((x.v2), (t0)); + ret.v2 = _mm256_mul_epu32((x.v0), (t1)); + ret.v3 = _mm256_mul_epu32((x.v1), (t1)); + t0 = _mm256_permutevar8x32_epi32((t0), (ord)); + t1 = _mm256_permutevar8x32_epi32((t1), (ord)); + ret.v1 = _mm256_add_epi64((ret.v1), (_mm256_mul_epu32((x.v0), (t0)))); + ret.v2 = _mm256_add_epi64((ret.v2), (_mm256_mul_epu32((x.v1), (t0)))); + ret.v3 = _mm256_add_epi64((ret.v3), (_mm256_mul_epu32((x.v0), (t1)))); + ret.v4 = _mm256_add_epi64((ret.v4), (_mm256_mul_epu32((x.v1), (t1)))); + let mut t2 = _mm256_permute4x64_epi64((y), (((2) << 6) | ((2) << 4) | ((2) << 2) | (2))); + ret.v4 = _mm256_add_epi64((ret.v4), (_mm256_mul_epu32((x.v0), (t2)))); + x.v0 = _mm256_permutevar8x32_epi32((x.v0), (ord)); + x.v1 = _mm256_permutevar8x32_epi32((x.v1), (ord)); + t2 = _mm256_permutevar8x32_epi32((t2), (ord)); + ret.v0 = _mm256_add_epi64((ret.v0), (_mm256_mul_epu32((x.v1), (t2)))); + ret.v1 = _mm256_add_epi64((ret.v1), (_mm256_mul_epu32((x.v2), (t2)))); + ret.v3 = _mm256_add_epi64((ret.v3), (_mm256_mul_epu32((x.v0), (t0)))); + ret.v4 = _mm256_add_epi64((ret.v4), (_mm256_mul_epu32((x.v1), (t0)))); + t0 = _mm256_permutevar8x32_epi32((t0), (ord)); + t1 = _mm256_permutevar8x32_epi32((t1), (ord)); + ret.v2 = _mm256_add_epi64((ret.v2), (_mm256_mul_epu32((x.v0), (t0)))); + ret.v3 = _mm256_add_epi64((ret.v3), (_mm256_mul_epu32((x.v1), (t0)))); + ret.v4 = _mm256_add_epi64((ret.v4), (_mm256_mul_epu32((x.v0), (t1)))); + t0 = _mm256_permute4x64_epi64((y), (((3) << 6) | ((3) << 4) | ((3) << 2) | (3))); + ret.v0 = _mm256_add_epi64((ret.v0), (_mm256_mul_epu32((x.v0), (t0)))); + ret.v1 = _mm256_add_epi64((ret.v1), (_mm256_mul_epu32((x.v1), (t0)))); + ret.v2 = _mm256_add_epi64((ret.v2), (_mm256_mul_epu32((x.v2), (t0)))); + t0 = _mm256_permutevar8x32_epi32((t0), (ord)); + ret.v3 = _mm256_add_epi64((ret.v3), (_mm256_mul_epu32((x.v2), (t0)))); + ret.v1 = _mm256_add_epi64((ret.v1), (_mm256_mul_epu32((x.v0), (t0)))); + ret.v2 = _mm256_add_epi64((ret.v2), (_mm256_mul_epu32((x.v1), (t0)))); + x.v1 = _mm256_permutevar8x32_epi32((x.v1), (ord)); + ret.v0 = _mm256_add_epi64((ret.v0), (_mm256_mul_epu32((x.v1), (t0)))); + ret.v0 = _mm256_add_epi64((ret.v0), (_mm256_mul_epu32((x.v2), (z)))); + ret +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn mul4x130P(mut x: vec256x3, y: vec256, z: vec256) -> vec256x5 { + let mut ret: vec256x5 = mem::zeroed(); + let ord = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1); + ret.v0 = _mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(0, 0, 0, 0)))), + ); + ret.v1 = _mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(1, 1, 1, 1)))), + ); + ret.v2 = _mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(2, 2, 2, 2)))), + ); + ret.v3 = _mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(3, 3, 3, 3)))), + ); + ret.v4 = _mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(4, 4, 4, 4)))), + ); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(7, 7, 7, 7)))), + )), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(0, 0, 0, 0)))), + )), + ); + ret.v2 = _mm256_add_epi64( + (ret.v2), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(1, 1, 1, 1)))), + )), + ); + ret.v3 = _mm256_add_epi64( + (ret.v3), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(2, 2, 2, 2)))), + )), + ); + ret.v4 = _mm256_add_epi64( + (ret.v4), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(3, 3, 3, 3)))), + )), + ); + x.v0 = _mm256_permutevar8x32_epi32((x.v0), (ord)); + x.v1 = _mm256_permutevar8x32_epi32((x.v1), (ord)); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(6, 6, 6, 6)))), + )), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(7, 7, 7, 7)))), + )), + ); + ret.v2 = _mm256_add_epi64( + (ret.v2), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(0, 0, 0, 0)))), + )), + ); + ret.v3 = _mm256_add_epi64( + (ret.v3), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(1, 1, 1, 1)))), + )), + ); + ret.v4 = _mm256_add_epi64( + (ret.v4), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(2, 2, 2, 2)))), + )), + ); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(5, 5, 5, 5)))), + )), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(6, 6, 6, 6)))), + )), + ); + ret.v2 = _mm256_add_epi64( + (ret.v2), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(7, 7, 7, 7)))), + )), + ); + ret.v3 = _mm256_add_epi64( + (ret.v3), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(0, 0, 0, 0)))), + )), + ); + ret.v4 = _mm256_add_epi64( + (ret.v4), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(1, 1, 1, 1)))), + )), + ); + ret.v0 = _mm256_add_epi64((ret.v0), (_mm256_mul_epu32((x.v2), (z)))); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (x.v2), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(5, 5, 5, 5)))), + )), + ); + ret.v2 = _mm256_add_epi64( + (ret.v2), + (_mm256_mul_epu32( + (x.v2), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(6, 6, 6, 6)))), + )), + ); + ret.v3 = _mm256_add_epi64( + (ret.v3), + (_mm256_mul_epu32( + (x.v2), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(7, 7, 7, 7)))), + )), + ); + ret.v4 = _mm256_add_epi64( + (ret.v4), + (_mm256_mul_epu32( + (x.v2), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(0, 0, 0, 0)))), + )), + ); + return ret; +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn muladd4x130P(mut ret: vec256x5, mut x: vec256x3, y: vec256, z: vec256) -> vec256x5 { + let ord = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(0, 0, 0, 0)))), + )), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(1, 1, 1, 1)))), + )), + ); + ret.v2 = _mm256_add_epi64( + (ret.v2), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(2, 2, 2, 2)))), + )), + ); + ret.v3 = _mm256_add_epi64( + (ret.v3), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(3, 3, 3, 3)))), + )), + ); + ret.v4 = _mm256_add_epi64( + (ret.v4), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(4, 4, 4, 4)))), + )), + ); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(7, 7, 7, 7)))), + )), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(0, 0, 0, 0)))), + )), + ); + ret.v2 = _mm256_add_epi64( + (ret.v2), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(1, 1, 1, 1)))), + )), + ); + ret.v3 = _mm256_add_epi64( + (ret.v3), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(2, 2, 2, 2)))), + )), + ); + ret.v4 = _mm256_add_epi64( + (ret.v4), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(3, 3, 3, 3)))), + )), + ); + x.v0 = _mm256_permutevar8x32_epi32((x.v0), (ord)); + x.v1 = _mm256_permutevar8x32_epi32((x.v1), (ord)); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(6, 6, 6, 6)))), + )), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(7, 7, 7, 7)))), + )), + ); + ret.v2 = _mm256_add_epi64( + (ret.v2), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(0, 0, 0, 0)))), + )), + ); + ret.v3 = _mm256_add_epi64( + (ret.v3), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(1, 1, 1, 1)))), + )), + ); + ret.v4 = _mm256_add_epi64( + (ret.v4), + (_mm256_mul_epu32( + (x.v0), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(2, 2, 2, 2)))), + )), + ); + ret.v0 = _mm256_add_epi64( + (ret.v0), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(5, 5, 5, 5)))), + )), + ); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(6, 6, 6, 6)))), + )), + ); + ret.v2 = _mm256_add_epi64( + (ret.v2), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(7, 7, 7, 7)))), + )), + ); + ret.v3 = _mm256_add_epi64( + (ret.v3), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(0, 0, 0, 0)))), + )), + ); + ret.v4 = _mm256_add_epi64( + (ret.v4), + (_mm256_mul_epu32( + (x.v1), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(1, 1, 1, 1)))), + )), + ); + ret.v0 = _mm256_add_epi64((ret.v0), (_mm256_mul_epu32((x.v2), (z)))); + ret.v1 = _mm256_add_epi64( + (ret.v1), + (_mm256_mul_epu32( + (x.v2), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(5, 5, 5, 5)))), + )), + ); + ret.v2 = _mm256_add_epi64( + (ret.v2), + (_mm256_mul_epu32( + (x.v2), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(6, 6, 6, 6)))), + )), + ); + ret.v3 = _mm256_add_epi64( + (ret.v3), + (_mm256_mul_epu32( + (x.v2), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(7, 7, 7, 7)))), + )), + ); + ret.v4 = _mm256_add_epi64( + (ret.v4), + (_mm256_mul_epu32( + (x.v2), + (_mm256_permutevar8x32_epi32((y), (_mm256_set_epi64x(0, 0, 0, 0)))), + )), + ); + ret +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn memcpy63(dst: *mut u8, src: *const u8, count: usize) { + ptr::copy(src, dst, count); +} + +#[inline] +#[target_feature(enable = "avx2")] +unsafe fn memzero15(dst: *mut u8, count: usize) { + ptr::write_bytes(dst, 0, count); +} diff --git a/poly1305/src/lib.rs b/poly1305/src/lib.rs index 55fe3bc..77a2c4d 100644 --- a/poly1305/src/lib.rs +++ b/poly1305/src/lib.rs @@ -1,31 +1,36 @@ //! The Poly1305 universal hash function and message authentication code -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. -// -// This code originates from the rust-crypto project: -// -// -// ...and was originally a port of Andrew Moons poly1305-donna -// https://github.com/floodyberry/poly1305-donna - #![no_std] #![doc(html_logo_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo_small.png")] #![warn(missing_docs, rust_2018_idioms)] +#[cfg(not(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx2" +)))] +mod soft; + +#[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx2" +))] +mod avx2; + +#[cfg(not(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx2" +)))] +pub use self::soft::Poly1305; + +#[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx2" +))] +pub use self::avx2::Poly1305; + pub use universal_hash; -use core::{cmp::min, convert::TryInto}; -use universal_hash::generic_array::{ - typenum::{U16, U32}, - GenericArray, -}; -use universal_hash::{Output, UniversalHash}; -#[cfg(feature = "zeroize")] -use zeroize::Zeroize; +use universal_hash::{generic_array::typenum::U16, Output}; /// Size of a Poly1305 key pub const KEY_SIZE: usize = 32; @@ -41,306 +46,3 @@ pub type Block = [u8; BLOCK_SIZE]; /// Poly1305 tags (16-bytes) pub type Tag = Output; - -/// The Poly1305 universal hash function. -/// -/// Note that Poly1305 is not a traditional MAC and is single-use only -/// (a.k.a. "one-time authenticator"). -/// -/// For this reason it doesn't impl the `crypto_mac::Mac` trait. -#[derive(Clone)] -pub struct Poly1305 { - r: [u32; 5], - h: [u32; 5], - pad: [u32; 4], - leftover: usize, - buffer: Block, -} - -impl UniversalHash for Poly1305 { - type KeySize = U32; - type BlockSize = U16; - - /// Initialize Poly1305 with the given key - fn new(key: &GenericArray) -> Poly1305 { - let mut poly = Poly1305 { - r: [0u32; 5], - h: [0u32; 5], - pad: [0u32; 4], - leftover: 0, - buffer: Block::default(), - }; - - // r &= 0xffffffc0ffffffc0ffffffc0fffffff - poly.r[0] = (u32::from_le_bytes(key[0..4].try_into().unwrap())) & 0x3ff_ffff; - poly.r[1] = (u32::from_le_bytes(key[3..7].try_into().unwrap()) >> 2) & 0x3ff_ff03; - poly.r[2] = (u32::from_le_bytes(key[6..10].try_into().unwrap()) >> 4) & 0x3ff_c0ff; - poly.r[3] = (u32::from_le_bytes(key[9..13].try_into().unwrap()) >> 6) & 0x3f0_3fff; - poly.r[4] = (u32::from_le_bytes(key[12..16].try_into().unwrap()) >> 8) & 0x00f_ffff; - - poly.pad[0] = u32::from_le_bytes(key[16..20].try_into().unwrap()); - poly.pad[1] = u32::from_le_bytes(key[20..24].try_into().unwrap()); - poly.pad[2] = u32::from_le_bytes(key[24..28].try_into().unwrap()); - poly.pad[3] = u32::from_le_bytes(key[28..32].try_into().unwrap()); - - poly - } - - /// Input data into the Poly1305 universal hash function - fn update_block(&mut self, block: &GenericArray) { - // TODO(tarcieri): pass block directly to `Poly1305::compute_block` - self.update(block.as_slice()); - } - - /// Reset internal state - fn reset(&mut self) { - self.h = Default::default(); - self.buffer = Default::default(); - self.leftover = 0; - } - - /// Get the hashed output - fn result(mut self) -> Tag { - if self.leftover > 0 { - self.buffer[self.leftover] = 1; - - for i in (self.leftover + 1)..BLOCK_SIZE { - self.buffer[i] = 0; - } - - self.compute_block(true); - } - - // fully carry h - let mut h0 = self.h[0]; - let mut h1 = self.h[1]; - let mut h2 = self.h[2]; - let mut h3 = self.h[3]; - let mut h4 = self.h[4]; - - let mut c: u32; - c = h1 >> 26; - h1 &= 0x3ff_ffff; - h2 += c; - - c = h2 >> 26; - h2 &= 0x3ff_ffff; - h3 += c; - - c = h3 >> 26; - h3 &= 0x3ff_ffff; - h4 += c; - - c = h4 >> 26; - h4 &= 0x3ff_ffff; - h0 += c * 5; - - c = h0 >> 26; - h0 &= 0x3ff_ffff; - h1 += c; - - // compute h + -p - let mut g0 = h0.wrapping_add(5); - c = g0 >> 26; - g0 &= 0x3ff_ffff; - - let mut g1 = h1.wrapping_add(c); - c = g1 >> 26; - g1 &= 0x3ff_ffff; - - let mut g2 = h2.wrapping_add(c); - c = g2 >> 26; - g2 &= 0x3ff_ffff; - - let mut g3 = h3.wrapping_add(c); - c = g3 >> 26; - g3 &= 0x3ff_ffff; - - let mut g4 = h4.wrapping_add(c).wrapping_sub(1 << 26); - - // select h if h < p, or h + -p if h >= p - let mut mask = (g4 >> (32 - 1)).wrapping_sub(1); - g0 &= mask; - g1 &= mask; - g2 &= mask; - g3 &= mask; - g4 &= mask; - mask = !mask; - h0 = (h0 & mask) | g0; - h1 = (h1 & mask) | g1; - h2 = (h2 & mask) | g2; - h3 = (h3 & mask) | g3; - h4 = (h4 & mask) | g4; - - // h = h % (2^128) - h0 |= h1 << 26; - h1 = (h1 >> 6) | (h2 << 20); - h2 = (h2 >> 12) | (h3 << 14); - h3 = (h3 >> 18) | (h4 << 8); - - // h = mac = (h + pad) % (2^128) - let mut f: u64; - f = u64::from(h0) + u64::from(self.pad[0]); - h0 = f as u32; - - f = u64::from(h1) + u64::from(self.pad[1]) + (f >> 32); - h1 = f as u32; - - f = u64::from(h2) + u64::from(self.pad[2]) + (f >> 32); - h2 = f as u32; - - f = u64::from(h3) + u64::from(self.pad[3]) + (f >> 32); - h3 = f as u32; - - let mut tag = GenericArray::default(); - tag[0..4].copy_from_slice(&h0.to_le_bytes()); - tag[4..8].copy_from_slice(&h1.to_le_bytes()); - tag[8..12].copy_from_slice(&h2.to_le_bytes()); - tag[12..16].copy_from_slice(&h3.to_le_bytes()); - - Tag::new(tag) - } -} - -impl Poly1305 { - /// Input data into the Poly1305 universal hash function - pub fn update(&mut self, data: &[u8]) { - let mut m = data; - - if self.leftover > 0 { - let want = min(16 - self.leftover, m.len()); - - for (i, byte) in m.iter().cloned().enumerate().take(want) { - self.buffer[self.leftover + i] = byte; - } - - m = &m[want..]; - self.leftover += want; - - if self.leftover < BLOCK_SIZE { - return; - } - - self.compute_block(false); - self.leftover = 0; - } - - while m.len() >= BLOCK_SIZE { - // TODO(tarcieri): avoid copying data into the buffer here - self.buffer.copy_from_slice(&m[..BLOCK_SIZE]); - self.compute_block(false); - m = &m[BLOCK_SIZE..]; - } - - self.buffer[..m.len()].copy_from_slice(m); - self.leftover = m.len(); - } - - /// Process input messages in a chained manner - pub fn chain(mut self, data: &[u8]) -> Self { - self.update(data); - self - } - - /// Compute a single block of Poly1305 using the internal buffer - fn compute_block(&mut self, finished: bool) { - let hibit = if finished { 0 } else { 1 << 24 }; - - let r0 = self.r[0]; - let r1 = self.r[1]; - let r2 = self.r[2]; - let r3 = self.r[3]; - let r4 = self.r[4]; - - let s1 = r1 * 5; - let s2 = r2 * 5; - let s3 = r3 * 5; - let s4 = r4 * 5; - - let mut h0 = self.h[0]; - let mut h1 = self.h[1]; - let mut h2 = self.h[2]; - let mut h3 = self.h[3]; - let mut h4 = self.h[4]; - - // h += m - h0 += (u32::from_le_bytes(self.buffer[0..4].try_into().unwrap())) & 0x3ff_ffff; - h1 += (u32::from_le_bytes(self.buffer[3..7].try_into().unwrap()) >> 2) & 0x3ff_ffff; - h2 += (u32::from_le_bytes(self.buffer[6..10].try_into().unwrap()) >> 4) & 0x3ff_ffff; - h3 += (u32::from_le_bytes(self.buffer[9..13].try_into().unwrap()) >> 6) & 0x3ff_ffff; - h4 += (u32::from_le_bytes(self.buffer[12..16].try_into().unwrap()) >> 8) | hibit; - - // h *= r - let d0 = (u64::from(h0) * u64::from(r0)) - + (u64::from(h1) * u64::from(s4)) - + (u64::from(h2) * u64::from(s3)) - + (u64::from(h3) * u64::from(s2)) - + (u64::from(h4) * u64::from(s1)); - - let mut d1 = (u64::from(h0) * u64::from(r1)) - + (u64::from(h1) * u64::from(r0)) - + (u64::from(h2) * u64::from(s4)) - + (u64::from(h3) * u64::from(s3)) - + (u64::from(h4) * u64::from(s2)); - - let mut d2 = (u64::from(h0) * u64::from(r2)) - + (u64::from(h1) * u64::from(r1)) - + (u64::from(h2) * u64::from(r0)) - + (u64::from(h3) * u64::from(s4)) - + (u64::from(h4) * u64::from(s3)); - - let mut d3 = (u64::from(h0) * u64::from(r3)) - + (u64::from(h1) * u64::from(r2)) - + (u64::from(h2) * u64::from(r1)) - + (u64::from(h3) * u64::from(r0)) - + (u64::from(h4) * u64::from(s4)); - - let mut d4 = (u64::from(h0) * u64::from(r4)) - + (u64::from(h1) * u64::from(r3)) - + (u64::from(h2) * u64::from(r2)) - + (u64::from(h3) * u64::from(r1)) - + (u64::from(h4) * u64::from(r0)); - - // (partial) h %= p - let mut c: u32; - c = (d0 >> 26) as u32; - h0 = d0 as u32 & 0x3ff_ffff; - d1 += u64::from(c); - - c = (d1 >> 26) as u32; - h1 = d1 as u32 & 0x3ff_ffff; - d2 += u64::from(c); - - c = (d2 >> 26) as u32; - h2 = d2 as u32 & 0x3ff_ffff; - d3 += u64::from(c); - - c = (d3 >> 26) as u32; - h3 = d3 as u32 & 0x3ff_ffff; - d4 += u64::from(c); - - c = (d4 >> 26) as u32; - h4 = d4 as u32 & 0x3ff_ffff; - h0 += c * 5; - - c = h0 >> 26; - h0 &= 0x3ff_ffff; - h1 += c; - - self.h[0] = h0; - self.h[1] = h1; - self.h[2] = h2; - self.h[3] = h3; - self.h[4] = h4; - } -} - -#[cfg(feature = "zeroize")] -impl Drop for Poly1305 { - fn drop(&mut self) { - self.r.zeroize(); - self.h.zeroize(); - self.pad.zeroize(); - self.buffer.zeroize(); - } -} diff --git a/poly1305/src/soft.rs b/poly1305/src/soft.rs new file mode 100644 index 0000000..9b8cc33 --- /dev/null +++ b/poly1305/src/soft.rs @@ -0,0 +1,326 @@ +//! Portable software implementation of the Poly1305 universal hash function. + +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// This code originates from the rust-crypto project: +// +// +// ...and was originally a port of Andrew Moons poly1305-donna +// https://github.com/floodyberry/poly1305-donna + +use super::{Block, Tag, BLOCK_SIZE}; +use core::{cmp::min, convert::TryInto}; +use universal_hash::generic_array::{ + typenum::{U16, U32}, + GenericArray, +}; +use universal_hash::UniversalHash; +#[cfg(feature = "zeroize")] +use zeroize::Zeroize; + +/// The Poly1305 universal hash function. +/// +/// Note that Poly1305 is not a traditional MAC and is single-use only +/// (a.k.a. "one-time authenticator"). +/// +/// For this reason it doesn't impl the `crypto_mac::Mac` trait. +#[derive(Clone)] +pub struct Poly1305 { + r: [u32; 5], + h: [u32; 5], + pad: [u32; 4], + leftover: usize, + buffer: Block, +} + +impl UniversalHash for Poly1305 { + type KeySize = U32; + type BlockSize = U16; + + /// Initialize Poly1305 with the given key + fn new(key: &GenericArray) -> Poly1305 { + let mut poly = Poly1305 { + r: [0u32; 5], + h: [0u32; 5], + pad: [0u32; 4], + leftover: 0, + buffer: Block::default(), + }; + + // r &= 0xffffffc0ffffffc0ffffffc0fffffff + poly.r[0] = (u32::from_le_bytes(key[0..4].try_into().unwrap())) & 0x3ff_ffff; + poly.r[1] = (u32::from_le_bytes(key[3..7].try_into().unwrap()) >> 2) & 0x3ff_ff03; + poly.r[2] = (u32::from_le_bytes(key[6..10].try_into().unwrap()) >> 4) & 0x3ff_c0ff; + poly.r[3] = (u32::from_le_bytes(key[9..13].try_into().unwrap()) >> 6) & 0x3f0_3fff; + poly.r[4] = (u32::from_le_bytes(key[12..16].try_into().unwrap()) >> 8) & 0x00f_ffff; + + poly.pad[0] = u32::from_le_bytes(key[16..20].try_into().unwrap()); + poly.pad[1] = u32::from_le_bytes(key[20..24].try_into().unwrap()); + poly.pad[2] = u32::from_le_bytes(key[24..28].try_into().unwrap()); + poly.pad[3] = u32::from_le_bytes(key[28..32].try_into().unwrap()); + + poly + } + + /// Input data into the Poly1305 universal hash function + fn update_block(&mut self, block: &GenericArray) { + // TODO(tarcieri): pass block directly to `Poly1305::compute_block` + self.update(block.as_slice()); + } + + /// Reset internal state + fn reset(&mut self) { + self.h = Default::default(); + self.buffer = Default::default(); + self.leftover = 0; + } + + /// Get the hashed output + fn result(mut self) -> Tag { + if self.leftover > 0 { + self.buffer[self.leftover] = 1; + + for i in (self.leftover + 1)..BLOCK_SIZE { + self.buffer[i] = 0; + } + + self.compute_block(true); + } + + // fully carry h + let mut h0 = self.h[0]; + let mut h1 = self.h[1]; + let mut h2 = self.h[2]; + let mut h3 = self.h[3]; + let mut h4 = self.h[4]; + + let mut c: u32; + c = h1 >> 26; + h1 &= 0x3ff_ffff; + h2 += c; + + c = h2 >> 26; + h2 &= 0x3ff_ffff; + h3 += c; + + c = h3 >> 26; + h3 &= 0x3ff_ffff; + h4 += c; + + c = h4 >> 26; + h4 &= 0x3ff_ffff; + h0 += c * 5; + + c = h0 >> 26; + h0 &= 0x3ff_ffff; + h1 += c; + + // compute h + -p + let mut g0 = h0.wrapping_add(5); + c = g0 >> 26; + g0 &= 0x3ff_ffff; + + let mut g1 = h1.wrapping_add(c); + c = g1 >> 26; + g1 &= 0x3ff_ffff; + + let mut g2 = h2.wrapping_add(c); + c = g2 >> 26; + g2 &= 0x3ff_ffff; + + let mut g3 = h3.wrapping_add(c); + c = g3 >> 26; + g3 &= 0x3ff_ffff; + + let mut g4 = h4.wrapping_add(c).wrapping_sub(1 << 26); + + // select h if h < p, or h + -p if h >= p + let mut mask = (g4 >> (32 - 1)).wrapping_sub(1); + g0 &= mask; + g1 &= mask; + g2 &= mask; + g3 &= mask; + g4 &= mask; + mask = !mask; + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + h2 = (h2 & mask) | g2; + h3 = (h3 & mask) | g3; + h4 = (h4 & mask) | g4; + + // h = h % (2^128) + h0 |= h1 << 26; + h1 = (h1 >> 6) | (h2 << 20); + h2 = (h2 >> 12) | (h3 << 14); + h3 = (h3 >> 18) | (h4 << 8); + + // h = mac = (h + pad) % (2^128) + let mut f: u64; + f = u64::from(h0) + u64::from(self.pad[0]); + h0 = f as u32; + + f = u64::from(h1) + u64::from(self.pad[1]) + (f >> 32); + h1 = f as u32; + + f = u64::from(h2) + u64::from(self.pad[2]) + (f >> 32); + h2 = f as u32; + + f = u64::from(h3) + u64::from(self.pad[3]) + (f >> 32); + h3 = f as u32; + + let mut tag = GenericArray::default(); + tag[0..4].copy_from_slice(&h0.to_le_bytes()); + tag[4..8].copy_from_slice(&h1.to_le_bytes()); + tag[8..12].copy_from_slice(&h2.to_le_bytes()); + tag[12..16].copy_from_slice(&h3.to_le_bytes()); + + Tag::new(tag) + } +} + +impl Poly1305 { + /// Input data into the Poly1305 universal hash function + pub fn update(&mut self, data: &[u8]) { + let mut m = data; + + if self.leftover > 0 { + let want = min(16 - self.leftover, m.len()); + + for (i, byte) in m.iter().cloned().enumerate().take(want) { + self.buffer[self.leftover + i] = byte; + } + + m = &m[want..]; + self.leftover += want; + + if self.leftover < BLOCK_SIZE { + return; + } + + self.compute_block(false); + self.leftover = 0; + } + + while m.len() >= BLOCK_SIZE { + // TODO(tarcieri): avoid copying data into the buffer here + self.buffer.copy_from_slice(&m[..BLOCK_SIZE]); + self.compute_block(false); + m = &m[BLOCK_SIZE..]; + } + + self.buffer[..m.len()].copy_from_slice(m); + self.leftover = m.len(); + } + + /// Process input messages in a chained manner + pub fn chain(mut self, data: &[u8]) -> Self { + self.update(data); + self + } + + /// Compute a single block of Poly1305 using the internal buffer + fn compute_block(&mut self, finished: bool) { + let hibit = if finished { 0 } else { 1 << 24 }; + + let r0 = self.r[0]; + let r1 = self.r[1]; + let r2 = self.r[2]; + let r3 = self.r[3]; + let r4 = self.r[4]; + + let s1 = r1 * 5; + let s2 = r2 * 5; + let s3 = r3 * 5; + let s4 = r4 * 5; + + let mut h0 = self.h[0]; + let mut h1 = self.h[1]; + let mut h2 = self.h[2]; + let mut h3 = self.h[3]; + let mut h4 = self.h[4]; + + // h += m + h0 += (u32::from_le_bytes(self.buffer[0..4].try_into().unwrap())) & 0x3ff_ffff; + h1 += (u32::from_le_bytes(self.buffer[3..7].try_into().unwrap()) >> 2) & 0x3ff_ffff; + h2 += (u32::from_le_bytes(self.buffer[6..10].try_into().unwrap()) >> 4) & 0x3ff_ffff; + h3 += (u32::from_le_bytes(self.buffer[9..13].try_into().unwrap()) >> 6) & 0x3ff_ffff; + h4 += (u32::from_le_bytes(self.buffer[12..16].try_into().unwrap()) >> 8) | hibit; + + // h *= r + let d0 = (u64::from(h0) * u64::from(r0)) + + (u64::from(h1) * u64::from(s4)) + + (u64::from(h2) * u64::from(s3)) + + (u64::from(h3) * u64::from(s2)) + + (u64::from(h4) * u64::from(s1)); + + let mut d1 = (u64::from(h0) * u64::from(r1)) + + (u64::from(h1) * u64::from(r0)) + + (u64::from(h2) * u64::from(s4)) + + (u64::from(h3) * u64::from(s3)) + + (u64::from(h4) * u64::from(s2)); + + let mut d2 = (u64::from(h0) * u64::from(r2)) + + (u64::from(h1) * u64::from(r1)) + + (u64::from(h2) * u64::from(r0)) + + (u64::from(h3) * u64::from(s4)) + + (u64::from(h4) * u64::from(s3)); + + let mut d3 = (u64::from(h0) * u64::from(r3)) + + (u64::from(h1) * u64::from(r2)) + + (u64::from(h2) * u64::from(r1)) + + (u64::from(h3) * u64::from(r0)) + + (u64::from(h4) * u64::from(s4)); + + let mut d4 = (u64::from(h0) * u64::from(r4)) + + (u64::from(h1) * u64::from(r3)) + + (u64::from(h2) * u64::from(r2)) + + (u64::from(h3) * u64::from(r1)) + + (u64::from(h4) * u64::from(r0)); + + // (partial) h %= p + let mut c: u32; + c = (d0 >> 26) as u32; + h0 = d0 as u32 & 0x3ff_ffff; + d1 += u64::from(c); + + c = (d1 >> 26) as u32; + h1 = d1 as u32 & 0x3ff_ffff; + d2 += u64::from(c); + + c = (d2 >> 26) as u32; + h2 = d2 as u32 & 0x3ff_ffff; + d3 += u64::from(c); + + c = (d3 >> 26) as u32; + h3 = d3 as u32 & 0x3ff_ffff; + d4 += u64::from(c); + + c = (d4 >> 26) as u32; + h4 = d4 as u32 & 0x3ff_ffff; + h0 += c * 5; + + c = h0 >> 26; + h0 &= 0x3ff_ffff; + h1 += c; + + self.h[0] = h0; + self.h[1] = h1; + self.h[2] = h2; + self.h[3] = h3; + self.h[4] = h4; + } +} + +#[cfg(feature = "zeroize")] +impl Drop for Poly1305 { + fn drop(&mut self) { + self.r.zeroize(); + self.h.zeroize(); + self.pad.zeroize(); + self.buffer.zeroize(); + } +}