poly1305/backend/avx2.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
//! AVX2 implementation of the Poly1305 state machine.
// The State struct and its logic was originally derived from Goll and Gueron's AVX2 C
// code:
// [Vectorization of Poly1305 message authentication code](https://ieeexplore.ieee.org/document/7113463)
//
// which was sourced from Bhattacharyya and Sarkar's modified variant:
// [Improved SIMD Implementation of Poly1305](https://eprint.iacr.org/2019/842)
// https://github.com/Sreyosi/Improved-SIMD-Implementation-of-Poly1305
//
// The logic has been extensively rewritten and documented, and several bugs in the
// original C code were fixed.
//
// Note that State only implements the original Goll-Gueron algorithm, not the
// optimisations provided by Bhattacharyya and Sarkar. The latter require the message
// length to be known, which is incompatible with the streaming API of UniversalHash.
use universal_hash::{
consts::{U16, U4},
crypto_common::{BlockSizeUser, ParBlocksSizeUser},
generic_array::GenericArray,
UhfBackend,
};
use crate::{Block, Key, Tag};
mod helpers;
use self::helpers::*;
/// Four Poly1305 blocks (64-bytes)
type ParBlocks = universal_hash::ParBlocks<State>;
#[derive(Copy, Clone)]
struct Initialized {
p: Aligned4x130,
m: SpacedMultiplier4x130,
r4: PrecomputedMultiplier,
}
#[derive(Clone)]
pub(crate) struct State {
k: AdditionKey,
r1: PrecomputedMultiplier,
r2: PrecomputedMultiplier,
initialized: Option<Initialized>,
cached_blocks: [Block; 4],
num_cached_blocks: usize,
partial_block: Option<Block>,
}
impl State {
/// Initialize Poly1305 [`State`] with the given key
pub(crate) fn new(key: &Key) -> Self {
// Prepare addition key and polynomial key.
let (k, r1) = unsafe { prepare_keys(key) };
// Precompute R^2.
let r2 = (r1 * r1).reduce();
State {
k,
r1,
r2: r2.into(),
initialized: None,
cached_blocks: [Block::default(); 4],
num_cached_blocks: 0,
partial_block: None,
}
}
/// Process four Poly1305 blocks at once.
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn compute_par_blocks(&mut self, blocks: &ParBlocks) {
assert!(self.partial_block.is_none());
assert_eq!(self.num_cached_blocks, 0);
self.process_blocks(Aligned4x130::from_par_blocks(blocks));
}
/// Compute a Poly1305 block
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn compute_block(&mut self, block: &Block, partial: bool) {
// We can cache a single partial block.
if partial {
assert!(self.partial_block.is_none());
self.partial_block = Some(*block);
return;
}
self.cached_blocks[self.num_cached_blocks].copy_from_slice(block);
if self.num_cached_blocks < 3 {
self.num_cached_blocks += 1;
return;
} else {
self.num_cached_blocks = 0;
}
self.process_blocks(Aligned4x130::from_blocks(&self.cached_blocks));
}
/// Compute a Poly1305 block
#[target_feature(enable = "avx2")]
unsafe fn process_blocks(&mut self, blocks: Aligned4x130) {
if let Some(inner) = &mut self.initialized {
// P <-- R^4 * P + blocks
inner.p = (&inner.p * inner.r4).reduce() + blocks;
} else {
// Initialize the polynomial.
let p = blocks;
// Initialize the multiplier (used to merge down the polynomial during
// finalization).
let (m, r4) = SpacedMultiplier4x130::new(self.r1, self.r2);
self.initialized = Some(Initialized { p, m, r4 })
}
}
/// Finalize output producing a [`Tag`]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn finalize(&mut self) -> Tag {
assert!(self.num_cached_blocks < 4);
let mut data = &self.cached_blocks[..];
// T ← R◦T
// P = T_0 + T_1 + T_2 + T_3
let mut p = self
.initialized
.take()
.map(|inner| (inner.p * inner.m).sum().reduce());
if self.num_cached_blocks >= 2 {
// Compute 32 byte block (remaining data < 64 bytes)
let mut c = Aligned2x130::from_blocks(data[..2].try_into().unwrap());
if let Some(p) = p {
c = c + p;
}
p = Some(c.mul_and_sum(self.r1, self.r2).reduce());
data = &data[2..];
self.num_cached_blocks -= 2;
}
if self.num_cached_blocks == 1 {
// Compute 16 byte block (remaining data < 32 bytes)
let mut c = Aligned130::from_block(&data[0]);
if let Some(p) = p {
c = c + p;
}
p = Some((c * self.r1).reduce());
self.num_cached_blocks -= 1;
}
if let Some(block) = &self.partial_block {
// Compute last block (remaining data < 16 bytes)
let mut c = Aligned130::from_partial_block(block);
if let Some(p) = p {
c = c + p;
}
p = Some((c * self.r1).reduce());
}
// Compute tag: p + k mod 2^128
let mut tag = GenericArray::<u8, _>::default();
let tag_int = if let Some(p) = p {
self.k + p
} else {
self.k.into()
};
tag_int.write(tag.as_mut_slice());
tag
}
}
impl BlockSizeUser for State {
type BlockSize = U16;
}
impl ParBlocksSizeUser for State {
type ParBlocksSize = U4;
}
impl UhfBackend for State {
fn proc_block(&mut self, block: &Block) {
unsafe { self.compute_block(block, false) };
}
fn proc_par_blocks(&mut self, blocks: &ParBlocks) {
if self.num_cached_blocks == 0 {
// Fast path.
unsafe { self.compute_par_blocks(blocks) };
} else {
// We are unaligned; use the slow fallback.
for block in blocks {
self.proc_block(block);
}
}
}
fn blocks_needed_to_align(&self) -> usize {
if self.num_cached_blocks == 0 {
// There are no cached blocks; fast path is available.
0
} else {
// There are cached blocks; report how many more we need.
self.cached_blocks.len() - self.num_cached_blocks
}
}
}