mirror of
https://github.com/ziglang/zig.git
synced 2025-12-24 23:23:07 +00:00
std/crypto: vectorize BLAKE3
Gives a ~40% speedup on x86_64. However, the generic code remains faster on aarch64. This is still processing only one block at a time for now. I'm pretty confident that processing more blocks per round will eventually give a substantial performance improvement on all platforms with vector units.
This commit is contained in:
parent
1b4ab749cf
commit
72064eba23
@ -11,6 +11,7 @@ const fmt = std.fmt;
|
|||||||
const math = std.math;
|
const math = std.math;
|
||||||
const mem = std.mem;
|
const mem = std.mem;
|
||||||
const testing = std.testing;
|
const testing = std.testing;
|
||||||
|
const Vector = std.meta.Vector;
|
||||||
|
|
||||||
const ChunkIterator = struct {
|
const ChunkIterator = struct {
|
||||||
slice: []u8,
|
slice: []u8,
|
||||||
@ -61,87 +62,173 @@ const KEYED_HASH: u8 = 1 << 4;
|
|||||||
const DERIVE_KEY_CONTEXT: u8 = 1 << 5;
|
const DERIVE_KEY_CONTEXT: u8 = 1 << 5;
|
||||||
const DERIVE_KEY_MATERIAL: u8 = 1 << 6;
|
const DERIVE_KEY_MATERIAL: u8 = 1 << 6;
|
||||||
|
|
||||||
// The mixing function, G, which mixes either a column or a diagonal.
|
const CompressVectorized = struct {
|
||||||
fn g(state: *[16]u32, a: usize, b: usize, c: usize, d: usize, mx: u32, my: u32) void {
|
const Lane = Vector(4, u32);
|
||||||
_ = @addWithOverflow(u32, state[a], state[b], &state[a]);
|
const Rows = [4]Lane;
|
||||||
_ = @addWithOverflow(u32, state[a], mx, &state[a]);
|
|
||||||
state[d] = math.rotr(u32, state[d] ^ state[a], 16);
|
|
||||||
_ = @addWithOverflow(u32, state[c], state[d], &state[c]);
|
|
||||||
state[b] = math.rotr(u32, state[b] ^ state[c], 12);
|
|
||||||
_ = @addWithOverflow(u32, state[a], state[b], &state[a]);
|
|
||||||
_ = @addWithOverflow(u32, state[a], my, &state[a]);
|
|
||||||
state[d] = math.rotr(u32, state[d] ^ state[a], 8);
|
|
||||||
_ = @addWithOverflow(u32, state[c], state[d], &state[c]);
|
|
||||||
state[b] = math.rotr(u32, state[b] ^ state[c], 7);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn round(state: *[16]u32, msg: [16]u32, schedule: [16]u8) void {
|
inline fn rot(x: Lane, comptime n: u5) Lane {
|
||||||
// Mix the columns.
|
return (x >> @splat(4, @as(u5, n))) | (x << @splat(4, @as(u5, 1 +% ~n)));
|
||||||
g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
|
|
||||||
g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
|
|
||||||
g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
|
|
||||||
g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
|
|
||||||
|
|
||||||
// Mix the diagonals.
|
|
||||||
g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
|
|
||||||
g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
|
|
||||||
g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
|
|
||||||
g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn compress(
|
|
||||||
chaining_value: [8]u32,
|
|
||||||
block_words: [16]u32,
|
|
||||||
block_len: u32,
|
|
||||||
counter: u64,
|
|
||||||
flags: u8,
|
|
||||||
) [16]u32 {
|
|
||||||
var state = [16]u32{
|
|
||||||
chaining_value[0],
|
|
||||||
chaining_value[1],
|
|
||||||
chaining_value[2],
|
|
||||||
chaining_value[3],
|
|
||||||
chaining_value[4],
|
|
||||||
chaining_value[5],
|
|
||||||
chaining_value[6],
|
|
||||||
chaining_value[7],
|
|
||||||
IV[0],
|
|
||||||
IV[1],
|
|
||||||
IV[2],
|
|
||||||
IV[3],
|
|
||||||
@truncate(u32, counter),
|
|
||||||
@truncate(u32, counter >> 32),
|
|
||||||
block_len,
|
|
||||||
flags,
|
|
||||||
};
|
|
||||||
for (MSG_SCHEDULE) |schedule| {
|
|
||||||
round(&state, block_words, schedule);
|
|
||||||
}
|
}
|
||||||
for (chaining_value) |_, i| {
|
|
||||||
state[i] ^= state[i + 8];
|
inline fn g(comptime even: bool, rows: *Rows, m: Lane) void {
|
||||||
state[i + 8] ^= chaining_value[i];
|
rows[0] +%= rows[1] +% m;
|
||||||
|
rows[3] ^= rows[0];
|
||||||
|
rows[3] = rot(rows[3], if (even) 8 else 16);
|
||||||
|
rows[2] +%= rows[3];
|
||||||
|
rows[1] ^= rows[2];
|
||||||
|
rows[1] = rot(rows[1], if (even) 7 else 12);
|
||||||
}
|
}
|
||||||
return state;
|
|
||||||
}
|
// Rotates the lanes of rows 0, 3 and 2 in place; row 1 is not touched.
// compress calls this between the second and third g call of every round.
// All mask entries are non-negative, so each @shuffle reads only its first
// operand and the `undefined` second operand is never selected.
inline fn diagonalize(rows: *Rows) void {
|
||||||
|
// New lane i takes old lane (i + 3) % 4.
rows[0] = @shuffle(u32, rows[0], undefined, [_]i32{ 3, 0, 1, 2 });
|
||||||
|
// New lane i takes old lane (i + 2) % 4.
rows[3] = @shuffle(u32, rows[3], undefined, [_]i32{ 2, 3, 0, 1 });
|
||||||
|
// New lane i takes old lane (i + 1) % 4.
rows[2] = @shuffle(u32, rows[2], undefined, [_]i32{ 1, 2, 3, 0 });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Rotates the lanes of rows 0, 3 and 2 in place using the inverse masks of
// diagonalize, so diagonalize followed by undiagonalize restores the original
// lane order. Row 1 is not touched. compress calls this after the fourth g
// call of every round.
inline fn undiagonalize(rows: *Rows) void {
|
||||||
|
// New lane i takes old lane (i + 1) % 4 (undoes the { 3, 0, 1, 2 } rotation).
rows[0] = @shuffle(u32, rows[0], undefined, [_]i32{ 1, 2, 3, 0 });
|
||||||
|
// New lane i takes old lane (i + 2) % 4 (a two-lane rotation is its own inverse).
rows[3] = @shuffle(u32, rows[3], undefined, [_]i32{ 2, 3, 0, 1 });
|
||||||
|
// New lane i takes old lane (i + 3) % 4 (undoes the { 1, 2, 3, 0 } rotation).
rows[2] = @shuffle(u32, rows[2], undefined, [_]i32{ 3, 0, 1, 2 });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Vectorized compression of a single block.
//
// The 16-word state is held as four 4-lane vectors (Rows) and the 16 message
// words as four more. Seven rounds are run in total: one written out before
// the loop plus six loop iterations. Each round calls g four times, with
// diagonalize/undiagonalize around the second pair of calls.
//
// Returns the 16 state words after the final XOR step: words 0..7 are
// rows[0..2] XORed with rows[2..4], words 8..15 are rows[2..4] XORed with
// chaining_value.
//
// In the @shuffle masks below a non-negative entry k selects lane k of the
// first operand and `(-1 - k)` selects lane k of the second operand.
fn compress(
|
||||||
|
chaining_value: [8]u32,
|
||||||
|
block_words: [16]u32,
|
||||||
|
block_len: u32,
|
||||||
|
counter: u64,
|
||||||
|
flags: u8,
|
||||||
|
) [16]u32 {
|
||||||
|
// Fourth state row: low and high 32 bits of counter, block_len, flags.
const md = Lane{ @truncate(u32, counter), @truncate(u32, counter >> 32), block_len, @as(u32, flags) };
|
||||||
|
// State rows: chaining value words 0..3 and 4..7, IV words 0..3, then md.
var rows = Rows{ chaining_value[0..4].*, chaining_value[4..8].*, IV[0..4].*, md };
|
||||||
|
|
||||||
|
// Message words in block order, four per lane vector.
var m = Rows{ block_words[0..4].*, block_words[4..8].*, block_words[8..12].*, block_words[12..16].* };
|
||||||
|
// First round. t0 = block words 0, 2, 4, 6.
var t0 = @shuffle(u32, m[0], m[1], [_]i32{ 0, 2, (-1 - 0), (-1 - 2) });
|
||||||
|
// g(false, ...) uses the 16- and 12-bit rotations, g(true, ...) the 8- and 7-bit ones.
g(false, &rows, t0);
|
||||||
|
// t1 = block words 1, 3, 5, 7.
var t1 = @shuffle(u32, m[0], m[1], [_]i32{ 1, 3, (-1 - 1), (-1 - 3) });
|
||||||
|
g(true, &rows, t1);
|
||||||
|
diagonalize(&rows);
|
||||||
|
// t2 = block words 8, 10, 12, 14, then rotated by one lane to 14, 8, 10, 12.
var t2 = @shuffle(u32, m[2], m[3], [_]i32{ 0, 2, (-1 - 0), (-1 - 2) });
|
||||||
|
t2 = @shuffle(u32, t2, undefined, [_]i32{ 3, 0, 1, 2 });
|
||||||
|
g(false, &rows, t2);
|
||||||
|
// t3 = block words 9, 11, 13, 15, then rotated by one lane to 15, 9, 11, 13.
var t3 = @shuffle(u32, m[2], m[3], [_]i32{ 1, 3, (-1 - 1), (-1 - 3) });
|
||||||
|
t3 = @shuffle(u32, t3, undefined, [_]i32{ 3, 0, 1, 2 });
|
||||||
|
g(true, &rows, t3);
|
||||||
|
undiagonalize(&rows);
|
||||||
|
// Keep the lanes just used; the next round derives its lanes from them.
m = Rows{ t0, t1, t2, t3 };
|
||||||
|
|
||||||
|
// Remaining six rounds. Each iteration builds its four message lanes from
// the previous round's lanes (m) with the same fixed shuffle sequence.
// NOTE(review): presumably equivalent to the per-round message permutation
// that CompressGeneric takes from MSG_SCHEDULE; not verifiable from this view.
var i: usize = 0;
|
||||||
|
while (i < 6) : (i += 1) {
|
||||||
|
t0 = @shuffle(u32, m[0], m[1], [_]i32{ 2, 1, (-1 - 1), (-1 - 3) });
|
||||||
|
t0 = @shuffle(u32, t0, undefined, [_]i32{ 1, 2, 3, 0 });
|
||||||
|
g(false, &rows, t0);
|
||||||
|
t1 = @shuffle(u32, m[2], m[3], [_]i32{ 2, 2, (-1 - 3), (-1 - 3) });
|
||||||
|
// tt is a scratch vector reused for intermediate shuffles in this iteration.
var tt = @shuffle(u32, m[0], undefined, [_]i32{ 3, 3, 0, 0 });
|
||||||
|
t1 = @shuffle(u32, tt, t1, [_]i32{ 0, (-1 - 1), 2, (-1 - 3) });
|
||||||
|
g(true, &rows, t1);
|
||||||
|
diagonalize(&rows);
|
||||||
|
t2 = @shuffle(u32, m[3], m[1], [_]i32{ 0, 1, (-1 - 0), (-1 - 1) });
|
||||||
|
tt = @shuffle(u32, t2, m[2], [_]i32{ 0, 1, 2, (-1 - 3) });
|
||||||
|
t2 = @shuffle(u32, tt, undefined, [_]i32{ 0, 2, 3, 1 });
|
||||||
|
g(false, &rows, t2);
|
||||||
|
t3 = @shuffle(u32, m[1], m[3], [_]i32{ 2, (-1 - 2), 3, (-1 - 3) });
|
||||||
|
tt = @shuffle(u32, m[2], t3, [_]i32{ 0, (-1 - 0), 1, (-1 - 1) });
|
||||||
|
t3 = @shuffle(u32, tt, undefined, [_]i32{ 2, 3, 1, 0 });
|
||||||
|
g(true, &rows, t3);
|
||||||
|
undiagonalize(&rows);
|
||||||
|
m = Rows{ t0, t1, t2, t3 };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final XOR step: fold the lower half of the state into the upper half,
// then XOR the lower half with the input chaining value.
rows[0] ^= rows[2];
|
||||||
|
rows[1] ^= rows[3];
|
||||||
|
rows[2] ^= Vector(4, u32){ chaining_value[0], chaining_value[1], chaining_value[2], chaining_value[3] };
|
||||||
|
rows[3] ^= Vector(4, u32){ chaining_value[4], chaining_value[5], chaining_value[6], chaining_value[7] };
|
||||||
|
|
||||||
|
// Reinterpret the four 4-lane rows as the flat 16-word result.
return @bitCast([16]u32, rows);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Scalar implementation of the compression function, operating on a flat
// 16-word state one word at a time. Used on targets where the vectorized
// version is not selected.
const CompressGeneric = struct {
|
||||||
|
// The mixing function: updates the four state words at comptime indices
// a, b, c, d using two message words mx and my. Additions wrap (+%), and
// the XOR results are rotated right by 16, 12, 8 and 7 bits in that order.
fn g(state: *[16]u32, comptime a: usize, comptime b: usize, comptime c: usize, comptime d: usize, mx: u32, my: u32) void {
|
||||||
|
state[a] +%= state[b] +% mx;
|
||||||
|
state[d] = math.rotr(u32, state[d] ^ state[a], 16);
|
||||||
|
state[c] +%= state[d];
|
||||||
|
state[b] = math.rotr(u32, state[b] ^ state[c], 12);
|
||||||
|
state[a] +%= state[b] +% my;
|
||||||
|
state[d] = math.rotr(u32, state[d] ^ state[a], 8);
|
||||||
|
state[c] +%= state[d];
|
||||||
|
state[b] = math.rotr(u32, state[b] ^ state[c], 7);
|
||||||
|
}
|
||||||
|
|
||||||
|
// One round: eight calls to g. `schedule` gives, for each of the 16 message
// slots consumed by the round, the index into `msg` to read from.
fn round(state: *[16]u32, msg: [16]u32, schedule: [16]u8) void {
|
||||||
|
// Mix the columns.
|
||||||
|
g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
|
||||||
|
g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
|
||||||
|
g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
|
||||||
|
g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
|
||||||
|
|
||||||
|
// Mix the diagonals.
|
||||||
|
g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
|
||||||
|
g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
|
||||||
|
g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
|
||||||
|
g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compresses one block and returns the full 16-word state.
// The state is initialised as: words 0..7 = chaining_value, words 8..11 =
// IV[0..4], words 12 and 13 = low and high 32 bits of counter, word 14 =
// block_len, word 15 = flags. One round is run per entry of MSG_SCHEDULE.
fn compress(
|
||||||
|
chaining_value: [8]u32,
|
||||||
|
block_words: [16]u32,
|
||||||
|
block_len: u32,
|
||||||
|
counter: u64,
|
||||||
|
flags: u8,
|
||||||
|
) [16]u32 {
|
||||||
|
var state = [16]u32{
|
||||||
|
chaining_value[0],
|
||||||
|
chaining_value[1],
|
||||||
|
chaining_value[2],
|
||||||
|
chaining_value[3],
|
||||||
|
chaining_value[4],
|
||||||
|
chaining_value[5],
|
||||||
|
chaining_value[6],
|
||||||
|
chaining_value[7],
|
||||||
|
IV[0],
|
||||||
|
IV[1],
|
||||||
|
IV[2],
|
||||||
|
IV[3],
|
||||||
|
@truncate(u32, counter),
|
||||||
|
@truncate(u32, counter >> 32),
|
||||||
|
block_len,
|
||||||
|
flags,
|
||||||
|
};
|
||||||
|
for (MSG_SCHEDULE) |schedule| {
|
||||||
|
round(&state, block_words, schedule);
|
||||||
|
}
|
||||||
|
// Final XOR step: fold the upper half into the lower half, then XOR the
// upper half with the input chaining value.
for (chaining_value) |_, i| {
|
||||||
|
state[i] ^= state[i + 8];
|
||||||
|
state[i + 8] ^= chaining_value[i];
|
||||||
|
}
|
||||||
|
return state;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Selects the compression implementation from the target CPU architecture:
// CompressVectorized.compress on x86_64, CompressGeneric.compress everywhere else.
const compress = if (std.Target.current.cpu.arch == .x86_64) CompressVectorized.compress else CompressGeneric.compress;
|
||||||
|
|
||||||
fn first8Words(words: [16]u32) [8]u32 {
|
fn first8Words(words: [16]u32) [8]u32 {
|
||||||
return @ptrCast(*const [8]u32, &words).*;
|
return @ptrCast(*const [8]u32, &words).*;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn wordsFromLittleEndianBytes(words: []u32, bytes: []const u8) void {
|
fn wordsFromLittleEndianBytes(comptime count: usize, bytes: [count * 4]u8) [count]u32 {
|
||||||
var byte_slice = bytes;
|
var words: [count]u32 = undefined;
|
||||||
for (words) |*word| {
|
for (words) |*word, i| {
|
||||||
word.* = mem.readIntSliceLittle(u32, byte_slice);
|
word.* = mem.readIntSliceLittle(u32, bytes[4 * i ..]);
|
||||||
byte_slice = byte_slice[4..];
|
|
||||||
}
|
}
|
||||||
|
return words;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Each chunk or parent node can produce either an 8-word chaining value or, by
|
// Each chunk or parent node can produce either an 8-word chaining value or, by
|
||||||
// setting the ROOT flag, any number of final output bytes. The Output struct
|
// setting the ROOT flag, any number of final output bytes. The Output struct
|
||||||
// captures the state just prior to choosing between those two possibilities.
|
// captures the state just prior to choosing between those two possibilities.
|
||||||
const Output = struct {
|
const Output = struct {
|
||||||
input_chaining_value: [8]u32,
|
input_chaining_value: [8]u32 align(16),
|
||||||
block_words: [16]u32,
|
block_words: [16]u32 align(16),
|
||||||
block_len: u32,
|
block_len: u32,
|
||||||
counter: u64,
|
counter: u64,
|
||||||
flags: u8,
|
flags: u8,
|
||||||
@ -181,9 +268,9 @@ const Output = struct {
|
|||||||
};
|
};
|
||||||
|
|
||||||
const ChunkState = struct {
|
const ChunkState = struct {
|
||||||
chaining_value: [8]u32,
|
chaining_value: [8]u32 align(16),
|
||||||
chunk_counter: u64,
|
chunk_counter: u64,
|
||||||
block: [BLOCK_LEN]u8 = [_]u8{0} ** BLOCK_LEN,
|
block: [BLOCK_LEN]u8 align(16) = [_]u8{0} ** BLOCK_LEN,
|
||||||
block_len: u8 = 0,
|
block_len: u8 = 0,
|
||||||
blocks_compressed: u8 = 0,
|
blocks_compressed: u8 = 0,
|
||||||
flags: u8,
|
flags: u8,
|
||||||
@ -218,8 +305,7 @@ const ChunkState = struct {
|
|||||||
// If the block buffer is full, compress it and clear it. More
|
// If the block buffer is full, compress it and clear it. More
|
||||||
// input is coming, so this compression is not CHUNK_END.
|
// input is coming, so this compression is not CHUNK_END.
|
||||||
if (self.block_len == BLOCK_LEN) {
|
if (self.block_len == BLOCK_LEN) {
|
||||||
var block_words: [16]u32 = undefined;
|
const block_words = wordsFromLittleEndianBytes(16, self.block);
|
||||||
wordsFromLittleEndianBytes(block_words[0..], self.block[0..]);
|
|
||||||
self.chaining_value = first8Words(compress(
|
self.chaining_value = first8Words(compress(
|
||||||
self.chaining_value,
|
self.chaining_value,
|
||||||
block_words,
|
block_words,
|
||||||
@ -238,8 +324,7 @@ const ChunkState = struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn output(self: *const ChunkState) Output {
|
fn output(self: *const ChunkState) Output {
|
||||||
var block_words: [16]u32 = undefined;
|
const block_words = wordsFromLittleEndianBytes(16, self.block);
|
||||||
wordsFromLittleEndianBytes(block_words[0..], self.block[0..]);
|
|
||||||
return Output{
|
return Output{
|
||||||
.input_chaining_value = self.chaining_value,
|
.input_chaining_value = self.chaining_value,
|
||||||
.block_words = block_words,
|
.block_words = block_words,
|
||||||
@ -256,7 +341,7 @@ fn parentOutput(
|
|||||||
key: [8]u32,
|
key: [8]u32,
|
||||||
flags: u8,
|
flags: u8,
|
||||||
) Output {
|
) Output {
|
||||||
var block_words: [16]u32 = undefined;
|
var block_words: [16]u32 align(16) = undefined;
|
||||||
mem.copy(u32, block_words[0..8], left_child_cv[0..]);
|
mem.copy(u32, block_words[0..8], left_child_cv[0..]);
|
||||||
mem.copy(u32, block_words[8..], right_child_cv[0..]);
|
mem.copy(u32, block_words[8..], right_child_cv[0..]);
|
||||||
return Output{
|
return Output{
|
||||||
@ -303,8 +388,7 @@ pub const Blake3 = struct {
|
|||||||
/// Construct a new `Blake3` for the hash function, with an optional key
|
/// Construct a new `Blake3` for the hash function, with an optional key
|
||||||
pub fn init(options: Options) Blake3 {
|
pub fn init(options: Options) Blake3 {
|
||||||
if (options.key) |key| {
|
if (options.key) |key| {
|
||||||
var key_words: [8]u32 = undefined;
|
const key_words = wordsFromLittleEndianBytes(8, key);
|
||||||
wordsFromLittleEndianBytes(key_words[0..], key[0..]);
|
|
||||||
return Blake3.init_internal(key_words, KEYED_HASH);
|
return Blake3.init_internal(key_words, KEYED_HASH);
|
||||||
} else {
|
} else {
|
||||||
return Blake3.init_internal(IV, 0);
|
return Blake3.init_internal(IV, 0);
|
||||||
@ -318,8 +402,7 @@ pub const Blake3 = struct {
|
|||||||
context_hasher.update(context);
|
context_hasher.update(context);
|
||||||
var context_key: [KEY_LEN]u8 = undefined;
|
var context_key: [KEY_LEN]u8 = undefined;
|
||||||
context_hasher.final(context_key[0..]);
|
context_hasher.final(context_key[0..]);
|
||||||
var context_key_words: [8]u32 = undefined;
|
const context_key_words = wordsFromLittleEndianBytes(8, context_key);
|
||||||
wordsFromLittleEndianBytes(context_key_words[0..], context_key[0..]);
|
|
||||||
return Blake3.init_internal(context_key_words, DERIVE_KEY_MATERIAL);
|
return Blake3.init_internal(context_key_words, DERIVE_KEY_MATERIAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user