Make poly1305 faster

This commit is contained in:
Frank Denis 2020-08-21 13:56:11 +02:00
parent 243b5c7a88
commit c9218f1719

View File

@ -3,224 +3,193 @@
// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
// The MIT license requires this copyright notice to be included in all copies
// and substantial portions of the software.
// Translated from monocypher which is licensed under CC-0/BSD-3.
//
// https://monocypher.org/
const std = @import("../std.zig");
const builtin = std.builtin;
const Endian = builtin.Endian;
const readIntLittle = std.mem.readIntLittle;
const writeIntLittle = std.mem.writeIntLittle;
const std = @import("std");
const mem = std.mem;
pub const Poly1305 = struct {
const Self = @This();
pub const block_size: usize = 16;
pub const mac_length = 16;
pub const minimum_key_length = 32;
// constant multiplier (from the secret key)
r: [4]u32,
r: [3]u64,
// accumulated hash
h: [5]u32,
// chunk of the message
c: [5]u32,
h: [3]u64 = [_]u64{ 0, 0, 0 },
// random number added at the end (from the secret key)
pad: [4]u32,
// How many bytes are there in the chunk.
c_idx: usize,
pad: [2]u64,
// how many bytes are waiting to be processed in a partial block
leftover: usize = 0,
// partial block buffer
buf: [block_size]u8 align(16) = undefined,
fn secureZero(self: *Self) void {
std.mem.secureZero(u8, @ptrCast([*]u8, self)[0..@sizeOf(Poly1305)]);
// Build a MAC state from a one-time `key`.
// - key.len must be at least `minimum_key_length` (asserted).
// - key[0..16] is read as two little-endian u64 words, split into the three
//   limbs of `r`, and masked with the clamping constants below.
// - key[16..32] is stored unchanged as the two little-endian words of `pad`.
// The remaining fields (`h`, `leftover`, `buf`) keep their declared defaults.
pub fn init(key: []const u8) Poly1305 {
    std.debug.assert(key.len >= minimum_key_length);

    const lo = mem.readIntLittle(u64, key[0..8]);
    const hi = mem.readIntLittle(u64, key[8..16]);

    // split the 128-bit value hi:lo into limbs and clamp each one
    const r0 = lo & 0xffc0fffffff;
    const r1 = ((lo >> 44) | (hi << 20)) & 0xfffffc0ffff;
    const r2 = (hi >> 24) & 0x00ffffffc0f;

    const pad0 = mem.readIntLittle(u64, key[16..24]);
    const pad1 = mem.readIntLittle(u64, key[24..32]);

    return Poly1305{
        .r = [_]u64{ r0, r1, r2 },
        .pad = [_]u64{ pad0, pad1 },
    };
}
// Absorb every complete `block_size`-byte block of `m` into the accumulator `st.h`.
// For each block: add the block to h (as three limbs of 44, 44 and 42 bits),
// multiply by r, then partially reduce so the limbs fit their masks again.
// Any bytes after the last whole block in `m` are not read.
// `last` is comptime-known: when true, the extra high bit is NOT added to the
// block (the caller is expected to have placed the terminating 1 byte itself,
// as `final` does when padding a partial block).
fn blocks(st: *Poly1305, m: []const u8, last: comptime bool) void {
// Bit 40 of the top limb. The top limb holds the block's bits from 88 upward
// (see `t1 >> 24` below), so this is the 2^128 bit appended to a full block.
const hibit: u64 = if (last) 0 else 1 << 40;
const r0 = st.r[0];
const r1 = st.r[1];
const r2 = st.r[2];
// r1 and r2 pre-multiplied by 20 (5 << 2); used for the product terms that
// would otherwise land above the top limb (presumably folded back using
// 2^130 = 5 mod p with a 2-bit limb-width adjustment; confirm against reference).
const s1 = r1 * (5 << 2);
const s2 = r2 * (5 << 2);
var i: usize = 0;
while (i + block_size <= m.len) : (i += block_size) {
// h += m[i]
// The block is read as two little-endian u64 words and re-split into limbs.
const t0 = mem.readIntLittle(u64, m[i..][0..8]);
const t1 = mem.readIntLittle(u64, m[i + 8 ..][0..8]);
st.h[0] += t0 & 0xfffffffffff;
st.h[1] += ((t0 >> 44) | (t1 << 20)) & 0xfffffffffff;
st.h[2] += (((t1 >> 24)) & 0x3ffffffffff) | hibit;
// h *= r
// Schoolbook multiplication of the limbs, accumulated in 128-bit integers.
const d0 = @as(u128, st.h[0]) * @as(u128, r0) + @as(u128, st.h[1]) * @as(u128, s2) + @as(u128, st.h[2]) * @as(u128, s1);
var d1 = @as(u128, st.h[0]) * @as(u128, r1) + @as(u128, st.h[1]) * @as(u128, r0) + @as(u128, st.h[2]) * @as(u128, s2);
var d2 = @as(u128, st.h[0]) * @as(u128, r2) + @as(u128, st.h[1]) * @as(u128, r1) + @as(u128, st.h[2]) * @as(u128, r0);
// partial reduction
// Propagate carries d0 -> d1 -> d2, keeping 44, 44 and 42 bits per limb.
// `carry` is inferred as u128 from `d0 >> 44`; later u64 values widen into it.
var carry = d0 >> 44;
st.h[0] = @truncate(u64, d0) & 0xfffffffffff;
d1 += carry;
carry = @intCast(u64, d1 >> 44);
st.h[1] = @truncate(u64, d1) & 0xfffffffffff;
d2 += carry;
carry = @intCast(u64, d2 >> 42);
st.h[2] = @truncate(u64, d2) & 0x3ffffffffff;
// The carry out of the top limb re-enters the bottom limb multiplied by 5.
st.h[0] += @truncate(u64, carry) * 5;
carry = st.h[0] >> 44;
st.h[0] &= 0xfffffffffff;
// h[1] is left unmasked here; `final` performs the full carry.
st.h[1] += @truncate(u64, carry);
}
}
// Absorb `m` into the MAC state. May be called repeatedly; the result is the
// same as a single call on the concatenation of all inputs.
// Bytes that do not fill a whole `block_size` block are kept in `st.buf`
// (with their count in `st.leftover`) until more data arrives or `final` runs.
pub fn update(st: *Poly1305, m: []const u8) void {
    var mb = m;

    // handle leftover: top up the partially filled buffer first
    if (st.leftover > 0) {
        const want = std.math.min(block_size - st.leftover, mb.len);
        const mc = mb[0..want];
        for (mc) |x, i| {
            st.buf[st.leftover + i] = x;
        }
        mb = mb[want..];
        st.leftover += want;
        // Still short of a full block: keep buffering. `want` caps `leftover`
        // at `block_size`, so the comparison must be `<`; a `>` test can never
        // be true and would hash a half-filled buffer as a complete block.
        if (st.leftover < block_size) {
            return;
        }
        st.blocks(&st.buf, false);
        st.leftover = 0;
    }

    // process full blocks directly from the input, without copying
    if (mb.len >= block_size) {
        // round down to a multiple of block_size (a power of two)
        const want = mb.len & ~(block_size - 1);
        st.blocks(mb[0..want], false);
        mb = mb[want..];
    }

    // store leftover: fewer than block_size bytes remain, and the buffer is
    // empty at this point (leftover is 0)
    if (mb.len > 0) {
        for (mb) |x, i| {
            st.buf[st.leftover + i] = x;
        }
        st.leftover += mb.len;
    }
}
// Finish the computation and write the 16-byte tag to out[0..16].
// - out.len must be at least `mac_length` (asserted).
// - Any buffered partial block is terminated with a 1 byte, zero-padded and
//   absorbed with `last = true`.
// - The whole state (r, h, pad, leftover, buf) is wiped before returning, so
//   the context must not be used again afterwards.
pub fn final(st: *Poly1305, out: []u8) void {
    std.debug.assert(out.len >= mac_length);
    if (st.leftover > 0) {
        var i = st.leftover;
        st.buf[i] = 1;
        i += 1;
        while (i < block_size) : (i += 1) {
            st.buf[i] = 0;
        }
        st.blocks(&st.buf, true);
    }

    // fully carry h: two passes over the limbs so each ends within its mask
    // (44, 44 and 42 bits); the carry out of h[2] re-enters h[0] times 5
    var carry = st.h[1] >> 44;
    st.h[1] &= 0xfffffffffff;
    st.h[2] += carry;
    carry = st.h[2] >> 42;
    st.h[2] &= 0x3ffffffffff;
    st.h[0] += carry * 5;
    carry = st.h[0] >> 44;
    st.h[0] &= 0xfffffffffff;
    st.h[1] += carry;
    carry = st.h[1] >> 44;
    st.h[1] &= 0xfffffffffff;
    st.h[2] += carry;
    carry = st.h[2] >> 42;
    st.h[2] &= 0x3ffffffffff;
    st.h[0] += carry * 5;
    carry = st.h[0] >> 44;
    st.h[0] &= 0xfffffffffff;
    st.h[1] += carry;

    // compute h + -p (g = h + 5 - 2^130, with wrapping in the top limb)
    var g0 = st.h[0] + 5;
    carry = g0 >> 44;
    g0 &= 0xfffffffffff;
    var g1 = st.h[1] + carry;
    carry = g1 >> 44;
    g1 &= 0xfffffffffff;
    var g2 = st.h[2] + carry -% (1 << 42);

    // (hopefully) constant-time select h if h < p, or h + -p if h >= p.
    // Bit 63 of g2 is set when the subtraction wrapped (h < p): mask becomes 0
    // and h is kept. Otherwise mask is all ones and g replaces h.
    const mask = (g2 >> 63) -% 1;
    g0 &= mask;
    g1 &= mask;
    g2 &= mask;
    const nmask = ~mask;
    st.h[0] = (st.h[0] & nmask) | g0;
    st.h[1] = (st.h[1] & nmask) | g1;
    st.h[2] = (st.h[2] & nmask) | g2;

    // h = (h + pad), with pad re-split into the same limb layout as h
    const t0 = st.pad[0];
    const t1 = st.pad[1];
    st.h[0] += (t0 & 0xfffffffffff);
    carry = (st.h[0] >> 44);
    st.h[0] &= 0xfffffffffff;
    st.h[1] += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + carry;
    carry = (st.h[1] >> 44);
    st.h[1] &= 0xfffffffffff;
    st.h[2] += (((t1 >> 24)) & 0x3ffffffffff) + carry;
    st.h[2] &= 0x3ffffffffff;

    // mac = h % (2^128): repack the limbs into two little-endian u64 words
    st.h[0] |= st.h[1] << 44;
    st.h[1] = (st.h[1] >> 20) | (st.h[2] << 24);
    mem.writeIntLittle(u64, out[0..8], st.h[0]);
    mem.writeIntLittle(u64, out[8..16], st.h[1]);

    // Wipe the entire state, not just `r`: `pad` is the other half of the
    // secret key, and `h`/`buf` hold data derived from the key and message.
    mem.secureZero(u8, @ptrCast([*]u8, st)[0..@sizeOf(Poly1305)]);
}
// One-shot helper: authenticate `msg` under `key` and write the tag to `out`.
// Asserts out.len >= mac_length and key.len >= minimum_key_length, then runs
// init / update / final on a temporary state.
pub fn create(out: []u8, msg: []const u8, key: []const u8) void {
    std.debug.assert(out.len >= mac_length);
    std.debug.assert(key.len >= minimum_key_length);

    var state = Poly1305.init(key);
    state.update(msg);
    state.final(out);
}
// Create a MAC context keyed with `key`.
// - The first 32 bytes of `key` are read; no length assertion is made here
//   beyond the bounds checks on the slices.
// - key[0..16] becomes `r` (four little-endian u32 words with some bits cleared).
// - key[16..32] becomes `pad` (four little-endian u32 words, unmodified).
pub fn init(key: []const u8) Self {
    var ctx: Poly1305 = undefined;

    // the accumulator starts out at zero
    ctx.h = [_]u32{ 0, 0, 0, 0, 0 };

    // c[4] = 1 sets the bit just above the 16 message bytes held in c[0..4];
    // polyClearC leaves it untouched, so it applies to every full block
    ctx.c[4] = 1;
    polyClearC(&ctx);

    // r: the first word keeps its low 28 bits, the others also drop bits 0-1
    ctx.r[0] = readIntLittle(u32, key[0..4]) & 0x0fffffff;
    ctx.r[1] = readIntLittle(u32, key[4..8]) & 0x0ffffffc;
    ctx.r[2] = readIntLittle(u32, key[8..12]) & 0x0ffffffc;
    ctx.r[3] = readIntLittle(u32, key[12..16]) & 0x0ffffffc;

    // pad: second half of the key, taken verbatim
    ctx.pad[0] = readIntLittle(u32, key[16..20]);
    ctx.pad[1] = readIntLittle(u32, key[20..24]);
    ctx.pad[2] = readIntLittle(u32, key[24..28]);
    ctx.pad[3] = readIntLittle(u32, key[28..32]);

    return ctx;
}
// h = (h + c) * r, partially reduced modulo 2^130 - 5.
// Reads ctx.h, ctx.c and ctx.r; writes only ctx.h.
// Values are little-endian arrays of 32-bit words; the bounds below are
// written most-significant word first, in hexadecimal.
// preconditions:
// ctx.h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
// ctx.c <= 1_ffffffff_ffffffff_ffffffff_ffffffff
// ctx.r <= 0ffffffc_0ffffffc_0ffffffc_0fffffff
// Postcondition:
// ctx.h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
fn polyBlock(ctx: *Self) void {
// s = h + c, without carry propagation
// (each word is widened to u64 so the sums cannot overflow)
const s0 = @as(u64, ctx.h[0]) + ctx.c[0]; // s0 <= 1_fffffffe
const s1 = @as(u64, ctx.h[1]) + ctx.c[1]; // s1 <= 1_fffffffe
const s2 = @as(u64, ctx.h[2]) + ctx.c[2]; // s2 <= 1_fffffffe
const s3 = @as(u64, ctx.h[3]) + ctx.c[3]; // s3 <= 1_fffffffe
const s4 = @as(u64, ctx.h[4]) + ctx.c[4]; // s4 <= 5
// Local copies of r, plus rrN = (rN >> 2) * 5 for the wrap-around terms
const r0 = ctx.r[0]; // r0 <= 0fffffff
const r1 = ctx.r[1]; // r1 <= 0ffffffc
const r2 = ctx.r[2]; // r2 <= 0ffffffc
const r3 = ctx.r[3]; // r3 <= 0ffffffc
const rr0 = (r0 >> 2) * 5; // rr0 <= 13fffffb // lose 2 bits...
// r1..r3 have their two low bits clear, so (rN >> 2) + rN == (rN >> 2) * 5
const rr1 = (r1 >> 2) + r1; // rr1 <= 13fffffb // rr1 == (r1 >> 2) * 5
const rr2 = (r2 >> 2) + r2; // rr2 <= 13fffffb // rr2 == (r2 >> 2) * 5
const rr3 = (r3 >> 2) + r3; // rr3 <= 13fffffb // rr3 == (r3 >> 2) * 5
// (h + c) * r, without carry propagation
const x0 = s0 * r0 + s1 * rr3 + s2 * rr2 + s3 * rr1 + s4 * rr0; //<=97ffffe007fffff8
const x1 = s0 * r1 + s1 * r0 + s2 * rr3 + s3 * rr2 + s4 * rr1; //<=8fffffe20ffffff6
const x2 = s0 * r2 + s1 * r1 + s2 * r0 + s3 * rr3 + s4 * rr2; //<=87ffffe417fffff4
const x3 = s0 * r3 + s1 * r2 + s2 * r1 + s3 * r0 + s4 * rr3; //<=7fffffe61ffffff2
const x4 = s4 * (r0 & 3); // ...recover 2 bits //<= f
// partial reduction modulo 2^130 - 5
// _u5 is the word above x3's low half; its bits from 2 upward are folded
// back into word 0 multiplied by 5, and its low 2 bits stay in word 4.
const _u5 = @truncate(u32, x4 + (x3 >> 32)); // u5 <= 7ffffff5
const _u0 = (_u5 >> 2) * 5 + (x0 & 0xffffffff);
const _u1 = (_u0 >> 32) + (x1 & 0xffffffff) + (x0 >> 32);
const _u2 = (_u1 >> 32) + (x2 & 0xffffffff) + (x1 >> 32);
const _u3 = (_u2 >> 32) + (x3 & 0xffffffff) + (x2 >> 32);
const _u4 = (_u3 >> 32) + (_u5 & 3);
// Update the hash (keep the low 32 bits of each word; carries were
// already passed on to the next word above)
ctx.h[0] = @truncate(u32, _u0); // u0 <= 1_9ffffff0
ctx.h[1] = @truncate(u32, _u1); // u1 <= 1_97ffffe0
ctx.h[2] = @truncate(u32, _u2); // u2 <= 1_8fffffe2
ctx.h[3] = @truncate(u32, _u3); // u3 <= 1_87ffffe4
ctx.h[4] = @truncate(u32, _u4); // u4 <= 4
}
// Reset the input chunk: zero the four data words c[0..4] and rewind the byte
// counter. c[4] is deliberately left as it is.
fn polyClearC(ctx: *Self) void {
    var word: usize = 0;
    while (word < 4) : (word += 1) {
        ctx.c[word] = 0;
    }
    ctx.c_idx = 0;
}
// Append one byte to the current chunk. Byte number c_idx goes into word
// c_idx / 4 at bit offset (c_idx % 4) * 8, i.e. little-endian packing.
// The byte is OR-ed in, so the target word must have been cleared beforehand.
fn polyTakeInput(ctx: *Self, input: u8) void {
    const bit_offset = (ctx.c_idx & 3) * 8;
    ctx.c[ctx.c_idx >> 2] |= std.math.shl(u32, input, bit_offset);
    ctx.c_idx += 1;
}
// Byte-at-a-time input path: push each byte of `msg` into the chunk and, each
// time the chunk reaches 16 bytes, hash it and start a fresh one.
fn polyUpdate(ctx: *Self, msg: []const u8) void {
    var pos: usize = 0;
    while (pos < msg.len) : (pos += 1) {
        polyTakeInput(ctx, msg[pos]);
        if (ctx.c_idx != 16) continue;
        polyBlock(ctx);
        polyClearC(ctx);
    }
}
// Distance from `x` up to the next multiple of `block_size` (0 if `x` is
// already a multiple). Computed as (-x) mod block_size with a bit mask, so the
// result is only meaningful when `block_size` is a power of two.
fn alignTo(x: usize, block_size: usize) usize {
    const mask = block_size - 1;
    const negated = @as(usize, 0) -% x;
    return negated & mask;
}
// Feed `msg` into the MAC context. Can be called repeatedly.
// Works in three phases: finish the chunk already in progress byte by byte,
// hash whole 16-byte blocks straight from `msg`, then buffer the tail.
pub fn update(ctx: *Self, msg: []const u8) void {
    // phase 1: bring c_idx back to a block boundary (or run out of input)
    const head_len = std.math.min(alignTo(ctx.c_idx, 16), msg.len);
    polyUpdate(ctx, msg[0..head_len]);
    const rest = msg[head_len..];

    // phase 2: whole blocks, loaded as four little-endian words each
    const full_len = (rest.len >> 4) << 4;
    var off: usize = 0;
    while (off < full_len) : (off += 16) {
        const block = rest[off..];
        ctx.c[0] = readIntLittle(u32, block[0..4]);
        ctx.c[1] = readIntLittle(u32, block[4..8]);
        ctx.c[2] = readIntLittle(u32, block[8..12]);
        ctx.c[3] = readIntLittle(u32, block[12..16]);
        polyBlock(ctx);
    }
    // the direct loads above left stale words in c; clear them before buffering
    if (full_len > 0) {
        polyClearC(ctx);
    }

    // phase 3: fewer than 16 bytes remain
    polyUpdate(ctx, rest[full_len..]);
}
// Finalize the MAC and write the 16-byte tag to out[0..16].
// Hashes any buffered partial block, conditionally subtracts 2^130 - 5, adds
// `pad`, stores the low 128 bits little-endian, then wipes the context via
// `secureZero`.
pub fn final(ctx: *Self, out: []u8) void {
// Process the last block (if any)
if (ctx.c_idx != 0) {
// move the final 1 according to remaining input length
// (We may add less than 2^130 to the last input block)
// c[4] is cleared and a single 1 byte is appended right after the data.
ctx.c[4] = 0;
polyTakeInput(ctx, 1);
// one last hash update
polyBlock(ctx);
}
// check if we should subtract 2^130-5 by performing the
// corresponding carry propagation.
// (computes h + 5 word by word; only the top word _u4 is used afterwards)
const _u0 = @as(u64, 5) + ctx.h[0]; // <= 1_00000004
const _u1 = (_u0 >> 32) + ctx.h[1]; // <= 1_00000000
const _u2 = (_u1 >> 32) + ctx.h[2]; // <= 1_00000000
const _u3 = (_u2 >> 32) + ctx.h[3]; // <= 1_00000000
const _u4 = (_u3 >> 32) + ctx.h[4]; // <= 5
// u4 indicates how many times we should subtract 2^130-5 (0 or 1)
// h + pad, minus 2^130-5 if u4 exceeds 3
// (_u4 >> 2) is 0 or 1; adding 5 * that and then keeping only the low
// four words has the same effect on the output as subtracting 2^130 - 5.
const uu0 = (_u4 >> 2) * 5 + ctx.h[0] + ctx.pad[0]; // <= 2_00000003
const uu1 = (uu0 >> 32) + ctx.h[1] + ctx.pad[1]; // <= 2_00000000
const uu2 = (uu1 >> 32) + ctx.h[2] + ctx.pad[2]; // <= 2_00000000
const uu3 = (uu2 >> 32) + ctx.h[3] + ctx.pad[3]; // <= 2_00000000
// Emit the low 32 bits of each word, little-endian.
writeIntLittle(u32, out[0..4], @truncate(u32, uu0));
writeIntLittle(u32, out[4..8], @truncate(u32, uu1));
writeIntLittle(u32, out[8..12], @truncate(u32, uu2));
writeIntLittle(u32, out[12..16], @truncate(u32, uu3));
// Wipe the whole context so key material does not linger in memory.
ctx.secureZero();
// NOTE(review): the three statements below use `key` and `msg`, which are not
// parameters of this function. They look like the body of `create` spliced in
// by the diff rendering this text was taken from - confirm against the real file.
var st = Poly1305.init(key);
st.update(msg);
st.final(out);
}
};