std: Make the CRC32 calculation slightly faster

Slightly speed up the slicing-by-8 code path by replacing the
(load+shift+xor)*4 sequence with a single u32 load followed by one xor.

Before:

```
iterative:  1018 MiB/s [000000006c3b110d]
small keys:  1075 MiB/s [0035bf3dcac00000]
```

After:

```
iterative:  1114 MiB/s [000000006c3b110d]
small keys:  1324 MiB/s [0035bf3dcac00000]
```
This commit is contained in:
LemonBoy 2020-09-13 21:12:21 +02:00 committed by Andrew Kelley
parent 5e50d145d9
commit 61e9e82bdc

View File

@@ -71,10 +71,7 @@ pub fn Crc32WithPoly(comptime poly: Polynomial) type {
const p = input[i .. i + 8];
// Unrolling this way gives ~50Mb/s increase
self.crc ^= (@as(u32, p[0]) << 0);
self.crc ^= (@as(u32, p[1]) << 8);
self.crc ^= (@as(u32, p[2]) << 16);
self.crc ^= (@as(u32, p[3]) << 24);
self.crc ^= std.mem.readIntLittle(u32, p[0..4]);
self.crc =
lookup_tables[0][p[7]] ^