Merge pull request #14394 from dweiller/zstandard

Zstandard decompressor
Andrew Kelley 2023-02-21 13:59:14 -05:00 committed by GitHub
commit b52be973df
16 changed files with 6383 additions and 0 deletions

build.zig

@@ -113,8 +113,11 @@ pub fn build(b: *std.Build) !void {
".gz",
".z.0",
".z.9",
".zstd.3",
".zstd.19",
"rfc1951.txt",
"rfc1952.txt",
"rfc8478.txt",
// exclude files from lib/std/compress/deflate/testdata
".expect",
".expect-noinput",

136
lib/std/RingBuffer.zig Normal file

@@ -0,0 +1,136 @@
//! This ring buffer stores read and write indices while being able to utilise
//! the full backing slice by incrementing the indices modulo twice the slice's
//! length and reducing indices modulo the slice's length on slice access. This
//! means that whether the ring buffer is full or empty can be distinguished by
//! looking at the difference between the read and write indices without adding
//! an extra boolean flag or having to reserve a slot in the buffer.
//!
//! This ring buffer has not been implemented with thread safety in mind, and
//! therefore should not be assumed to be suitable for use cases involving
//! separate reader and writer threads.
const Allocator = @import("std").mem.Allocator;
const assert = @import("std").debug.assert;
const RingBuffer = @This();
data: []u8,
read_index: usize,
write_index: usize,
pub const Error = error{Full};
/// Allocate a new `RingBuffer`; `deinit()` should be called to free the buffer.
pub fn init(allocator: Allocator, capacity: usize) Allocator.Error!RingBuffer {
const bytes = try allocator.alloc(u8, capacity);
return RingBuffer{
.data = bytes,
.write_index = 0,
.read_index = 0,
};
}
/// Free the data backing a `RingBuffer`; must be passed the same `Allocator` as
/// `init()`.
pub fn deinit(self: *RingBuffer, allocator: Allocator) void {
allocator.free(self.data);
self.* = undefined;
}
/// Returns `index` modulo the length of the backing slice.
pub fn mask(self: RingBuffer, index: usize) usize {
return index % self.data.len;
}
/// Returns `index` modulo twice the length of the backing slice.
pub fn mask2(self: RingBuffer, index: usize) usize {
return index % (2 * self.data.len);
}
/// Write `byte` into the ring buffer. Returns `error.Full` if the ring
/// buffer is full.
pub fn write(self: *RingBuffer, byte: u8) Error!void {
if (self.isFull()) return error.Full;
self.writeAssumeCapacity(byte);
}
/// Write `byte` into the ring buffer. If the ring buffer is full, the
/// oldest byte is overwritten.
pub fn writeAssumeCapacity(self: *RingBuffer, byte: u8) void {
self.data[self.mask(self.write_index)] = byte;
self.write_index = self.mask2(self.write_index + 1);
}
/// Write `bytes` into the ring buffer. Returns `error.Full` if the ring
/// buffer does not have enough space, without writing any data.
pub fn writeSlice(self: *RingBuffer, bytes: []const u8) Error!void {
if (self.len() + bytes.len > self.data.len) return error.Full;
self.writeSliceAssumeCapacity(bytes);
}
/// Write `bytes` into the ring buffer. If there is not enough space, older
/// bytes will be overwritten.
pub fn writeSliceAssumeCapacity(self: *RingBuffer, bytes: []const u8) void {
for (bytes) |b| self.writeAssumeCapacity(b);
}
/// Consume a byte from the ring buffer and return it. Returns `null` if the
/// ring buffer is empty.
pub fn read(self: *RingBuffer) ?u8 {
if (self.isEmpty()) return null;
return self.readAssumeLength();
}
/// Consume a byte from the ring buffer and return it; asserts that the buffer
/// is not empty.
pub fn readAssumeLength(self: *RingBuffer) u8 {
assert(!self.isEmpty());
const byte = self.data[self.mask(self.read_index)];
self.read_index = self.mask2(self.read_index + 1);
return byte;
}
/// Returns `true` if the ring buffer is empty and `false` otherwise.
pub fn isEmpty(self: RingBuffer) bool {
return self.write_index == self.read_index;
}
/// Returns `true` if the ring buffer is full and `false` otherwise.
pub fn isFull(self: RingBuffer) bool {
return self.mask2(self.write_index + self.data.len) == self.read_index;
}
/// Returns the length of the data available for reading.
pub fn len(self: RingBuffer) usize {
const wrap_offset = 2 * self.data.len * @boolToInt(self.write_index < self.read_index);
const adjusted_write_index = self.write_index + wrap_offset;
return adjusted_write_index - self.read_index;
}
/// A `Slice` represents a region of a ring buffer. The region is split into two
/// sections as the ring buffer data will not be contiguous if the desired
/// region wraps to the start of the backing slice.
pub const Slice = struct {
first: []u8,
second: []u8,
};
/// Returns a `Slice` for the region of the ring buffer starting at
/// `self.mask(start_unmasked)` with the specified length.
pub fn sliceAt(self: RingBuffer, start_unmasked: usize, length: usize) Slice {
assert(length <= self.data.len);
const slice1_start = self.mask(start_unmasked);
const slice1_end = @min(self.data.len, slice1_start + length);
const slice1 = self.data[slice1_start..slice1_end];
const slice2 = self.data[0 .. length - slice1.len];
return Slice{
.first = slice1,
.second = slice2,
};
}
/// Returns a `Slice` for the last `length` bytes written to the ring buffer.
/// Does not check that any bytes have been written into the region.
pub fn sliceLast(self: RingBuffer, length: usize) Slice {
return self.sliceAt(self.write_index + self.data.len - length, length);
}
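As a usage sketch (a hypothetical test, not part of this commit), the doubled-index scheme can be exercised directly; equal unmasked indices mean empty, while indices differing by the capacity mean full:
test "RingBuffer full versus empty (illustrative)" {
    const std = @import("std");
    var rb = try RingBuffer.init(std.testing.allocator, 2);
    defer rb.deinit(std.testing.allocator);
    try std.testing.expect(rb.isEmpty());
    try rb.writeSlice(&.{ 1, 2 });
    // Unmasked indices are now read_index == 0 and write_index == 2, so
    // isFull() sees mask2(2 + 2) == 0 == read_index and reports full.
    try std.testing.expect(rb.isFull());
    try std.testing.expectEqual(@as(?u8, 1), rb.read());
    try std.testing.expectEqual(@as(?u8, 2), rb.read());
    try std.testing.expect(rb.isEmpty());
}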

lib/std/compress.zig

@@ -6,6 +6,7 @@ pub const lzma = @import("compress/lzma.zig");
pub const lzma2 = @import("compress/lzma2.zig");
pub const xz = @import("compress/xz.zig");
pub const zlib = @import("compress/zlib.zig");
pub const zstd = @import("compress/zstandard.zig");
pub fn HashedReader(
comptime ReaderType: anytype,
@@ -44,4 +45,5 @@ test {
_ = lzma2;
_ = xz;
_ = zlib;
_ = zstd;
}

3027
lib/std/compress/testdata/rfc8478.txt vendored Normal file

File diff suppressed because it is too large.

lib/std/compress/testdata/rfc8478.txt.zst.19 vendored Normal file (binary file not shown)

lib/std/compress/testdata/rfc8478.txt.zst.3 vendored Normal file (binary file not shown)

286
lib/std/compress/zstandard.zig Normal file

@@ -0,0 +1,286 @@
const std = @import("std");
const Allocator = std.mem.Allocator;
const RingBuffer = std.RingBuffer;
const types = @import("zstandard/types.zig");
pub const frame = types.frame;
pub const compressed_block = types.compressed_block;
pub const decompress = @import("zstandard/decompress.zig");
pub const DecompressStreamOptions = struct {
verify_checksum: bool = true,
window_size_max: usize = 1 << 23, // 8MiB default maximum window size
};
pub fn DecompressStream(
comptime ReaderType: type,
comptime options: DecompressStreamOptions,
) type {
return struct {
const Self = @This();
allocator: Allocator,
source: std.io.CountingReader(ReaderType),
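// Decoder state: .NewFrame means the next source bytes begin a new frame
// header, .InFrame means blocks from the current frame remain to be decoded,
// and .LastBlock means the frame's final block has been decoded and only
// buffered bytes remain to be drained.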
state: enum { NewFrame, InFrame, LastBlock },
decode_state: decompress.block.DecodeState,
frame_context: decompress.FrameContext,
buffer: RingBuffer,
literal_fse_buffer: []types.compressed_block.Table.Fse,
match_fse_buffer: []types.compressed_block.Table.Fse,
offset_fse_buffer: []types.compressed_block.Table.Fse,
literals_buffer: []u8,
sequence_buffer: []u8,
checksum: if (options.verify_checksum) ?u32 else void,
current_frame_decompressed_size: usize,
pub const Error = ReaderType.Error || error{
ChecksumFailure,
DictionaryIdFlagUnsupported,
MalformedBlock,
MalformedFrame,
OutOfMemory,
};
pub const Reader = std.io.Reader(*Self, Error, read);
pub fn init(allocator: Allocator, source: ReaderType) Self {
return Self{
.allocator = allocator,
.source = std.io.countingReader(source),
.state = .NewFrame,
.decode_state = undefined,
.frame_context = undefined,
.buffer = undefined,
.literal_fse_buffer = undefined,
.match_fse_buffer = undefined,
.offset_fse_buffer = undefined,
.literals_buffer = undefined,
.sequence_buffer = undefined,
.checksum = undefined,
.current_frame_decompressed_size = undefined,
};
}
fn frameInit(self: *Self) !void {
const source_reader = self.source.reader();
switch (try decompress.decodeFrameHeader(source_reader)) {
.skippable => |header| {
try source_reader.skipBytes(header.frame_size, .{});
self.state = .NewFrame;
},
.zstandard => |header| {
const frame_context = context: {
break :context try decompress.FrameContext.init(
header,
options.window_size_max,
options.verify_checksum,
);
};
const literal_fse_buffer = try self.allocator.alloc(
types.compressed_block.Table.Fse,
types.compressed_block.table_size_max.literal,
);
errdefer self.allocator.free(literal_fse_buffer);
const match_fse_buffer = try self.allocator.alloc(
types.compressed_block.Table.Fse,
types.compressed_block.table_size_max.match,
);
errdefer self.allocator.free(match_fse_buffer);
const offset_fse_buffer = try self.allocator.alloc(
types.compressed_block.Table.Fse,
types.compressed_block.table_size_max.offset,
);
errdefer self.allocator.free(offset_fse_buffer);
const decode_state = decompress.block.DecodeState.init(
literal_fse_buffer,
match_fse_buffer,
offset_fse_buffer,
);
const buffer = try RingBuffer.init(self.allocator, frame_context.window_size);
const literals_data = try self.allocator.alloc(u8, options.window_size_max);
errdefer self.allocator.free(literals_data);
const sequence_data = try self.allocator.alloc(u8, options.window_size_max);
errdefer self.allocator.free(sequence_data);
self.literal_fse_buffer = literal_fse_buffer;
self.match_fse_buffer = match_fse_buffer;
self.offset_fse_buffer = offset_fse_buffer;
self.literals_buffer = literals_data;
self.sequence_buffer = sequence_data;
self.buffer = buffer;
self.decode_state = decode_state;
self.frame_context = frame_context;
self.checksum = if (options.verify_checksum) null else {};
self.current_frame_decompressed_size = 0;
self.state = .InFrame;
},
}
}
pub fn deinit(self: *Self) void {
if (self.state == .NewFrame) return;
self.allocator.free(self.decode_state.literal_fse_buffer);
self.allocator.free(self.decode_state.match_fse_buffer);
self.allocator.free(self.decode_state.offset_fse_buffer);
self.allocator.free(self.literals_buffer);
self.allocator.free(self.sequence_buffer);
self.buffer.deinit(self.allocator);
}
pub fn reader(self: *Self) Reader {
return .{ .context = self };
}
pub fn read(self: *Self, buffer: []u8) Error!usize {
if (buffer.len == 0) return 0;
var size: usize = 0;
while (size == 0) {
while (self.state == .NewFrame) {
const initial_count = self.source.bytes_read;
self.frameInit() catch |err| switch (err) {
error.DictionaryIdFlagUnsupported => return error.DictionaryIdFlagUnsupported,
error.EndOfStream => return if (self.source.bytes_read == initial_count)
0
else
error.MalformedFrame,
error.OutOfMemory => return error.OutOfMemory,
else => return error.MalformedFrame,
};
}
size = try self.readInner(buffer);
}
return size;
}
fn readInner(self: *Self, buffer: []u8) Error!usize {
std.debug.assert(self.state != .NewFrame);
const source_reader = self.source.reader();
while (self.buffer.isEmpty() and self.state != .LastBlock) {
const header_bytes = source_reader.readBytesNoEof(3) catch
return error.MalformedFrame;
const block_header = decompress.block.decodeBlockHeader(&header_bytes);
decompress.block.decodeBlockReader(
&self.buffer,
source_reader,
block_header,
&self.decode_state,
self.frame_context.block_size_max,
self.literals_buffer,
self.sequence_buffer,
) catch
return error.MalformedBlock;
if (self.frame_context.content_size) |size| {
if (self.current_frame_decompressed_size > size) return error.MalformedFrame;
}
const size = self.buffer.len();
self.current_frame_decompressed_size += size;
if (self.frame_context.hasher_opt) |*hasher| {
if (size > 0) {
const written_slice = self.buffer.sliceLast(size);
hasher.update(written_slice.first);
hasher.update(written_slice.second);
}
}
if (block_header.last_block) {
self.state = .LastBlock;
if (self.frame_context.has_checksum) {
const checksum = source_reader.readIntLittle(u32) catch
return error.MalformedFrame;
if (comptime options.verify_checksum) {
if (self.frame_context.hasher_opt) |*hasher| {
if (checksum != decompress.computeChecksum(hasher))
return error.ChecksumFailure;
}
}
}
if (self.frame_context.content_size) |content_size| {
if (content_size != self.current_frame_decompressed_size) {
return error.MalformedFrame;
}
}
}
}
const size = @min(self.buffer.len(), buffer.len);
for (0..size) |i| {
buffer[i] = self.buffer.read().?;
}
if (self.state == .LastBlock and self.buffer.len() == 0) {
self.state = .NewFrame;
self.allocator.free(self.literal_fse_buffer);
self.allocator.free(self.match_fse_buffer);
self.allocator.free(self.offset_fse_buffer);
self.allocator.free(self.literals_buffer);
self.allocator.free(self.sequence_buffer);
self.buffer.deinit(self.allocator);
}
return size;
}
};
}
pub fn decompressStreamOptions(
allocator: Allocator,
reader: anytype,
comptime options: DecompressStreamOptions,
) DecompressStream(@TypeOf(reader), options) {
return DecompressStream(@TypeOf(reader), options).init(allocator, reader);
}
pub fn decompressStream(
allocator: Allocator,
reader: anytype,
) DecompressStream(@TypeOf(reader), .{}) {
return DecompressStream(@TypeOf(reader), .{}).init(allocator, reader);
}
fn testDecompress(data: []const u8) ![]u8 {
var in_stream = std.io.fixedBufferStream(data);
var zstd_stream = decompressStream(std.testing.allocator, in_stream.reader());
defer zstd_stream.deinit();
const result = zstd_stream.reader().readAllAlloc(std.testing.allocator, std.math.maxInt(usize));
return result;
}
fn testReader(data: []const u8, comptime expected: []const u8) !void {
const buf = try testDecompress(data);
defer std.testing.allocator.free(buf);
try std.testing.expectEqualSlices(u8, expected, buf);
}
test "zstandard decompression" {
const uncompressed = @embedFile("testdata/rfc8478.txt");
const compressed3 = @embedFile("testdata/rfc8478.txt.zst.3");
const compressed19 = @embedFile("testdata/rfc8478.txt.zst.19");
var buffer = try std.testing.allocator.alloc(u8, uncompressed.len);
defer std.testing.allocator.free(buffer);
const res3 = try decompress.decode(buffer, compressed3, true);
try std.testing.expectEqual(uncompressed.len, res3);
try std.testing.expectEqualSlices(u8, uncompressed, buffer);
const res19 = try decompress.decode(buffer, compressed19, true);
try std.testing.expectEqual(uncompressed.len, res19);
try std.testing.expectEqualSlices(u8, uncompressed, buffer);
try testReader(compressed3, uncompressed);
try testReader(compressed19, uncompressed);
}
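A streaming-usage sketch (a hypothetical test, not part of this commit; it assumes the test frames above declare windows no larger than 1MiB) showing the knobs in `DecompressStreamOptions`:
test "decompressStreamOptions usage (illustrative)" {
    const compressed = @embedFile("testdata/rfc8478.txt.zst.3");
    var in_stream = std.io.fixedBufferStream(compressed);
    var zstd_stream = decompressStreamOptions(std.testing.allocator, in_stream.reader(), .{
        .verify_checksum = false, // skip the XxHash64 work entirely
        .window_size_max = 1 << 20, // frames declaring a larger window fail with error.MalformedFrame
    });
    defer zstd_stream.deinit();
    const result = try zstd_stream.reader().readAllAlloc(std.testing.allocator, std.math.maxInt(usize));
    defer std.testing.allocator.free(result);
    try std.testing.expectEqualSlices(u8, @embedFile("testdata/rfc8478.txt"), result);
}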

lib/std/compress/zstandard/decode/block.zig Normal file (file diff suppressed because it is too large)

153
lib/std/compress/zstandard/decode/fse.zig Normal file

@@ -0,0 +1,153 @@
const std = @import("std");
const assert = std.debug.assert;
const types = @import("../types.zig");
const Table = types.compressed_block.Table;
pub fn decodeFseTable(
bit_reader: anytype,
expected_symbol_count: usize,
max_accuracy_log: u4,
entries: []Table.Fse,
) !usize {
const accuracy_log_biased = try bit_reader.readBitsNoEof(u4, 4);
if (accuracy_log_biased > max_accuracy_log -| 5) return error.MalformedAccuracyLog;
const accuracy_log = accuracy_log_biased + 5;
var values: [256]u16 = undefined;
var value_count: usize = 0;
const total_probability = @as(u16, 1) << accuracy_log;
var accumulated_probability: u16 = 0;
while (accumulated_probability < total_probability) {
// WARNING: The RFC is poorly worded and suggests std.math.log2_int_ceil is correct here,
// but when (remaining probabilities + 1) is a power of two, max_bits needs to be one greater.
const max_bits = std.math.log2_int(u16, total_probability - accumulated_probability + 1) + 1;
const small = try bit_reader.readBitsNoEof(u16, max_bits - 1);
const cutoff = (@as(u16, 1) << max_bits) - 1 - (total_probability - accumulated_probability + 1);
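// Worked example (illustrative): with accuracy_log == 6 the total
// probability is 64, so on the first iteration the remaining range is
// 64 - 0 + 1 = 65, max_bits = log2(65) + 1 = 7, and
// cutoff = (1 << 7) - 1 - 65 = 62; values below 62 fit in 6 bits,
// otherwise the seventh bit is read below.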
const value = if (small < cutoff)
small
else value: {
const value_read = small + (try bit_reader.readBitsNoEof(u16, 1) << (max_bits - 1));
break :value if (value_read < @as(u16, 1) << (max_bits - 1))
value_read
else
value_read - cutoff;
};
accumulated_probability += if (value != 0) value - 1 else 1;
values[value_count] = value;
value_count += 1;
if (value == 1) {
while (true) {
const repeat_flag = try bit_reader.readBitsNoEof(u2, 2);
if (repeat_flag + value_count > 256) return error.MalformedFseTable;
for (0..repeat_flag) |_| {
values[value_count] = 1;
value_count += 1;
}
if (repeat_flag < 3) break;
}
}
if (value_count == 256) break;
}
bit_reader.alignToByte();
if (value_count < 2) return error.MalformedFseTable;
if (accumulated_probability != total_probability) return error.MalformedFseTable;
if (value_count > expected_symbol_count) return error.MalformedFseTable;
const table_size = total_probability;
try buildFseTable(values[0..value_count], entries[0..table_size]);
return table_size;
}
fn buildFseTable(values: []const u16, entries: []Table.Fse) !void {
const total_probability = @intCast(u16, entries.len);
const accuracy_log = std.math.log2_int(u16, total_probability);
assert(total_probability <= 1 << 9);
var less_than_one_count: usize = 0;
for (values, 0..) |value, i| {
if (value == 0) {
entries[entries.len - 1 - less_than_one_count] = Table.Fse{
.symbol = @intCast(u8, i),
.baseline = 0,
.bits = accuracy_log,
};
less_than_one_count += 1;
}
}
var position: usize = 0;
var temp_states: [1 << 9]u16 = undefined;
for (values, 0..) |value, symbol| {
if (value == 0 or value == 1) continue;
const probability = value - 1;
const state_share_dividend = std.math.ceilPowerOfTwo(u16, probability) catch
return error.MalformedFseTable;
const share_size = @divExact(total_probability, state_share_dividend);
const double_state_count = state_share_dividend - probability;
const single_state_count = probability - double_state_count;
const share_size_log = std.math.log2_int(u16, share_size);
for (0..probability) |i| {
temp_states[i] = @intCast(u16, position);
position += (entries.len >> 1) + (entries.len >> 3) + 3;
position &= entries.len - 1;
while (position >= entries.len - less_than_one_count) {
position += (entries.len >> 1) + (entries.len >> 3) + 3;
position &= entries.len - 1;
}
}
std.sort.sort(u16, temp_states[0..probability], {}, std.sort.asc(u16));
for (0..probability) |i| {
entries[temp_states[i]] = if (i < double_state_count) Table.Fse{
.symbol = @intCast(u8, symbol),
.bits = share_size_log + 1,
.baseline = single_state_count * share_size + @intCast(u16, i) * 2 * share_size,
} else Table.Fse{
.symbol = @intCast(u8, symbol),
.bits = share_size_log,
.baseline = (@intCast(u16, i) - double_state_count) * share_size,
};
}
}
}
test buildFseTable {
const literals_length_default_values = [36]u16{
5, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 2, 2, 2, 2, 2,
0, 0, 0, 0,
};
const match_lengths_default_values = [53]u16{
2, 5, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0,
0, 0, 0, 0, 0,
};
const offset_codes_default_values = [29]u16{
2, 2, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0,
};
var entries: [64]Table.Fse = undefined;
try buildFseTable(&literals_length_default_values, &entries);
try std.testing.expectEqualSlices(Table.Fse, types.compressed_block.predefined_literal_fse_table.fse, &entries);
try buildFseTable(&match_lengths_default_values, &entries);
try std.testing.expectEqualSlices(Table.Fse, types.compressed_block.predefined_match_fse_table.fse, &entries);
try buildFseTable(&offset_codes_default_values, entries[0..32]);
try std.testing.expectEqualSlices(Table.Fse, types.compressed_block.predefined_offset_fse_table.fse, entries[0..32]);
}

234
lib/std/compress/zstandard/decode/huffman.zig Normal file

@@ -0,0 +1,234 @@
const std = @import("std");
const types = @import("../types.zig");
const LiteralsSection = types.compressed_block.LiteralsSection;
const Table = types.compressed_block.Table;
const readers = @import("../readers.zig");
const decodeFseTable = @import("fse.zig").decodeFseTable;
pub const Error = error{
MalformedHuffmanTree,
MalformedFseTable,
MalformedAccuracyLog,
EndOfStream,
};
fn decodeFseHuffmanTree(
source: anytype,
compressed_size: usize,
buffer: []u8,
weights: *[256]u4,
) !usize {
var stream = std.io.limitedReader(source, compressed_size);
var bit_reader = readers.bitReader(stream.reader());
var entries: [1 << 6]Table.Fse = undefined;
const table_size = decodeFseTable(&bit_reader, 256, 6, &entries) catch |err| switch (err) {
error.MalformedAccuracyLog, error.MalformedFseTable => |e| return e,
error.EndOfStream => return error.MalformedFseTable,
else => |e| return e,
};
const accuracy_log = std.math.log2_int_ceil(usize, table_size);
const amount = try stream.reader().readAll(buffer);
var huff_bits: readers.ReverseBitReader = undefined;
huff_bits.init(buffer[0..amount]) catch return error.MalformedHuffmanTree;
return assignWeights(&huff_bits, accuracy_log, &entries, weights);
}
fn decodeFseHuffmanTreeSlice(src: []const u8, compressed_size: usize, weights: *[256]u4) !usize {
if (src.len < compressed_size) return error.MalformedHuffmanTree;
var stream = std.io.fixedBufferStream(src[0..compressed_size]);
var counting_reader = std.io.countingReader(stream.reader());
var bit_reader = readers.bitReader(counting_reader.reader());
var entries: [1 << 6]Table.Fse = undefined;
const table_size = decodeFseTable(&bit_reader, 256, 6, &entries) catch |err| switch (err) {
error.MalformedAccuracyLog, error.MalformedFseTable => |e| return e,
error.EndOfStream => return error.MalformedFseTable,
};
const accuracy_log = std.math.log2_int_ceil(usize, table_size);
const start_index = std.math.cast(usize, counting_reader.bytes_read) orelse
return error.MalformedHuffmanTree;
var huff_data = src[start_index..compressed_size];
var huff_bits: readers.ReverseBitReader = undefined;
huff_bits.init(huff_data) catch return error.MalformedHuffmanTree;
return assignWeights(&huff_bits, accuracy_log, &entries, weights);
}
fn assignWeights(
huff_bits: *readers.ReverseBitReader,
accuracy_log: usize,
entries: *[1 << 6]Table.Fse,
weights: *[256]u4,
) !usize {
var i: usize = 0;
var even_state: u32 = huff_bits.readBitsNoEof(u32, accuracy_log) catch return error.MalformedHuffmanTree;
var odd_state: u32 = huff_bits.readBitsNoEof(u32, accuracy_log) catch return error.MalformedHuffmanTree;
while (i < 254) {
const even_data = entries[even_state];
var read_bits: usize = 0;
const even_bits = huff_bits.readBits(u32, even_data.bits, &read_bits) catch unreachable;
weights[i] = std.math.cast(u4, even_data.symbol) orelse return error.MalformedHuffmanTree;
i += 1;
if (read_bits < even_data.bits) {
weights[i] = std.math.cast(u4, entries[odd_state].symbol) orelse return error.MalformedHuffmanTree;
i += 1;
break;
}
even_state = even_data.baseline + even_bits;
read_bits = 0;
const odd_data = entries[odd_state];
const odd_bits = huff_bits.readBits(u32, odd_data.bits, &read_bits) catch unreachable;
weights[i] = std.math.cast(u4, odd_data.symbol) orelse return error.MalformedHuffmanTree;
i += 1;
if (read_bits < odd_data.bits) {
if (i == 255) return error.MalformedHuffmanTree;
weights[i] = std.math.cast(u4, entries[even_state].symbol) orelse return error.MalformedHuffmanTree;
i += 1;
break;
}
odd_state = odd_data.baseline + odd_bits;
} else return error.MalformedHuffmanTree;
if (!huff_bits.isEmpty()) {
return error.MalformedHuffmanTree;
}
return i + 1; // stream contains all but the last symbol
}
fn decodeDirectHuffmanTree(source: anytype, encoded_symbol_count: usize, weights: *[256]u4) !usize {
const weights_byte_count = (encoded_symbol_count + 1) / 2;
for (0..weights_byte_count) |i| {
const byte = try source.readByte();
weights[2 * i] = @intCast(u4, byte >> 4);
weights[2 * i + 1] = @intCast(u4, byte & 0xF);
}
return encoded_symbol_count + 1;
}
fn assignSymbols(weight_sorted_prefixed_symbols: []LiteralsSection.HuffmanTree.PrefixedSymbol, weights: [256]u4) usize {
for (0..weight_sorted_prefixed_symbols.len) |i| {
weight_sorted_prefixed_symbols[i] = .{
.symbol = @intCast(u8, i),
.weight = undefined,
.prefix = undefined,
};
}
std.sort.sort(
LiteralsSection.HuffmanTree.PrefixedSymbol,
weight_sorted_prefixed_symbols,
weights,
lessThanByWeight,
);
var prefix: u16 = 0;
var prefixed_symbol_count: usize = 0;
var sorted_index: usize = 0;
const symbol_count = weight_sorted_prefixed_symbols.len;
while (sorted_index < symbol_count) {
var symbol = weight_sorted_prefixed_symbols[sorted_index].symbol;
const weight = weights[symbol];
if (weight == 0) {
sorted_index += 1;
continue;
}
while (sorted_index < symbol_count) : ({
sorted_index += 1;
prefixed_symbol_count += 1;
prefix += 1;
}) {
symbol = weight_sorted_prefixed_symbols[sorted_index].symbol;
if (weights[symbol] != weight) {
prefix = ((prefix - 1) >> (weights[symbol] - weight)) + 1;
break;
}
weight_sorted_prefixed_symbols[prefixed_symbol_count].symbol = symbol;
weight_sorted_prefixed_symbols[prefixed_symbol_count].prefix = prefix;
weight_sorted_prefixed_symbols[prefixed_symbol_count].weight = weight;
}
}
return prefixed_symbol_count;
}
fn buildHuffmanTree(weights: *[256]u4, symbol_count: usize) error{MalformedHuffmanTree}!LiteralsSection.HuffmanTree {
var weight_power_sum_big: u32 = 0;
for (weights[0 .. symbol_count - 1]) |value| {
weight_power_sum_big += (@as(u16, 1) << value) >> 1;
}
if (weight_power_sum_big >= 1 << 11) return error.MalformedHuffmanTree;
const weight_power_sum = @intCast(u16, weight_power_sum_big);
// advance to next power of two (even if weight_power_sum is a power of 2)
// TODO: is it valid to have weight_power_sum == 0?
const max_number_of_bits = if (weight_power_sum == 0) 1 else std.math.log2_int(u16, weight_power_sum) + 1;
const next_power_of_two = @as(u16, 1) << max_number_of_bits;
weights[symbol_count - 1] = std.math.log2_int(u16, next_power_of_two - weight_power_sum) + 1;
var weight_sorted_prefixed_symbols: [256]LiteralsSection.HuffmanTree.PrefixedSymbol = undefined;
const prefixed_symbol_count = assignSymbols(weight_sorted_prefixed_symbols[0..symbol_count], weights.*);
const tree = LiteralsSection.HuffmanTree{
.max_bit_count = max_number_of_bits,
.symbol_count_minus_one = @intCast(u8, prefixed_symbol_count - 1),
.nodes = weight_sorted_prefixed_symbols,
};
return tree;
}
pub fn decodeHuffmanTree(
source: anytype,
buffer: []u8,
) (@TypeOf(source).Error || Error)!LiteralsSection.HuffmanTree {
const header = try source.readByte();
var weights: [256]u4 = undefined;
const symbol_count = if (header < 128)
// FSE compressed weights
try decodeFseHuffmanTree(source, header, buffer, &weights)
else
try decodeDirectHuffmanTree(source, header - 127, &weights);
return buildHuffmanTree(&weights, symbol_count);
}
pub fn decodeHuffmanTreeSlice(
src: []const u8,
consumed_count: *usize,
) Error!LiteralsSection.HuffmanTree {
if (src.len == 0) return error.MalformedHuffmanTree;
const header = src[0];
var bytes_read: usize = 1;
var weights: [256]u4 = undefined;
const symbol_count = if (header < 128) count: {
// FSE compressed weights
bytes_read += header;
break :count try decodeFseHuffmanTreeSlice(src[1..], header, &weights);
} else count: {
var fbs = std.io.fixedBufferStream(src[1..]);
defer bytes_read += fbs.pos;
break :count try decodeDirectHuffmanTree(fbs.reader(), header - 127, &weights);
};
consumed_count.* += bytes_read;
return buildHuffmanTree(&weights, symbol_count);
}
fn lessThanByWeight(
weights: [256]u4,
lhs: LiteralsSection.HuffmanTree.PrefixedSymbol,
rhs: LiteralsSection.HuffmanTree.PrefixedSymbol,
) bool {
// NOTE: this function relies on the use of a stable sorting algorithm,
// otherwise a special case of if (weights[lhs] == weights[rhs]) return lhs < rhs;
// should be added
return weights[lhs.symbol] < weights[rhs.symbol];
}
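A decoding sketch (a hypothetical test, not part of this commit) for the direct-weights path, with bytes hand-built per the `header - 127` rule above:
test "decodeHuffmanTreeSlice with direct weights (illustrative)" {
    // Header 0x82 = 130 >= 128, so 130 - 127 = 3 weights are read directly
    // as nibbles: 0x11, 0x10 gives weights { 1, 1, 1 }; buildHuffmanTree
    // then derives the 4th symbol's weight (1) to complete the tree.
    const src = [_]u8{ 0x82, 0x11, 0x10 };
    var consumed: usize = 0;
    const tree = try decodeHuffmanTreeSlice(&src, &consumed);
    try std.testing.expectEqual(@as(usize, 3), consumed);
    try std.testing.expectEqual(@as(u4, 2), tree.max_bit_count);
    try std.testing.expectEqual(@as(u8, 3), tree.symbol_count_minus_one);
}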

636
lib/std/compress/zstandard/decompress.zig Normal file

@@ -0,0 +1,636 @@
const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const RingBuffer = std.RingBuffer;
const types = @import("types.zig");
const frame = types.frame;
const LiteralsSection = types.compressed_block.LiteralsSection;
const SequencesSection = types.compressed_block.SequencesSection;
const SkippableHeader = types.frame.Skippable.Header;
const ZstandardHeader = types.frame.Zstandard.Header;
const Table = types.compressed_block.Table;
pub const block = @import("decode/block.zig");
const readers = @import("readers.zig");
const readInt = std.mem.readIntLittle;
const readIntSlice = std.mem.readIntSliceLittle;
/// Returns `true` if `magic` is a valid magic number for a skippable frame
pub fn isSkippableMagic(magic: u32) bool {
return frame.Skippable.magic_number_min <= magic and magic <= frame.Skippable.magic_number_max;
}
/// Returns the kind of frame at the beginning of `source`.
///
/// Errors returned:
/// - `error.BadMagic` if `source` begins with bytes not equal to the
/// Zstandard frame magic number, or outside the range of magic numbers for
/// skippable frames.
/// - `error.EndOfStream` if `source` contains fewer than 4 bytes
pub fn decodeFrameType(source: anytype) error{ BadMagic, EndOfStream }!frame.Kind {
const magic = try source.readIntLittle(u32);
return frameType(magic);
}
/// Returns the kind of frame associated to `magic`.
///
/// Errors returned:
/// - `error.BadMagic` if `magic` is not a valid magic number.
pub fn frameType(magic: u32) error{BadMagic}!frame.Kind {
return if (magic == frame.Zstandard.magic_number)
.zstandard
else if (isSkippableMagic(magic))
.skippable
else
error.BadMagic;
}
pub const FrameHeader = union(enum) {
zstandard: ZstandardHeader,
skippable: SkippableHeader,
};
pub const HeaderError = error{ BadMagic, EndOfStream, ReservedBitSet };
/// Returns the header of the frame at the beginning of `source`.
///
/// Errors returned:
/// - `error.BadMagic` if `source` begins with bytes not equal to the
/// Zstandard frame magic number, or outside the range of magic numbers for
/// skippable frames.
/// - `error.EndOfStream` if `source` contains fewer than 4 bytes
/// - `error.ReservedBitSet` if the frame is a Zstandard frame and any of the
/// reserved bits are set
pub fn decodeFrameHeader(source: anytype) (@TypeOf(source).Error || HeaderError)!FrameHeader {
const magic = try source.readIntLittle(u32);
const frame_type = try frameType(magic);
switch (frame_type) {
.zstandard => return FrameHeader{ .zstandard = try decodeZstandardHeader(source) },
.skippable => return FrameHeader{
.skippable = .{
.magic_number = magic,
.frame_size = try source.readIntLittle(u32),
},
},
}
}
pub const ReadWriteCount = struct {
read_count: usize,
write_count: usize,
};
/// Decodes frames from `src` into `dest`; returns the length of the result.
/// The stream should not have extra trailing bytes - either all bytes in `src`
/// will be decoded, or an error will be returned. An error will be returned if
/// a Zstandard frame in `src` does not declare its content size.
///
/// Errors returned:
/// - `error.DictionaryIdFlagUnsupported` if `src` contains a frame that
/// uses a dictionary
/// - `error.MalformedFrame` if a frame in `src` is invalid
/// - `error.UnknownContentSizeUnsupported` if a frame in `src` does not
/// declare its content size
pub fn decode(dest: []u8, src: []const u8, verify_checksum: bool) error{
MalformedFrame,
UnknownContentSizeUnsupported,
DictionaryIdFlagUnsupported,
}!usize {
var write_count: usize = 0;
var read_count: usize = 0;
while (read_count < src.len) {
const counts = decodeFrame(dest, src[read_count..], verify_checksum) catch |err| {
switch (err) {
error.UnknownContentSizeUnsupported => return error.UnknownContentSizeUnsupported,
error.DictionaryIdFlagUnsupported => return error.DictionaryIdFlagUnsupported,
else => return error.MalformedFrame,
}
};
read_count += counts.read_count;
write_count += counts.write_count;
}
return write_count;
}
/// Decodes a stream of frames from `src`; returns the decoded bytes. The stream
/// should not have extra trailing bytes - either all bytes in `src` will be
/// decoded, or an error will be returned.
///
/// Errors returned:
/// - `error.DictionaryIdFlagUnsupported` if `src` contains a frame that
/// uses a dictionary
/// - `error.MalformedFrame` if a frame in `src` is invalid
/// - `error.OutOfMemory` if `allocator` cannot allocate enough memory
pub fn decodeAlloc(
allocator: Allocator,
src: []const u8,
verify_checksum: bool,
window_size_max: usize,
) error{ DictionaryIdFlagUnsupported, MalformedFrame, OutOfMemory }![]u8 {
var result = std.ArrayList(u8).init(allocator);
errdefer result.deinit();
var read_count: usize = 0;
while (read_count < src.len) {
read_count += decodeFrameArrayList(
allocator,
&result,
src[read_count..],
verify_checksum,
window_size_max,
) catch |err| switch (err) {
error.OutOfMemory => return error.OutOfMemory,
error.DictionaryIdFlagUnsupported => return error.DictionaryIdFlagUnsupported,
else => return error.MalformedFrame,
};
}
return result.toOwnedSlice();
}
/// Decodes the frame at the start of `src` into `dest`. Returns the number of
/// bytes read from `src` and written to `dest`. This function can only decode
/// frames that declare the decompressed content size.
///
/// Errors returned:
/// - `error.BadMagic` if the first 4 bytes of `src` is not a valid magic
/// number for a Zstandard or skippable frame
/// - `error.UnknownContentSizeUnsupported` if the frame does not declare the
/// uncompressed content size
/// - `error.WindowSizeUnknown` if the frame does not have a valid window size
/// - `error.ContentTooLarge` if `dest` is smaller than the uncompressed data
/// size declared by the frame header
/// - `error.ContentSizeTooLarge` if the frame header indicates a content size
/// that is larger than `std.math.maxInt(usize)`
/// - `error.DictionaryIdFlagUnsupported` if the frame uses a dictionary
/// - `error.ChecksumFailure` if `verify_checksum` is true and the frame
/// contains a checksum that does not match the checksum of the decompressed
/// data
/// - `error.ReservedBitSet` if any of the reserved bits of the frame header
/// are set
/// - `error.EndOfStream` if `src` does not contain a complete frame
/// - `error.BadContentSize` if the content size declared by the frame does
/// not equal the actual size of decompressed data
/// - an error in `block.Error` if there are errors decoding a block
/// - `error.SkippableSizeTooLarge` if the frame is skippable and reports a
/// size greater than `src.len`
pub fn decodeFrame(
dest: []u8,
src: []const u8,
verify_checksum: bool,
) (error{
BadMagic,
UnknownContentSizeUnsupported,
ContentTooLarge,
ContentSizeTooLarge,
WindowSizeUnknown,
DictionaryIdFlagUnsupported,
SkippableSizeTooLarge,
} || FrameError)!ReadWriteCount {
var fbs = std.io.fixedBufferStream(src);
switch (try decodeFrameType(fbs.reader())) {
.zstandard => return decodeZstandardFrame(dest, src, verify_checksum),
.skippable => {
const content_size = try fbs.reader().readIntLittle(u32);
if (content_size > std.math.maxInt(usize) - 8) return error.SkippableSizeTooLarge;
const read_count = @as(usize, content_size) + 8;
if (read_count > src.len) return error.SkippableSizeTooLarge;
return ReadWriteCount{
.read_count = read_count,
.write_count = 0,
};
},
}
}
/// Decodes the frame at the start of `src` into `dest`. Returns the number of
/// bytes read from `src`.
///
/// Errors returned:
/// - `error.BadMagic` if the first 4 bytes of `src` is not a valid magic
/// number for a Zstandard or skippable frame
/// - `error.WindowSizeUnknown` if the frame does not have a valid window size
/// - `error.WindowTooLarge` if the window size is larger than
/// `window_size_max`
/// - `error.ContentSizeTooLarge` if the frame header indicates a content size
/// that is larger than `std.math.maxInt(usize)`
/// - `error.DictionaryIdFlagUnsupported` if the frame uses a dictionary
/// - `error.ChecksumFailure` if `verify_checksum` is true and the frame
/// contains a checksum that does not match the checksum of the decompressed
/// data
/// - `error.ReservedBitSet` if any of the reserved bits of the frame header
/// are set
/// - `error.EndOfStream` if `src` does not contain a complete frame
/// - `error.BadContentSize` if the content size declared by the frame does
/// not equal the actual size of decompressed data
/// - `error.OutOfMemory` if `allocator` cannot allocate enough memory
/// - an error in `block.Error` if there are errors decoding a block
/// - `error.SkippableSizeTooLarge` if the frame is skippable and reports a
/// size greater than `src.len`
pub fn decodeFrameArrayList(
allocator: Allocator,
dest: *std.ArrayList(u8),
src: []const u8,
verify_checksum: bool,
window_size_max: usize,
) (error{ BadMagic, OutOfMemory, SkippableSizeTooLarge } || FrameContext.Error || FrameError)!usize {
var fbs = std.io.fixedBufferStream(src);
const reader = fbs.reader();
const magic = try reader.readIntLittle(u32);
switch (try frameType(magic)) {
.zstandard => return decodeZstandardFrameArrayList(
allocator,
dest,
src,
verify_checksum,
window_size_max,
),
.skippable => {
const content_size = try fbs.reader().readIntLittle(u32);
if (content_size > std.math.maxInt(usize) - 8) return error.SkippableSizeTooLarge;
const read_count = @as(usize, content_size) + 8;
if (read_count > src.len) return error.SkippableSizeTooLarge;
return read_count;
},
}
}
/// Returns the frame checksum corresponding to the data fed into `hasher`
pub fn computeChecksum(hasher: *std.hash.XxHash64) u32 {
const hash = hasher.final();
return @intCast(u32, hash & 0xFFFFFFFF);
}
const FrameError = error{
ChecksumFailure,
BadContentSize,
EndOfStream,
ReservedBitSet,
} || block.Error;
/// Decode a Zstandard frame from `src` into `dest`, returning the number of
/// bytes read from `src` and written to `dest`. The first four bytes of `src`
/// must be the magic number for a Zstandard frame.
///
/// Error returned:
/// - `error.UnknownContentSizeUnsupported` if the frame does not declare the
/// uncompressed content size
/// - `error.ContentTooLarge` if `dest` is smaller than the uncompressed data
/// size declared by the frame header
/// - `error.WindowSizeUnknown` if the frame does not have a valid window size
/// - `error.DictionaryIdFlagUnsupported` if the frame uses a dictionary
/// - `error.ContentSizeTooLarge` if the frame header indicates a content size
/// that is larger than `std.math.maxInt(usize)`
/// - `error.ChecksumFailure` if `verify_checksum` is true and the frame
/// contains a checksum that does not match the checksum of the decompressed
/// data
/// - `error.ReservedBitSet` if the reserved bit of the frame header is set
/// - `error.EndOfStream` if `src` does not contain a complete frame
/// - an error in `block.Error` if there are errors decoding a block
/// - `error.BadContentSize` if the content size declared by the frame does
/// not equal the actual size of decompressed data
pub fn decodeZstandardFrame(
dest: []u8,
src: []const u8,
verify_checksum: bool,
) (error{
UnknownContentSizeUnsupported,
ContentTooLarge,
ContentSizeTooLarge,
WindowSizeUnknown,
DictionaryIdFlagUnsupported,
} || FrameError)!ReadWriteCount {
assert(readInt(u32, src[0..4]) == frame.Zstandard.magic_number);
var consumed_count: usize = 4;
var frame_context = context: {
var fbs = std.io.fixedBufferStream(src[consumed_count..]);
var source = fbs.reader();
const frame_header = try decodeZstandardHeader(source);
consumed_count += fbs.pos;
break :context FrameContext.init(
frame_header,
std.math.maxInt(usize),
verify_checksum,
) catch |err| switch (err) {
error.WindowTooLarge => unreachable,
inline else => |e| return e,
};
};
const counts = try decodeZStandardFrameBlocks(
dest,
src[consumed_count..],
&frame_context,
);
return ReadWriteCount{
.read_count = counts.read_count + consumed_count,
.write_count = counts.write_count,
};
}
pub fn decodeZStandardFrameBlocks(
dest: []u8,
src: []const u8,
frame_context: *FrameContext,
) (error{ ContentTooLarge, UnknownContentSizeUnsupported } || FrameError)!ReadWriteCount {
const content_size = frame_context.content_size orelse
return error.UnknownContentSizeUnsupported;
if (dest.len < content_size) return error.ContentTooLarge;
var consumed_count: usize = 0;
const written_count = decodeFrameBlocksInner(
dest[0..content_size],
src[consumed_count..],
&consumed_count,
if (frame_context.hasher_opt) |*hasher| hasher else null,
frame_context.block_size_max,
) catch |err| switch (err) {
error.DestTooSmall => return error.BadContentSize,
inline else => |e| return e,
};
if (written_count != content_size) return error.BadContentSize;
if (frame_context.has_checksum) {
if (src.len < consumed_count + 4) return error.EndOfStream;
const checksum = readIntSlice(u32, src[consumed_count .. consumed_count + 4]);
consumed_count += 4;
if (frame_context.hasher_opt) |*hasher| {
if (checksum != computeChecksum(hasher)) return error.ChecksumFailure;
}
}
return ReadWriteCount{ .read_count = consumed_count, .write_count = written_count };
}
pub const FrameContext = struct {
hasher_opt: ?std.hash.XxHash64,
window_size: usize,
has_checksum: bool,
block_size_max: usize,
content_size: ?usize,
const Error = error{
DictionaryIdFlagUnsupported,
WindowSizeUnknown,
WindowTooLarge,
ContentSizeTooLarge,
};
/// Validates `frame_header` and returns the associated `FrameContext`.
///
/// Errors returned:
/// - `error.DictionaryIdFlagUnsupported` if the frame uses a dictionary
/// - `error.WindowSizeUnknown` if the frame does not have a valid window
/// size
/// - `error.WindowTooLarge` if the window size is larger than
/// `window_size_max`
/// - `error.ContentSizeTooLarge` if the frame header indicates a content
/// size larger than `std.math.maxInt(usize)`
pub fn init(
frame_header: ZstandardHeader,
window_size_max: usize,
verify_checksum: bool,
) Error!FrameContext {
if (frame_header.descriptor.dictionary_id_flag != 0)
return error.DictionaryIdFlagUnsupported;
const window_size_raw = frameWindowSize(frame_header) orelse return error.WindowSizeUnknown;
const window_size = if (window_size_raw > window_size_max)
return error.WindowTooLarge
else
@intCast(usize, window_size_raw);
const should_compute_checksum =
frame_header.descriptor.content_checksum_flag and verify_checksum;
const content_size = if (frame_header.content_size) |size|
std.math.cast(usize, size) orelse return error.ContentSizeTooLarge
else
null;
return .{
.hasher_opt = if (should_compute_checksum) std.hash.XxHash64.init(0) else null,
.window_size = window_size,
.has_checksum = frame_header.descriptor.content_checksum_flag,
.block_size_max = @min(1 << 17, window_size),
.content_size = content_size,
};
}
};
/// Decode a Zstandard frame from `src` and return the number of bytes read; see
/// `decodeZstandardFrame()`. The first four bytes of `src` must be the magic
/// number for a Zstandard frame.
///
/// Errors returned:
/// - `error.WindowSizeUnknown` if the frame does not have a valid window size
/// - `error.WindowTooLarge` if the window size is larger than
/// `window_size_max`
/// - `error.DictionaryIdFlagUnsupported` if the frame uses a dictionary
/// - `error.ContentSizeTooLarge` if the frame header indicates a content size
/// that is larger than `std.math.maxInt(usize)`
/// - `error.ChecksumFailure` if `verify_checksum` is true and the frame
/// contains a checksum that does not match the checksum of the decompressed
/// data
/// - `error.ReservedBitSet` if the reserved bit of the frame header is set
/// - `error.EndOfStream` if `src` does not contain a complete frame
/// - `error.OutOfMemory` if `allocator` cannot allocate enough memory
/// - an error in `block.Error` if there are errors decoding a block
/// - `error.BadContentSize` if the content size declared by the frame does
/// not equal the size of decompressed data
pub fn decodeZstandardFrameArrayList(
allocator: Allocator,
dest: *std.ArrayList(u8),
src: []const u8,
verify_checksum: bool,
window_size_max: usize,
) (error{OutOfMemory} || FrameContext.Error || FrameError)!usize {
assert(readInt(u32, src[0..4]) == frame.Zstandard.magic_number);
var consumed_count: usize = 4;
var frame_context = context: {
var fbs = std.io.fixedBufferStream(src[consumed_count..]);
var source = fbs.reader();
const frame_header = try decodeZstandardHeader(source);
consumed_count += fbs.pos;
break :context try FrameContext.init(frame_header, window_size_max, verify_checksum);
};
consumed_count += try decodeZstandardFrameBlocksArrayList(
allocator,
dest,
src[consumed_count..],
&frame_context,
);
return consumed_count;
}
pub fn decodeZstandardFrameBlocksArrayList(
allocator: Allocator,
dest: *std.ArrayList(u8),
src: []const u8,
frame_context: *FrameContext,
) (error{OutOfMemory} || FrameError)!usize {
const initial_len = dest.items.len;
var ring_buffer = try RingBuffer.init(allocator, frame_context.window_size);
defer ring_buffer.deinit(allocator);
// These tables take 7680 bytes
var literal_fse_data: [types.compressed_block.table_size_max.literal]Table.Fse = undefined;
var match_fse_data: [types.compressed_block.table_size_max.match]Table.Fse = undefined;
var offset_fse_data: [types.compressed_block.table_size_max.offset]Table.Fse = undefined;
var block_header = try block.decodeBlockHeaderSlice(src);
var consumed_count: usize = 3;
var decode_state = block.DecodeState.init(&literal_fse_data, &match_fse_data, &offset_fse_data);
while (true) : ({
block_header = try block.decodeBlockHeaderSlice(src[consumed_count..]);
consumed_count += 3;
}) {
const written_size = try block.decodeBlockRingBuffer(
&ring_buffer,
src[consumed_count..],
block_header,
&decode_state,
&consumed_count,
frame_context.block_size_max,
);
if (frame_context.content_size) |size| {
if (dest.items.len - initial_len > size) {
return error.BadContentSize;
}
}
if (written_size > 0) {
const written_slice = ring_buffer.sliceLast(written_size);
try dest.appendSlice(written_slice.first);
try dest.appendSlice(written_slice.second);
if (frame_context.hasher_opt) |*hasher| {
hasher.update(written_slice.first);
hasher.update(written_slice.second);
}
}
if (block_header.last_block) break;
}
if (frame_context.content_size) |size| {
if (dest.items.len - initial_len != size) {
return error.BadContentSize;
}
}
if (frame_context.has_checksum) {
if (src.len < consumed_count + 4) return error.EndOfStream;
const checksum = readIntSlice(u32, src[consumed_count .. consumed_count + 4]);
consumed_count += 4;
if (frame_context.hasher_opt) |*hasher| {
if (checksum != computeChecksum(hasher)) return error.ChecksumFailure;
}
}
return consumed_count;
}
fn decodeFrameBlocksInner(
dest: []u8,
src: []const u8,
consumed_count: *usize,
hash: ?*std.hash.XxHash64,
block_size_max: usize,
) (error{ EndOfStream, DestTooSmall } || block.Error)!usize {
// These tables take 7680 bytes
var literal_fse_data: [types.compressed_block.table_size_max.literal]Table.Fse = undefined;
var match_fse_data: [types.compressed_block.table_size_max.match]Table.Fse = undefined;
var offset_fse_data: [types.compressed_block.table_size_max.offset]Table.Fse = undefined;
var block_header = try block.decodeBlockHeaderSlice(src);
var bytes_read: usize = 3;
defer consumed_count.* += bytes_read;
var decode_state = block.DecodeState.init(&literal_fse_data, &match_fse_data, &offset_fse_data);
var count: usize = 0;
while (true) : ({
block_header = try block.decodeBlockHeaderSlice(src[bytes_read..]);
bytes_read += 3;
}) {
const written_size = try block.decodeBlock(
dest,
src[bytes_read..],
block_header,
&decode_state,
&bytes_read,
block_size_max,
count,
);
if (hash) |hash_state| hash_state.update(dest[count .. count + written_size]);
count += written_size;
if (block_header.last_block) break;
}
return count;
}
/// Decode the header of a skippable frame. The first four bytes of `src` must
/// be a valid magic number for a skippable frame.
pub fn decodeSkippableHeader(src: *const [8]u8) SkippableHeader {
const magic = readInt(u32, src[0..4]);
assert(isSkippableMagic(magic));
const frame_size = readInt(u32, src[4..8]);
return .{
.magic_number = magic,
.frame_size = frame_size,
};
}
/// Returns the window size required to decompress a frame, or `null` if it
/// cannot be determined (which indicates a malformed frame header).
pub fn frameWindowSize(header: ZstandardHeader) ?u64 {
if (header.window_descriptor) |descriptor| {
const exponent = (descriptor & 0b11111000) >> 3;
const mantissa = descriptor & 0b00000111;
const window_log = 10 + exponent;
const window_base = @as(u64, 1) << @intCast(u6, window_log);
const window_add = (window_base / 8) * mantissa;
return window_base + window_add;
} else return header.content_size;
}
/// Decode the header of a Zstandard frame.
///
/// Errors returned:
/// - `error.ReservedBitSet` if any of the reserved bits of the header are set
/// - `error.EndOfStream` if `source` does not contain a complete header
pub fn decodeZstandardHeader(
source: anytype,
) (@TypeOf(source).Error || error{ EndOfStream, ReservedBitSet })!ZstandardHeader {
const descriptor = @bitCast(ZstandardHeader.Descriptor, try source.readByte());
if (descriptor.reserved) return error.ReservedBitSet;
var window_descriptor: ?u8 = null;
if (!descriptor.single_segment_flag) {
window_descriptor = try source.readByte();
}
var dictionary_id: ?u32 = null;
if (descriptor.dictionary_id_flag > 0) {
// if flag is 3 then field_size = 4, else field_size = flag
const field_size = (@as(u4, 1) << descriptor.dictionary_id_flag) >> 1;
dictionary_id = try source.readVarInt(u32, .Little, field_size);
}
var content_size: ?u64 = null;
if (descriptor.single_segment_flag or descriptor.content_size_flag > 0) {
const field_size = @as(u4, 1) << descriptor.content_size_flag;
content_size = try source.readVarInt(u64, .Little, field_size);
if (field_size == 2) content_size.? += 256;
}
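// Worked example (illustrative): for the header bytes 0x20, 0x0A the
// descriptor 0x20 sets single_segment_flag with content_size_flag == 0,
// so no window descriptor byte follows, field_size == 1, and the content
// size is 0x0A = 10 bytes (frameWindowSize then falls back to this value).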
const header = ZstandardHeader{
.descriptor = descriptor,
.window_descriptor = window_descriptor,
.dictionary_id = dictionary_id,
.content_size = content_size,
};
return header;
}
test {
std.testing.refAllDecls(@This());
}
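A buffer-allocating usage sketch (a hypothetical test, not part of this commit; the relative testdata path is assumed from this commit's layout):
test "decodeAlloc usage (illustrative)" {
    const compressed = @embedFile("../testdata/rfc8478.txt.zst.3");
    const expected = @embedFile("../testdata/rfc8478.txt");
    const result = try decodeAlloc(std.testing.allocator, compressed, true, 1 << 23);
    defer std.testing.allocator.free(result);
    try std.testing.expectEqualSlices(u8, expected, result);
}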

82
lib/std/compress/zstandard/readers.zig Normal file

@@ -0,0 +1,82 @@
const std = @import("std");
pub const ReversedByteReader = struct {
remaining_bytes: usize,
bytes: []const u8,
const Reader = std.io.Reader(*ReversedByteReader, error{}, readFn);
pub fn init(bytes: []const u8) ReversedByteReader {
return .{
.bytes = bytes,
.remaining_bytes = bytes.len,
};
}
pub fn reader(self: *ReversedByteReader) Reader {
return .{ .context = self };
}
fn readFn(ctx: *ReversedByteReader, buffer: []u8) !usize {
if (ctx.remaining_bytes == 0) return 0;
const byte_index = ctx.remaining_bytes - 1;
buffer[0] = ctx.bytes[byte_index];
// buffer[0] = @bitReverse(ctx.bytes[byte_index]);
ctx.remaining_bytes = byte_index;
return 1;
}
};
/// A bit reader for reading the reversed bit streams used to encode
/// FSE compressed data.
pub const ReverseBitReader = struct {
byte_reader: ReversedByteReader,
bit_reader: std.io.BitReader(.Big, ReversedByteReader.Reader),
pub fn init(self: *ReverseBitReader, bytes: []const u8) error{BitStreamHasNoStartBit}!void {
self.byte_reader = ReversedByteReader.init(bytes);
self.bit_reader = std.io.bitReader(.Big, self.byte_reader.reader());
if (bytes.len == 0) return;
var i: usize = 0;
while (i < 8 and 0 == self.readBitsNoEof(u1, 1) catch unreachable) : (i += 1) {}
if (i == 8) return error.BitStreamHasNoStartBit;
}
pub fn readBitsNoEof(self: *@This(), comptime U: type, num_bits: usize) error{EndOfStream}!U {
return self.bit_reader.readBitsNoEof(U, num_bits);
}
pub fn readBits(self: *@This(), comptime U: type, num_bits: usize, out_bits: *usize) error{}!U {
return try self.bit_reader.readBits(U, num_bits, out_bits);
}
pub fn alignToByte(self: *@This()) void {
self.bit_reader.alignToByte();
}
pub fn isEmpty(self: ReverseBitReader) bool {
return self.byte_reader.remaining_bytes == 0 and self.bit_reader.bit_count == 0;
}
};
pub fn BitReader(comptime Reader: type) type {
return struct {
underlying: std.io.BitReader(.Little, Reader),
pub fn readBitsNoEof(self: *@This(), comptime U: type, num_bits: usize) !U {
return self.underlying.readBitsNoEof(U, num_bits);
}
pub fn readBits(self: *@This(), comptime U: type, num_bits: usize, out_bits: *usize) !U {
return self.underlying.readBits(U, num_bits, out_bits);
}
pub fn alignToByte(self: *@This()) void {
self.underlying.alignToByte();
}
};
}
pub fn bitReader(reader: anytype) BitReader(@TypeOf(reader)) {
return .{ .underlying = std.io.bitReader(.Little, reader) };
}
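A sketch (a hypothetical test, not part of this commit) of how `ReverseBitReader.init` consumes padding up to the start bit:
test "ReverseBitReader finds the start bit (illustrative)" {
    // 0x41 = 0b0100_0001: reading MSB-first from the last byte, one zero pad
    // bit is skipped, the 1 marks the stream start, and 6 data bits remain.
    var bits: ReverseBitReader = undefined;
    try bits.init(&[_]u8{0x41});
    try std.testing.expectEqual(@as(u6, 1), try bits.readBitsNoEof(u6, 6));
    try std.testing.expect(bits.isEmpty());
}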

401
lib/std/compress/zstandard/types.zig Normal file

@@ -0,0 +1,401 @@
pub const frame = struct {
pub const Kind = enum { zstandard, skippable };
pub const Zstandard = struct {
pub const magic_number = 0xFD2FB528;
header: Header,
data_blocks: []Block,
checksum: ?u32,
pub const Header = struct {
descriptor: Descriptor,
window_descriptor: ?u8,
dictionary_id: ?u32,
content_size: ?u64,
pub const Descriptor = packed struct {
dictionary_id_flag: u2,
content_checksum_flag: bool,
reserved: bool,
unused: bool,
single_segment_flag: bool,
content_size_flag: u2,
};
};
pub const Block = struct {
pub const Header = struct {
last_block: bool,
block_type: Block.Type,
block_size: u21,
};
pub const Type = enum(u2) {
raw,
rle,
compressed,
reserved,
};
};
};
pub const Skippable = struct {
pub const magic_number_min = 0x184D2A50;
pub const magic_number_max = 0x184D2A5F;
pub const Header = struct {
magic_number: u32,
frame_size: u32,
};
};
};
pub const compressed_block = struct {
pub const LiteralsSection = struct {
header: Header,
huffman_tree: ?HuffmanTree,
streams: Streams,
pub const Streams = union(enum) {
one: []const u8,
four: [4][]const u8,
};
pub const Header = struct {
block_type: BlockType,
size_format: u2,
regenerated_size: u20,
compressed_size: ?u18,
};
pub const BlockType = enum(u2) {
raw,
rle,
compressed,
treeless,
};
pub const HuffmanTree = struct {
max_bit_count: u4,
symbol_count_minus_one: u8,
nodes: [256]PrefixedSymbol,
pub const PrefixedSymbol = struct {
symbol: u8,
prefix: u16,
weight: u4,
};
pub const Result = union(enum) {
symbol: u8,
index: usize,
};
pub fn query(self: HuffmanTree, index: usize, prefix: u16) error{NotFound}!Result {
var node = self.nodes[index];
const weight = node.weight;
var i: usize = index;
while (node.weight == weight) {
if (node.prefix == prefix) return Result{ .symbol = node.symbol };
if (i == 0) return error.NotFound;
i -= 1;
node = self.nodes[i];
}
return Result{ .index = i };
}
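// For example (illustrative): with max_bit_count == 2, a symbol of
// weight 1 codes in (2 + 1) - 1 = 2 bits, while weight 0 marks a
// symbol that does not appear in the tree.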
pub fn weightToBitCount(weight: u4, max_bit_count: u4) u4 {
return if (weight == 0) 0 else ((max_bit_count + 1) - weight);
}
};
pub const StreamCount = enum { one, four };
pub fn streamCount(size_format: u2, block_type: BlockType) StreamCount {
return switch (block_type) {
.raw, .rle => .one,
.compressed, .treeless => if (size_format == 0) .one else .four,
};
}
};
pub const SequencesSection = struct {
header: SequencesSection.Header,
literals_length_table: Table,
offset_table: Table,
match_length_table: Table,
pub const Header = struct {
sequence_count: u24,
match_lengths: Mode,
offsets: Mode,
literal_lengths: Mode,
pub const Mode = enum(u2) {
predefined,
rle,
fse,
repeat,
};
};
};
pub const Table = union(enum) {
fse: []const Fse,
rle: u8,
pub const Fse = struct {
symbol: u8,
baseline: u16,
bits: u8,
};
};
pub const literals_length_code_table = [36]struct { u32, u5 }{
.{ 0, 0 }, .{ 1, 0 }, .{ 2, 0 }, .{ 3, 0 },
.{ 4, 0 }, .{ 5, 0 }, .{ 6, 0 }, .{ 7, 0 },
.{ 8, 0 }, .{ 9, 0 }, .{ 10, 0 }, .{ 11, 0 },
.{ 12, 0 }, .{ 13, 0 }, .{ 14, 0 }, .{ 15, 0 },
.{ 16, 1 }, .{ 18, 1 }, .{ 20, 1 }, .{ 22, 1 },
.{ 24, 2 }, .{ 28, 2 }, .{ 32, 3 }, .{ 40, 3 },
.{ 48, 4 }, .{ 64, 6 }, .{ 128, 7 }, .{ 256, 8 },
.{ 512, 9 }, .{ 1024, 10 }, .{ 2048, 11 }, .{ 4096, 12 },
.{ 8192, 13 }, .{ 16384, 14 }, .{ 32768, 15 }, .{ 65536, 16 },
};
pub const match_length_code_table = [53]struct { u32, u5 }{
.{ 3, 0 }, .{ 4, 0 }, .{ 5, 0 }, .{ 6, 0 }, .{ 7, 0 }, .{ 8, 0 },
.{ 9, 0 }, .{ 10, 0 }, .{ 11, 0 }, .{ 12, 0 }, .{ 13, 0 }, .{ 14, 0 },
.{ 15, 0 }, .{ 16, 0 }, .{ 17, 0 }, .{ 18, 0 }, .{ 19, 0 }, .{ 20, 0 },
.{ 21, 0 }, .{ 22, 0 }, .{ 23, 0 }, .{ 24, 0 }, .{ 25, 0 }, .{ 26, 0 },
.{ 27, 0 }, .{ 28, 0 }, .{ 29, 0 }, .{ 30, 0 }, .{ 31, 0 }, .{ 32, 0 },
.{ 33, 0 }, .{ 34, 0 }, .{ 35, 1 }, .{ 37, 1 }, .{ 39, 1 }, .{ 41, 1 },
.{ 43, 2 }, .{ 47, 2 }, .{ 51, 3 }, .{ 59, 3 }, .{ 67, 4 }, .{ 83, 4 },
.{ 99, 5 }, .{ 131, 7 }, .{ 259, 8 }, .{ 515, 9 }, .{ 1027, 10 }, .{ 2051, 11 },
.{ 4099, 12 }, .{ 8195, 13 }, .{ 16387, 14 }, .{ 32771, 15 }, .{ 65539, 16 },
};
pub const literals_length_default_distribution = [36]i16{
4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
-1, -1, -1, -1,
};
pub const match_lengths_default_distribution = [53]i16{
1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1,
-1, -1, -1, -1, -1,
};
pub const offset_codes_default_distribution = [29]i16{
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1,
};
pub const predefined_literal_fse_table = Table{
.fse = &[64]Table.Fse{
.{ .symbol = 0, .bits = 4, .baseline = 0 },
.{ .symbol = 0, .bits = 4, .baseline = 16 },
.{ .symbol = 1, .bits = 5, .baseline = 32 },
.{ .symbol = 3, .bits = 5, .baseline = 0 },
.{ .symbol = 4, .bits = 5, .baseline = 0 },
.{ .symbol = 6, .bits = 5, .baseline = 0 },
.{ .symbol = 7, .bits = 5, .baseline = 0 },
.{ .symbol = 9, .bits = 5, .baseline = 0 },
.{ .symbol = 10, .bits = 5, .baseline = 0 },
.{ .symbol = 12, .bits = 5, .baseline = 0 },
.{ .symbol = 14, .bits = 6, .baseline = 0 },
.{ .symbol = 16, .bits = 5, .baseline = 0 },
.{ .symbol = 18, .bits = 5, .baseline = 0 },
.{ .symbol = 19, .bits = 5, .baseline = 0 },
.{ .symbol = 21, .bits = 5, .baseline = 0 },
.{ .symbol = 22, .bits = 5, .baseline = 0 },
.{ .symbol = 24, .bits = 5, .baseline = 0 },
.{ .symbol = 25, .bits = 5, .baseline = 32 },
.{ .symbol = 26, .bits = 5, .baseline = 0 },
.{ .symbol = 27, .bits = 6, .baseline = 0 },
.{ .symbol = 29, .bits = 6, .baseline = 0 },
.{ .symbol = 31, .bits = 6, .baseline = 0 },
.{ .symbol = 0, .bits = 4, .baseline = 32 },
.{ .symbol = 1, .bits = 4, .baseline = 0 },
.{ .symbol = 2, .bits = 5, .baseline = 0 },
.{ .symbol = 4, .bits = 5, .baseline = 32 },
.{ .symbol = 5, .bits = 5, .baseline = 0 },
.{ .symbol = 7, .bits = 5, .baseline = 32 },
.{ .symbol = 8, .bits = 5, .baseline = 0 },
.{ .symbol = 10, .bits = 5, .baseline = 32 },
.{ .symbol = 11, .bits = 5, .baseline = 0 },
.{ .symbol = 13, .bits = 6, .baseline = 0 },
.{ .symbol = 16, .bits = 5, .baseline = 32 },
.{ .symbol = 17, .bits = 5, .baseline = 0 },
.{ .symbol = 19, .bits = 5, .baseline = 32 },
.{ .symbol = 20, .bits = 5, .baseline = 0 },
.{ .symbol = 22, .bits = 5, .baseline = 32 },
.{ .symbol = 23, .bits = 5, .baseline = 0 },
.{ .symbol = 25, .bits = 4, .baseline = 0 },
.{ .symbol = 25, .bits = 4, .baseline = 16 },
.{ .symbol = 26, .bits = 5, .baseline = 32 },
.{ .symbol = 28, .bits = 6, .baseline = 0 },
.{ .symbol = 30, .bits = 6, .baseline = 0 },
.{ .symbol = 0, .bits = 4, .baseline = 48 },
.{ .symbol = 1, .bits = 4, .baseline = 16 },
.{ .symbol = 2, .bits = 5, .baseline = 32 },
.{ .symbol = 3, .bits = 5, .baseline = 32 },
.{ .symbol = 5, .bits = 5, .baseline = 32 },
.{ .symbol = 6, .bits = 5, .baseline = 32 },
.{ .symbol = 8, .bits = 5, .baseline = 32 },
.{ .symbol = 9, .bits = 5, .baseline = 32 },
.{ .symbol = 11, .bits = 5, .baseline = 32 },
.{ .symbol = 12, .bits = 5, .baseline = 32 },
.{ .symbol = 15, .bits = 6, .baseline = 0 },
.{ .symbol = 17, .bits = 5, .baseline = 32 },
.{ .symbol = 18, .bits = 5, .baseline = 32 },
.{ .symbol = 20, .bits = 5, .baseline = 32 },
.{ .symbol = 21, .bits = 5, .baseline = 32 },
.{ .symbol = 23, .bits = 5, .baseline = 32 },
.{ .symbol = 24, .bits = 5, .baseline = 32 },
.{ .symbol = 35, .bits = 6, .baseline = 0 },
.{ .symbol = 34, .bits = 6, .baseline = 0 },
.{ .symbol = 33, .bits = 6, .baseline = 0 },
.{ .symbol = 32, .bits = 6, .baseline = 0 },
},
};
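/// Predefined match length FSE table: 64 states (accuracy log 6).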
pub const predefined_match_fse_table = Table{
.fse = &[64]Table.Fse{
.{ .symbol = 0, .bits = 6, .baseline = 0 },
.{ .symbol = 1, .bits = 4, .baseline = 0 },
.{ .symbol = 2, .bits = 5, .baseline = 32 },
.{ .symbol = 3, .bits = 5, .baseline = 0 },
.{ .symbol = 5, .bits = 5, .baseline = 0 },
.{ .symbol = 6, .bits = 5, .baseline = 0 },
.{ .symbol = 8, .bits = 5, .baseline = 0 },
.{ .symbol = 10, .bits = 6, .baseline = 0 },
.{ .symbol = 13, .bits = 6, .baseline = 0 },
.{ .symbol = 16, .bits = 6, .baseline = 0 },
.{ .symbol = 19, .bits = 6, .baseline = 0 },
.{ .symbol = 22, .bits = 6, .baseline = 0 },
.{ .symbol = 25, .bits = 6, .baseline = 0 },
.{ .symbol = 28, .bits = 6, .baseline = 0 },
.{ .symbol = 31, .bits = 6, .baseline = 0 },
.{ .symbol = 33, .bits = 6, .baseline = 0 },
.{ .symbol = 35, .bits = 6, .baseline = 0 },
.{ .symbol = 37, .bits = 6, .baseline = 0 },
.{ .symbol = 39, .bits = 6, .baseline = 0 },
.{ .symbol = 41, .bits = 6, .baseline = 0 },
.{ .symbol = 43, .bits = 6, .baseline = 0 },
.{ .symbol = 45, .bits = 6, .baseline = 0 },
.{ .symbol = 1, .bits = 4, .baseline = 16 },
.{ .symbol = 2, .bits = 4, .baseline = 0 },
.{ .symbol = 3, .bits = 5, .baseline = 32 },
.{ .symbol = 4, .bits = 5, .baseline = 0 },
.{ .symbol = 6, .bits = 5, .baseline = 32 },
.{ .symbol = 7, .bits = 5, .baseline = 0 },
.{ .symbol = 9, .bits = 6, .baseline = 0 },
.{ .symbol = 12, .bits = 6, .baseline = 0 },
.{ .symbol = 15, .bits = 6, .baseline = 0 },
.{ .symbol = 18, .bits = 6, .baseline = 0 },
.{ .symbol = 21, .bits = 6, .baseline = 0 },
.{ .symbol = 24, .bits = 6, .baseline = 0 },
.{ .symbol = 27, .bits = 6, .baseline = 0 },
.{ .symbol = 30, .bits = 6, .baseline = 0 },
.{ .symbol = 32, .bits = 6, .baseline = 0 },
.{ .symbol = 34, .bits = 6, .baseline = 0 },
.{ .symbol = 36, .bits = 6, .baseline = 0 },
.{ .symbol = 38, .bits = 6, .baseline = 0 },
.{ .symbol = 40, .bits = 6, .baseline = 0 },
.{ .symbol = 42, .bits = 6, .baseline = 0 },
.{ .symbol = 44, .bits = 6, .baseline = 0 },
.{ .symbol = 1, .bits = 4, .baseline = 32 },
.{ .symbol = 1, .bits = 4, .baseline = 48 },
.{ .symbol = 2, .bits = 4, .baseline = 16 },
.{ .symbol = 4, .bits = 5, .baseline = 32 },
.{ .symbol = 5, .bits = 5, .baseline = 32 },
.{ .symbol = 7, .bits = 5, .baseline = 32 },
.{ .symbol = 8, .bits = 5, .baseline = 32 },
.{ .symbol = 11, .bits = 6, .baseline = 0 },
.{ .symbol = 14, .bits = 6, .baseline = 0 },
.{ .symbol = 17, .bits = 6, .baseline = 0 },
.{ .symbol = 20, .bits = 6, .baseline = 0 },
.{ .symbol = 23, .bits = 6, .baseline = 0 },
.{ .symbol = 26, .bits = 6, .baseline = 0 },
.{ .symbol = 29, .bits = 6, .baseline = 0 },
.{ .symbol = 52, .bits = 6, .baseline = 0 },
.{ .symbol = 51, .bits = 6, .baseline = 0 },
.{ .symbol = 50, .bits = 6, .baseline = 0 },
.{ .symbol = 49, .bits = 6, .baseline = 0 },
.{ .symbol = 48, .bits = 6, .baseline = 0 },
.{ .symbol = 47, .bits = 6, .baseline = 0 },
.{ .symbol = 46, .bits = 6, .baseline = 0 },
},
};
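/// Predefined offset FSE table: 32 states (accuracy log 5).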
pub const predefined_offset_fse_table = Table{
.fse = &[32]Table.Fse{
.{ .symbol = 0, .bits = 5, .baseline = 0 },
.{ .symbol = 6, .bits = 4, .baseline = 0 },
.{ .symbol = 9, .bits = 5, .baseline = 0 },
.{ .symbol = 15, .bits = 5, .baseline = 0 },
.{ .symbol = 21, .bits = 5, .baseline = 0 },
.{ .symbol = 3, .bits = 5, .baseline = 0 },
.{ .symbol = 7, .bits = 4, .baseline = 0 },
.{ .symbol = 12, .bits = 5, .baseline = 0 },
.{ .symbol = 18, .bits = 5, .baseline = 0 },
.{ .symbol = 23, .bits = 5, .baseline = 0 },
.{ .symbol = 5, .bits = 5, .baseline = 0 },
.{ .symbol = 8, .bits = 4, .baseline = 0 },
.{ .symbol = 14, .bits = 5, .baseline = 0 },
.{ .symbol = 20, .bits = 5, .baseline = 0 },
.{ .symbol = 2, .bits = 5, .baseline = 0 },
.{ .symbol = 7, .bits = 4, .baseline = 16 },
.{ .symbol = 11, .bits = 5, .baseline = 0 },
.{ .symbol = 17, .bits = 5, .baseline = 0 },
.{ .symbol = 22, .bits = 5, .baseline = 0 },
.{ .symbol = 4, .bits = 5, .baseline = 0 },
.{ .symbol = 8, .bits = 4, .baseline = 16 },
.{ .symbol = 13, .bits = 5, .baseline = 0 },
.{ .symbol = 19, .bits = 5, .baseline = 0 },
.{ .symbol = 1, .bits = 5, .baseline = 0 },
.{ .symbol = 6, .bits = 4, .baseline = 16 },
.{ .symbol = 10, .bits = 5, .baseline = 0 },
.{ .symbol = 16, .bits = 5, .baseline = 0 },
.{ .symbol = 28, .bits = 5, .baseline = 0 },
.{ .symbol = 27, .bits = 5, .baseline = 0 },
.{ .symbol = 26, .bits = 5, .baseline = 0 },
.{ .symbol = 25, .bits = 5, .baseline = 0 },
.{ .symbol = 24, .bits = 5, .baseline = 0 },
},
};
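// Starting values for the three repeated offsets (RFC 8478: 1, 4 and 8).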
pub const start_repeated_offset_1 = 1;
pub const start_repeated_offset_2 = 4;
pub const start_repeated_offset_3 = 8;
pub const table_accuracy_log_max = struct {
pub const literal = 9;
pub const match = 9;
pub const offset = 8;
};
pub const table_symbol_count_max = struct {
pub const literal = 36;
pub const match = 53;
pub const offset = 32;
};
pub const default_accuracy_log = struct {
pub const literal = 6;
pub const match = 6;
pub const offset = 5;
};
pub const table_size_max = struct {
pub const literal = 1 << table_accuracy_log_max.literal;
pub const match = 1 << table_accuracy_log_max.match;
pub const offset = 1 << table_accuracy_log_max.offset;
};
};
test {
const testing = @import("std").testing;
testing.refAllDeclsRecursive(@This());
}


@@ -32,6 +32,10 @@ pub const CityHash64 = cityhash.CityHash64;
const wyhash = @import("hash/wyhash.zig");
pub const Wyhash = wyhash.Wyhash;
const xxhash = @import("hash/xxhash.zig");
pub const XxHash64 = xxhash.XxHash64;
pub const XxHash32 = xxhash.XxHash32;
test "hash" {
_ = adler;
_ = auto_hash;
@@ -40,4 +44,5 @@ test "hash" {
_ = murmur;
_ = cityhash;
_ = wyhash;
_ = xxhash;
}

268
lib/std/hash/xxhash.zig Normal file

@@ -0,0 +1,268 @@
const std = @import("std");
const mem = std.mem;
const expectEqual = std.testing.expectEqual;
const rotl = std.math.rotl;
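/// Streaming implementation of the 64-bit variant of xxHash
/// (https://github.com/Cyan4973/xxHash): call `init` with a seed, feed
/// data with `update`, and read the digest with `final`.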
pub const XxHash64 = struct {
acc1: u64,
acc2: u64,
acc3: u64,
acc4: u64,
seed: u64,
buf: [32]u8,
buf_len: usize,
byte_count: usize,
const prime_1 = 0x9E3779B185EBCA87; // 0b1001111000110111011110011011000110000101111010111100101010000111
const prime_2 = 0xC2B2AE3D27D4EB4F; // 0b1100001010110010101011100011110100100111110101001110101101001111
const prime_3 = 0x165667B19E3779F9; // 0b0001011001010110011001111011000110011110001101110111100111111001
const prime_4 = 0x85EBCA77C2B2AE63; // 0b1000010111101011110010100111011111000010101100101010111001100011
const prime_5 = 0x27D4EB2F165667C5; // 0b0010011111010100111010110010111100010110010101100110011111000101
pub fn init(seed: u64) XxHash64 {
return XxHash64{
.seed = seed,
.acc1 = seed +% prime_1 +% prime_2,
.acc2 = seed +% prime_2,
.acc3 = seed,
.acc4 = seed -% prime_1,
.buf = undefined,
.buf_len = 0,
.byte_count = 0,
};
}
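/// Buffer input until a full 32-byte stripe is available, then fold
/// complete stripes into the four accumulators; leftover bytes are kept
/// for the next call to `update` or `final`.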
pub fn update(self: *XxHash64, input: []const u8) void {
if (input.len < 32 - self.buf_len) {
mem.copy(u8, self.buf[self.buf_len..], input);
self.buf_len += input.len;
return;
}
var i: usize = 0;
if (self.buf_len > 0) {
i = 32 - self.buf_len;
mem.copy(u8, self.buf[self.buf_len..], input[0..i]);
self.processStripe(&self.buf);
self.buf_len = 0;
}
while (i + 32 <= input.len) : (i += 32) {
self.processStripe(input[i..][0..32]);
}
const remaining_bytes = input[i..];
mem.copy(u8, &self.buf, remaining_bytes);
self.buf_len = remaining_bytes.len;
}
inline fn processStripe(self: *XxHash64, buf: *const [32]u8) void {
self.acc1 = round(self.acc1, mem.readIntLittle(u64, buf[0..8]));
self.acc2 = round(self.acc2, mem.readIntLittle(u64, buf[8..16]));
self.acc3 = round(self.acc3, mem.readIntLittle(u64, buf[16..24]));
self.acc4 = round(self.acc4, mem.readIntLittle(u64, buf[24..32]));
self.byte_count += 32;
}
inline fn round(acc: u64, lane: u64) u64 {
const a = acc +% (lane *% prime_2);
const b = rotl(u64, a, 31);
return b *% prime_1;
}
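/// Merge the accumulators (or fall back to `seed +% prime_5` when less
/// than one full stripe was seen), fold in any buffered tail bytes, and
/// apply the final avalanche mixing.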
pub fn final(self: *XxHash64) u64 {
var acc: u64 = undefined;
if (self.byte_count < 32) {
acc = self.seed +% prime_5;
} else {
acc = rotl(u64, self.acc1, 1) +% rotl(u64, self.acc2, 7) +%
rotl(u64, self.acc3, 12) +% rotl(u64, self.acc4, 18);
acc = mergeAccumulator(acc, self.acc1);
acc = mergeAccumulator(acc, self.acc2);
acc = mergeAccumulator(acc, self.acc3);
acc = mergeAccumulator(acc, self.acc4);
}
acc = acc +% @as(u64, self.byte_count) +% @as(u64, self.buf_len);
var pos: usize = 0;
while (pos + 8 <= self.buf_len) : (pos += 8) {
const lane = mem.readIntLittle(u64, self.buf[pos..][0..8]);
acc ^= round(0, lane);
acc = rotl(u64, acc, 27) *% prime_1;
acc +%= prime_4;
}
if (pos + 4 <= self.buf_len) {
const lane = @as(u64, mem.readIntLittle(u32, self.buf[pos..][0..4]));
acc ^= lane *% prime_1;
acc = rotl(u64, acc, 23) *% prime_2;
acc +%= prime_3;
pos += 4;
}
while (pos < self.buf_len) : (pos += 1) {
const lane = @as(u64, self.buf[pos]);
acc ^= lane *% prime_5;
acc = rotl(u64, acc, 11) *% prime_1;
}
acc ^= acc >> 33;
acc *%= prime_2;
acc ^= acc >> 29;
acc *%= prime_3;
acc ^= acc >> 32;
return acc;
}
inline fn mergeAccumulator(acc: u64, other: u64) u64 {
const a = acc ^ round(0, other);
const b = a *% prime_1;
return b +% prime_4;
}
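/// One-shot convenience wrapper: hash `input` with a zero seed.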
pub fn hash(input: []const u8) u64 {
var hasher = XxHash64.init(0);
hasher.update(input);
return hasher.final();
}
};
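/// Streaming implementation of the 32-bit variant of xxHash, processing
/// 16-byte stripes; the API mirrors `XxHash64`.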
pub const XxHash32 = struct {
acc1: u32,
acc2: u32,
acc3: u32,
acc4: u32,
seed: u32,
buf: [16]u8,
buf_len: usize,
byte_count: usize,
const prime_1 = 0x9E3779B1; // 0b10011110001101110111100110110001
const prime_2 = 0x85EBCA77; // 0b10000101111010111100101001110111
const prime_3 = 0xC2B2AE3D; // 0b11000010101100101010111000111101
const prime_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111
const prime_5 = 0x165667B1; // 0b00010110010101100110011110110001
pub fn init(seed: u32) XxHash32 {
return XxHash32{
.seed = seed,
.acc1 = seed +% prime_1 +% prime_2,
.acc2 = seed +% prime_2,
.acc3 = seed,
.acc4 = seed -% prime_1,
.buf = undefined,
.buf_len = 0,
.byte_count = 0,
};
}
pub fn update(self: *XxHash32, input: []const u8) void {
if (input.len < 16 - self.buf_len) {
mem.copy(u8, self.buf[self.buf_len..], input);
self.buf_len += input.len;
return;
}
var i: usize = 0;
if (self.buf_len > 0) {
i = 16 - self.buf_len;
mem.copy(u8, self.buf[self.buf_len..], input[0..i]);
self.processStripe(&self.buf);
self.buf_len = 0;
}
while (i + 16 <= input.len) : (i += 16) {
self.processStripe(input[i..][0..16]);
}
const remaining_bytes = input[i..];
mem.copy(u8, &self.buf, remaining_bytes);
self.buf_len = remaining_bytes.len;
}
inline fn processStripe(self: *XxHash32, buf: *const [16]u8) void {
self.acc1 = round(self.acc1, mem.readIntLittle(u32, buf[0..4]));
self.acc2 = round(self.acc2, mem.readIntLittle(u32, buf[4..8]));
self.acc3 = round(self.acc3, mem.readIntLittle(u32, buf[8..12]));
self.acc4 = round(self.acc4, mem.readIntLittle(u32, buf[12..16]));
self.byte_count += 16;
}
inline fn round(acc: u32, lane: u32) u32 {
const a = acc +% (lane *% prime_2);
const b = rotl(u32, a, 13);
return b *% prime_1;
}
pub fn final(self: *XxHash32) u32 {
var acc: u32 = undefined;
if (self.byte_count < 16) {
acc = self.seed +% prime_5;
} else {
acc = rotl(u32, self.acc1, 1) +% rotl(u32, self.acc2, 7) +%
rotl(u32, self.acc3, 12) +% rotl(u32, self.acc4, 18);
}
acc = acc +% @intCast(u32, self.byte_count) +% @intCast(u32, self.buf_len);
var pos: usize = 0;
while (pos + 4 <= self.buf_len) : (pos += 4) {
const lane = mem.readIntLittle(u32, self.buf[pos..][0..4]);
acc +%= lane *% prime_3;
acc = rotl(u32, acc, 17) *% prime_4;
}
while (pos < self.buf_len) : (pos += 1) {
const lane = @as(u32, self.buf[pos]);
acc +%= lane *% prime_5;
acc = rotl(u32, acc, 11) *% prime_1;
}
acc ^= acc >> 15;
acc *%= prime_2;
acc ^= acc >> 13;
acc *%= prime_3;
acc ^= acc >> 16;
return acc;
}
pub fn hash(input: []const u8) u32 {
var hasher = XxHash32.init(0);
hasher.update(input);
return hasher.final();
}
};
test "xxhash64" {
const hash = XxHash64.hash;
try expectEqual(hash(""), 0xef46db3751d8e999);
try expectEqual(hash("a"), 0xd24ec4f1a98c6e5b);
try expectEqual(hash("abc"), 0x44bc2cf5ad770999);
try expectEqual(hash("message digest"), 0x066ed728fceeb3be);
try expectEqual(hash("abcdefghijklmnopqrstuvwxyz"), 0xcfe1f278fa89835c);
try expectEqual(hash("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), 0xaaa46907d3047814);
try expectEqual(hash("12345678901234567890123456789012345678901234567890123456789012345678901234567890"), 0xe04a477f19ee145d);
}
test "xxhash32" {
const hash = XxHash32.hash;
try expectEqual(hash(""), 0x02cc5d05);
try expectEqual(hash("a"), 0x550d7456);
try expectEqual(hash("abc"), 0x32d153ff);
try expectEqual(hash("message digest"), 0x7c948494);
try expectEqual(hash("abcdefghijklmnopqrstuvwxyz"), 0x63a14d5f);
try expectEqual(hash("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), 0x9c285e64);
try expectEqual(hash("12345678901234567890123456789012345678901234567890123456789012345678901234567890"), 0x9c05f475);
}
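// A minimal streaming sketch (illustrative, not part of the upstream
// commit): hashing incrementally across `update` calls must agree with
// the one-shot `hash`.
test "xxhash64 streaming" {
var hasher = XxHash64.init(0);
hasher.update("message ");
hasher.update("digest");
try expectEqual(hasher.final(), XxHash64.hash("message digest"));
}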


@@ -31,6 +31,7 @@ pub const PackedIntSliceEndian = @import("packed_int_array.zig").PackedIntSliceE
pub const PriorityQueue = @import("priority_queue.zig").PriorityQueue;
pub const PriorityDequeue = @import("priority_dequeue.zig").PriorityDequeue;
pub const Progress = @import("Progress.zig");
pub const RingBuffer = @import("RingBuffer.zig");
pub const SegmentedList = @import("segmented_list.zig").SegmentedList;
pub const SemanticVersion = @import("SemanticVersion.zig");
pub const SinglyLinkedList = @import("linked_list.zig").SinglyLinkedList;