//! Default compression algorithm. Has two steps: tokenization and token
//! encoding.
//!
//! Tokenization takes an uncompressed input stream and produces a list of
//! tokens. Each token can be a literal (byte of data) or a match
//! (back-reference to previous data with length and distance). Tokenization
//! accumulates 32K tokens; when full, or when `flush` is called, the tokens
//! are passed to the `block_writer`. The level defines how hard (how slow)
//! the tokenizer tries to find a match.
//!
//! The block writer decides which type of deflate block to write (stored,
//! fixed, dynamic) and encodes the tokens to the output byte stream. The
//! client has to call `finish` to write the block with the final bit set.
//!
//! The container defines the type of header and footer, which can be gzip,
//! zlib or raw. They all share the same deflate body; raw has no header or
//! footer, just the deflate body.
//!
//! Compression algorithm explained in RFC 1951 (slightly edited for this case):
//!
//! The compressor uses a chained hash table `lookup` to find duplicated
//! strings, using a hash function that operates on 4-byte sequences. At any
//! given point during compression, let XYZW be the next 4 input bytes
//! (lookahead) to be examined (not necessarily all different, of course).
//! First, the compressor examines the hash chain for XYZW. If the chain is
//! empty, the compressor simply writes out X as a literal byte and advances
//! one byte in the input. If the hash chain is not empty, indicating that the
//! sequence XYZW (or, if we are unlucky, some other 4 bytes with the same
//! hash function value) has occurred recently, the compressor compares all
//! strings on the XYZW hash chain with the actual input data sequence
//! starting at the current point, and selects the longest match.
//!
//! To improve overall compression, the compressor defers the selection of
//! matches ("lazy matching"): after a match of length N has been found, the
//! compressor searches for a longer match starting at the next input byte. If
//! it finds a longer match, it truncates the previous match to a length of
//! one (thus producing a single literal byte) and then emits the longer
//! match. Otherwise, it emits the original match, and, as described above,
//! advances N bytes before continuing.
//!
//! Allocates statically ~400K (192K lookup, 128K tokens, 64K window).
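//!
//! A rough usage sketch, not a working example: `input`, `output`, and
//! `window_buffer` are stand-in names, the read loop is schematic, and parts
//! of this file are still stubbed out with TODO panics:
//!
//!     var c: Compress = .init(input, &window_buffer, .{ .container = .gzip });
//!     _ = try c.reader.streamRemaining(output); // drive the compressing reader
//!     try c.finish();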
const builtin = @import("builtin");
const std = @import("std");
const assert = std.debug.assert;
const testing = std.testing;
const expect = testing.expect;
const mem = std.mem;
const math = std.math;
const Writer = std.Io.Writer;
const Reader = std.Io.Reader;

const Compress = @This();
const Token = @import("Token.zig");
const BlockWriter = @import("BlockWriter.zig");
const flate = @import("../flate.zig");
const Container = flate.Container;
const Lookup = @import("Lookup.zig");
const huffman = flate.huffman;

lookup: Lookup = .{},
tokens: Tokens = .{},
/// Asserted to have a buffer capacity of at least `flate.max_window_len`.
input: *Reader,
block_writer: BlockWriter,
level: LevelArgs,
hasher: Container.Hasher,
reader: Reader,
// Streaming state and footer scratch space; both are driven by `init` and
// `stream` below.
state: union(enum) {
    /// Index into the container header of the next byte to write.
    header: usize,
    middle,
    final,
    ended,
    /// Index into `footer_buffer` of the next byte to write.
    footer: usize,
},
footer_buffer: [8]u8 = undefined,

// Match and literal at the previous position.
// Used for lazy match finding in processWindow.
prev_match: ?Token = null,
prev_literal: ?u8 = null,

/// Trades between speed and compression size.
/// Starts with level 4: in [zlib](https://github.com/madler/zlib/blob/abd3d1a28930f89375d4b41408b39f6c1be157b2/deflate.c#L115C1-L117C43)
/// levels 1-3 use a different algorithm that is faster but compresses less.
/// That is not implemented here.
pub const Level = enum(u4) {
    level_4 = 4,
    level_5 = 5,
    level_6 = 6,
    level_7 = 7,
    level_8 = 8,
    level_9 = 9,

    fast = 0xb,
    default = 0xc,
    best = 0xd,
};

/// Number of tokens to accumulate in deflate before starting block encoding.
///
/// In zlib this depends on memlevel: 6 + memlevel, where the default memlevel
/// is 8 and the max is 9; that gives 14 or 15 bits.
pub const n_tokens = 1 << 15;

/// Algorithm knobs for each level.
const LevelArgs = struct {
    good: u16, // Do fewer lookups if we already have a match of at least this length.
    nice: u16, // Stop looking for a better match if we found one of at least this length.
    lazy: u16, // Skip the lazy match search if we got a match of at least this length.
    chain: u16, // How many lookups for a previous match to perform.

    pub fn get(level: Level) LevelArgs {
        return switch (level) {
            .fast, .level_4 => .{ .good = 4, .lazy = 4, .nice = 16, .chain = 16 },
            .level_5 => .{ .good = 8, .lazy = 16, .nice = 32, .chain = 32 },
            .default, .level_6 => .{ .good = 8, .lazy = 16, .nice = 128, .chain = 128 },
            .level_7 => .{ .good = 8, .lazy = 32, .nice = 128, .chain = 256 },
            .level_8 => .{ .good = 32, .lazy = 128, .nice = 258, .chain = 1024 },
            .best, .level_9 => .{ .good = 32, .lazy = 258, .nice = 258, .chain = 4096 },
        };
    }
};

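// A small sanity check of the level table above: the named aliases share
// knobs with their numeric levels, and higher levels search longer chains.
test LevelArgs {
    try testing.expectEqual(LevelArgs.get(.level_6), LevelArgs.get(.default));
    try testing.expectEqual(LevelArgs.get(.level_4), LevelArgs.get(.fast));
    try testing.expectEqual(LevelArgs.get(.level_9), LevelArgs.get(.best));
    try expect(LevelArgs.get(.best).chain > LevelArgs.get(.fast).chain);
}
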
pub const Options = struct {
    level: Level = .default,
    container: Container = .raw,
};

pub fn init(input: *Reader, buffer: []u8, options: Options) Compress {
    return .{
        .input = input,
        .block_writer = undefined,
        .level = .get(options.level),
        .hasher = .init(options.container),
        .state = .{ .header = 0 },
        .reader = .{
            .buffer = buffer,
            .stream = stream,
        },
    };
}

const FlushOption = enum { none, flush, final };

/// Processes data in the window and creates tokens. If the token buffer is
/// full, flushes the tokens to the block writer.
///
/// Returns the number of bytes consumed from `lh`.
fn tokenizeSlice(c: *Compress, bw: *Writer, limit: std.Io.Limit, lh: []const u8) !usize {
    _ = bw;
    _ = limit;
    if (true) @panic("TODO");
    var step: u16 = 1; // 1 in the case of a literal, match length otherwise
    const pos: u16 = c.win.pos();
    const literal = lh[0]; // literal at current position
    const min_len: u16 = if (c.prev_match) |m| m.length() else 0;

    // Try to find a match at least min_len long.
    if (c.findMatch(pos, lh, min_len)) |match| {
        // Found a better match than the previous one.
        try c.addPrevLiteral();

        // Is the found match length good enough?
        if (match.length() >= c.level.lazy) {
            // Don't try to lazily find a better match, use this one.
            step = try c.addMatch(match);
        } else {
            // Store this match.
            c.prev_literal = literal;
            c.prev_match = match;
        }
    } else {
        // There is no better match at the current position than the previous
        // one. Write the previous match or literal.
        if (c.prev_match) |m| {
            // Write the match from the previous position.
            step = try c.addMatch(m) - 1; // we already advanced 1 from the previous position
        } else {
            // No match at the previous position.
            // Write the previous literal if any, and remember this literal.
            try c.addPrevLiteral();
            c.prev_literal = literal;
        }
    }
    // Advance the window and add hashes.
    c.windowAdvance(step, lh, pos);
}

fn windowAdvance(self: *Compress, step: u16, lh: []const u8, pos: u16) void {
    // The current position is already added in findMatch.
    self.lookup.bulkAdd(lh[1..], step - 1, pos + 1);
    self.win.advance(step);
}

// Add the previous literal (if any) to the tokens list.
fn addPrevLiteral(self: *Compress) !void {
    if (self.prev_literal) |l| try self.addToken(Token.initLiteral(l));
}

// Add a match to the tokens list, reset the prev pointers.
// Returns the length of the added match.
fn addMatch(self: *Compress, m: Token) !u16 {
    try self.addToken(m);
    self.prev_literal = null;
    self.prev_match = null;
    return m.length();
}

fn addToken(self: *Compress, token: Token) !void {
    self.tokens.add(token);
    if (self.tokens.full()) try self.flushTokens(.none);
}

// Finds the longest match in the history window with the data at the current pos.
fn findMatch(self: *Compress, pos: u16, lh: []const u8, min_len: u16) ?Token {
    var len: u16 = min_len;
    // Previous location with the same hash (same 4 bytes).
    var prev_pos = self.lookup.add(lh, pos);
    // Last found match.
    var match: ?Token = null;

    // How many back-references to try; performance knob.
    var chain: usize = self.level.chain;
    if (len >= self.level.good) {
        // If we've got a match that's good enough, only look in 1/4 of the chain.
        chain >>= 2;
    }

    // Hot path loop!
    while (prev_pos > 0 and chain > 0) : (chain -= 1) {
        const distance = pos - prev_pos;
        if (distance > flate.match.max_distance)
            break;

        const new_len = self.win.match(prev_pos, pos, len);
        if (new_len > len) {
            match = Token.initMatch(@intCast(distance), new_len);
            if (new_len >= self.level.nice) {
                // The match is good enough that we don't try to find a better one.
                return match;
            }
            len = new_len;
        }
        prev_pos = self.lookup.prev(prev_pos);
    }

    return match;
}

fn flushTokens(self: *Compress, flush_opt: FlushOption) !void {
    // Pass the tokens to the block writer.
    try self.block_writer.write(self.tokens.tokens(), flush_opt == .final, self.win.tokensBuffer());
    // A stored block ensures byte alignment.
    // It has 3 bits (final, block_type) and then padding until the byte boundary.
    // After that everything is aligned to the boundary in the stored block.
    // An empty stored block is 0b000 + (0-7) bits of padding + 0x00 0x00 0xFF 0xFF.
    // The last 4 bytes are byte aligned.
    if (flush_opt == .flush) {
        try self.block_writer.storedBlock("", false);
    }
    if (flush_opt != .none) {
        // Safe to call only when byte aligned, or when it is OK to add
        // padding bits (on the last byte of the final block).
        try self.block_writer.flush();
    }
    // Reset the internal tokens store.
    self.tokens.reset();
    // Notify the window that the tokens are flushed.
    self.win.flush();
}

// Slide the window and, if needed, the lookup tables.
fn slide(self: *Compress) void {
    const n = self.win.slide();
    self.lookup.slide(n);
}

/// Flushes internal buffers to the output writer. Outputs an empty stored
/// block to sync the bit stream to the byte boundary, so that the
/// decompressor can get all input data available so far.
///
/// It is useful mainly in compressed network protocols, to ensure that the
/// deflate bit stream can be used as a byte stream. May degrade
/// compression, so it should be used only when necessary.
///
/// Completes the current deflate block and follows it with an empty
/// stored block that is three zero bits plus filler bits to the next
/// byte, followed by four bytes (00 00 ff ff).
pub fn flush(c: *Compress) !void {
    try c.tokenize(.flush);
}

/// Completes the deflate bit stream by writing any pending data as the
/// final deflate block. HAS to be called once all data is written to
/// the compressor, as a signal that the next block has to have the
/// final bit set.
pub fn finish(c: *Compress) !void {
    _ = c;
    @panic("TODO");
}

/// Use another writer while preserving history. Most probably `flush`
/// should be called on the old writer before setting the new one.
pub fn setWriter(self: *Compress, new_writer: *Writer) void {
    self.block_writer.setWriter(new_writer);
    self.output = new_writer;
}

// Tokens store
const Tokens = struct {
    list: [n_tokens]Token = undefined,
    pos: usize = 0,

    fn add(self: *Tokens, t: Token) void {
        self.list[self.pos] = t;
        self.pos += 1;
    }

    fn full(self: *Tokens) bool {
        return self.pos == self.list.len;
    }

    fn reset(self: *Tokens) void {
        self.pos = 0;
    }

    fn tokens(self: *Tokens) []const Token {
        return self.list[0..self.pos];
    }
};

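// Minimal exercise of the Tokens accumulator above.
test Tokens {
    var toks: Tokens = .{};
    toks.add(Token.initLiteral('a'));
    try expect(!toks.full());
    try testing.expectEqual(@as(usize, 1), toks.tokens().len);
    toks.reset();
    try testing.expectEqual(@as(usize, 0), toks.tokens().len);
}
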
/// Creates huffman only deflate blocks. Disables Lempel-Ziv match searching and
/// only performs Huffman entropy encoding. Results in faster compression and
/// much lower memory requirements during compression, but bigger compressed sizes.
pub fn Huffman(comptime WriterType: type) type {
    return SimpleCompressor(.huffman, .raw, WriterType);
}

/// Creates stored blocks only. Data is not compressed, only packed into deflate
/// stored blocks. That adds up to 5 bytes of header for each block. The max
/// stored block size is 64K. A block is emitted when flush or finish is called.
pub const store = struct {
    pub fn Compressor(comptime container: Container, comptime WriterType: type) type {
        return SimpleCompressor(.store, container, WriterType);
    }

    pub fn compressor(comptime container: Container, writer: anytype) !store.Compressor(container, @TypeOf(writer)) {
        return try store.Compressor(container, @TypeOf(writer)).init(writer);
    }
};

const SimpleCompressorKind = enum {
    huffman,
    store,
};

fn simpleCompressor(
    comptime kind: SimpleCompressorKind,
    comptime container: Container,
    writer: anytype,
) !SimpleCompressor(kind, container, @TypeOf(writer)) {
    return try SimpleCompressor(kind, container, @TypeOf(writer)).init(writer);
}

fn SimpleCompressor(
    comptime kind: SimpleCompressorKind,
    comptime container: Container,
    comptime WriterType: type,
) type {
    const BlockWriterType = BlockWriter(WriterType);
    return struct {
        buffer: [65535]u8 = undefined, // because store blocks are limited to 65535 bytes
        wp: usize = 0,

        output: WriterType,
        block_writer: BlockWriterType,
        hasher: container.Hasher() = .{},

        const Self = @This();

        pub fn init(output: WriterType) !Self {
            const self = Self{
                .output = output,
                .block_writer = BlockWriterType.init(output),
            };
            try container.writeHeader(self.output);
            return self;
        }

        pub fn flush(self: *Self) !void {
            try self.flushBuffer(false);
            try self.block_writer.storedBlock("", false);
            try self.block_writer.flush();
        }

        pub fn finish(self: *Self) !void {
            try self.flushBuffer(true);
            try self.block_writer.flush();
            try container.writeFooter(&self.hasher, self.output);
        }

        fn flushBuffer(self: *Self, final: bool) !void {
            const buf = self.buffer[0..self.wp];
            switch (kind) {
                .huffman => try self.block_writer.huffmanBlock(buf, final),
                .store => try self.block_writer.storedBlock(buf, final),
            }
            self.wp = 0;
        }
    };
}

const LiteralNode = struct {
    literal: u16,
    freq: u16,
};

// Describes the state of the constructed tree for a given depth.
const LevelInfo = struct {
    // Our level, for better printing.
    level: u32,

    // The frequency of the last node at this level.
    last_freq: u32,

    // The frequency of the next character to add to this level.
    next_char_freq: u32,

    // The frequency of the next pair (from the level below) to add to this level.
    // Only valid if the "needed" value of the next lower level is 0.
    next_pair_freq: u32,

    // The number of chains remaining to generate for this level before moving
    // up to the next level.
    needed: u32,
};

// HuffCode is a Huffman code with a bit code and bit length.
pub const HuffCode = struct {
    code: u16 = 0,
    len: u16 = 0,

    // Sets the code and length of a HuffCode.
    fn set(self: *HuffCode, code: u16, length: u16) void {
        self.len = length;
        self.code = code;
    }
};

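// Tiny check of the HuffCode setter above.
test HuffCode {
    var hc: HuffCode = .{};
    hc.set(0b101, 3);
    try testing.expectEqual(@as(u16, 0b101), hc.code);
    try testing.expectEqual(@as(u16, 3), hc.len);
}
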
pub fn HuffmanEncoder(comptime size: usize) type {
    return struct {
        codes: [size]HuffCode = undefined,
        // Reusable buffer with the longest possible frequency table.
        freq_cache: [huffman.max_num_frequencies + 1]LiteralNode = undefined,
        bit_count: [17]u32 = undefined,
        lns: []LiteralNode = undefined, // sorted by literal, stored to avoid repeated allocation in generate
        lfs: []LiteralNode = undefined, // sorted by frequency, stored to avoid repeated allocation in generate

        const Self = @This();

        // Update this Huffman code object to be the minimum code for the specified frequency count.
        //
        // freq: An array of frequencies, in which freq[i] gives the frequency of literal i.
        // max_bits: The maximum number of bits to use for any literal.
        pub fn generate(self: *Self, freq: []u16, max_bits: u32) void {
            var list = self.freq_cache[0 .. freq.len + 1];
            // Number of non-zero literals.
            var count: u32 = 0;
            // Set list to be the set of all non-zero literals and their frequencies.
            for (freq, 0..) |f, i| {
                if (f != 0) {
                    list[count] = LiteralNode{ .literal = @as(u16, @intCast(i)), .freq = f };
                    count += 1;
                } else {
                    list[count] = LiteralNode{ .literal = 0x00, .freq = 0 };
                    self.codes[i].len = 0;
                }
            }
            list[freq.len] = LiteralNode{ .literal = 0x00, .freq = 0 };

            list = list[0..count];
            if (count <= 2) {
                // Handle the small cases here, because they are awkward for the general case code. With
                // two or fewer literals, everything has bit length 1.
                for (list, 0..) |node, i| {
                    // "list" is in order of increasing literal value.
                    self.codes[node.literal].set(@as(u16, @intCast(i)), 1);
                }
                return;
            }
            self.lfs = list;
            mem.sort(LiteralNode, self.lfs, {}, byFreq);

            // Get the number of literals for each bit count.
            const bit_count = self.bitCounts(list, max_bits);
            // And do the assignment.
            self.assignEncodingAndSize(bit_count, list);
        }

        pub fn bitLength(self: *Self, freq: []u16) u32 {
            var total: u32 = 0;
            for (freq, 0..) |f, i| {
                if (f != 0) {
                    total += @as(u32, @intCast(f)) * @as(u32, @intCast(self.codes[i].len));
                }
            }
            return total;
        }

        // Return the number of literals assigned to each bit size in the Huffman encoding.
        //
        // This method is only called when list.len >= 3.
        // The cases of 0, 1, and 2 literals are handled by special case code.
        //
        // list: An array of the literals with non-zero frequencies
        // and their associated frequencies. The array is in order of increasing
        // frequency, and has as its last element a special element with frequency
        // `math.maxInt(i32)`.
        //
        // max_bits: The maximum number of bits that should be used to encode any literal.
        // Must be less than 16.
        //
        // Returns an integer array in which array[i] indicates the number of literals
        // that should be encoded in i bits.
        fn bitCounts(self: *Self, list: []LiteralNode, max_bits_to_use: usize) []u32 {
            var max_bits = max_bits_to_use;
            const n = list.len;
            const max_bits_limit = 16;

            assert(max_bits < max_bits_limit);

            // The tree can't have greater depth than n - 1, no matter what. This
            // saves a little bit of work in some small cases.
            max_bits = @min(max_bits, n - 1);

            // Create information about each of the levels.
            // A bogus "Level 0" whose sole purpose is so that
            // level1.prev.needed == 0. This makes level1.next_pair_freq
            // be a legitimate value that never gets chosen.
            var levels: [max_bits_limit]LevelInfo = mem.zeroes([max_bits_limit]LevelInfo);
            // leaf_counts[i] counts the number of literals at the left
            // of ancestors of the rightmost node at level i.
            // leaf_counts[i][j] is the number of literals at the left
            // of the level j ancestor.
            var leaf_counts: [max_bits_limit][max_bits_limit]u32 = mem.zeroes([max_bits_limit][max_bits_limit]u32);

            {
                var level = @as(u32, 1);
                while (level <= max_bits) : (level += 1) {
                    // For every level, the first two items are the first two characters.
                    // We initialize the levels as if we had already figured this out.
                    levels[level] = LevelInfo{
                        .level = level,
                        .last_freq = list[1].freq,
                        .next_char_freq = list[2].freq,
                        .next_pair_freq = list[0].freq + list[1].freq,
                        .needed = 0,
                    };
                    leaf_counts[level][level] = 2;
                    if (level == 1) {
                        levels[level].next_pair_freq = math.maxInt(i32);
                    }
                }
            }

            // We need a total of 2*n - 2 items at the top level and have already generated 2.
            levels[max_bits].needed = 2 * @as(u32, @intCast(n)) - 4;

            {
                var level = max_bits;
                while (true) {
                    const l = &levels[level];
                    if (l.next_pair_freq == math.maxInt(i32) and l.next_char_freq == math.maxInt(i32)) {
                        // We've run out of both leaves and pairs.
                        // End all calculations for this level.
                        // To make sure we never come back to this level or any lower level,
                        // set next_pair_freq impossibly large.
                        l.needed = 0;
                        levels[level + 1].next_pair_freq = math.maxInt(i32);
                        level += 1;
                        continue;
                    }

                    const prev_freq = l.last_freq;
                    if (l.next_char_freq < l.next_pair_freq) {
                        // The next item on this row is a leaf node.
                        const next = leaf_counts[level][level] + 1;
                        l.last_freq = l.next_char_freq;
                        // Lower leaf_counts are the same as the previous node's.
                        leaf_counts[level][level] = next;
                        if (next >= list.len) {
                            l.next_char_freq = maxNode().freq;
                        } else {
                            l.next_char_freq = list[next].freq;
                        }
                    } else {
                        // The next item on this row is a pair from the previous row.
                        // next_pair_freq isn't valid until we generate two
                        // more values in the level below.
                        l.last_freq = l.next_pair_freq;
                        // Take leaf counts from the lower level, except counts[level] remains the same.
                        @memcpy(leaf_counts[level][0..level], leaf_counts[level - 1][0..level]);
                        levels[l.level - 1].needed = 2;
                    }

                    l.needed -= 1;
                    if (l.needed == 0) {
                        // We've done everything we need to do for this level.
                        // Continue calculating one level up. Fill in next_pair_freq
                        // of that level with the sum of the two nodes we've just calculated on
                        // this level.
                        if (l.level == max_bits) {
                            // All done!
                            break;
                        }
                        levels[l.level + 1].next_pair_freq = prev_freq + l.last_freq;
                        level += 1;
                    } else {
                        // If we stole from below, move down temporarily to replenish it.
                        while (levels[level - 1].needed > 0) {
                            level -= 1;
                            if (level == 0) {
                                break;
                            }
                        }
                    }
                }
            }

            // Something is wrong if, at the end, the top level hasn't used
            // all of the leaves.
            assert(leaf_counts[max_bits][max_bits] == n);

            const bit_count = self.bit_count[0 .. max_bits + 1];
            var bits: u32 = 1;
            const counts = &leaf_counts[max_bits];
            {
                var level = max_bits;
                while (level > 0) : (level -= 1) {
                    // counts[level] gives the number of literals requiring at least "bits"
                    // bits to encode.
                    bit_count[bits] = counts[level] - counts[level - 1];
                    bits += 1;
                    if (level == 0) {
                        break;
                    }
                }
            }
            return bit_count;
        }

        // Look at the leaves and assign them a bit count and an encoding as specified
        // in RFC 1951 3.2.2.
        fn assignEncodingAndSize(self: *Self, bit_count: []u32, list_arg: []LiteralNode) void {
            var code = @as(u16, 0);
            var list = list_arg;

            for (bit_count, 0..) |bits, n| {
                code <<= 1;
                if (n == 0 or bits == 0) {
                    continue;
                }
                // The literals list[list.len - bits] .. list[list.len - 1]
                // are encoded using "bits" bits, and get the values
                // code, code + 1, .... The code values are
                // assigned in literal order (not frequency order).
                const chunk = list[list.len - @as(u32, @intCast(bits)) ..];

                self.lns = chunk;
                mem.sort(LiteralNode, self.lns, {}, byLiteral);

                for (chunk) |node| {
                    self.codes[node.literal] = HuffCode{
                        .code = bitReverse(u16, code, @as(u5, @intCast(n))),
                        .len = @as(u16, @intCast(n)),
                    };
                    code += 1;
                }
                list = list[0 .. list.len - @as(u32, @intCast(bits))];
            }
        }
    };
}

fn maxNode() LiteralNode {
    return LiteralNode{
        .literal = math.maxInt(u16),
        .freq = math.maxInt(u16),
    };
}

pub fn huffmanEncoder(comptime size: u32) HuffmanEncoder(size) {
    return .{};
}

pub const LiteralEncoder = HuffmanEncoder(huffman.max_num_frequencies);
pub const DistanceEncoder = HuffmanEncoder(huffman.distance_code_count);
pub const CodegenEncoder = HuffmanEncoder(19);

// Generates a HuffmanCode corresponding to the fixed literal table.
pub fn fixedLiteralEncoder() LiteralEncoder {
    var h: LiteralEncoder = undefined;
    var ch: u16 = 0;

    while (ch < huffman.max_num_frequencies) : (ch += 1) {
        var bits: u16 = undefined;
        var size: u16 = undefined;
        switch (ch) {
            0...143 => {
                // size 8, 00110000 .. 10111111
                bits = ch + 48;
                size = 8;
            },
            144...255 => {
                // size 9, 110010000 .. 111111111
                bits = ch + 400 - 144;
                size = 9;
            },
            256...279 => {
                // size 7, 0000000 .. 0010111
                bits = ch - 256;
                size = 7;
            },
            else => {
                // size 8, 11000000 .. 11000111
                bits = ch + 192 - 280;
                size = 8;
            },
        }
        h.codes[ch] = HuffCode{ .code = bitReverse(u16, bits, @as(u5, @intCast(size))), .len = size };
    }
    return h;
}

pub fn fixedDistanceEncoder() DistanceEncoder {
    var h: DistanceEncoder = undefined;
    for (h.codes, 0..) |_, ch| {
        h.codes[ch] = HuffCode{ .code = bitReverse(u16, @as(u16, @intCast(ch)), 5), .len = 5 };
    }
    return h;
}

pub fn huffmanDistanceEncoder() DistanceEncoder {
    var distance_freq = [1]u16{0} ** huffman.distance_code_count;
    distance_freq[0] = 1;
    // huff_distance is a static distance encoder used for huffman only encoding.
    // It can be reused since we will not be encoding distance values.
    var h: DistanceEncoder = .{};
    h.generate(distance_freq[0..], 15);
    return h;
}

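// With a single used distance symbol, `generate` takes the `count <= 2`
// fast path above and assigns it a 1-bit, all-zero code.
test huffmanDistanceEncoder {
    const h = huffmanDistanceEncoder();
    try testing.expectEqual(@as(u16, 1), h.codes[0].len);
    try testing.expectEqual(@as(u16, 0), h.codes[0].code);
}
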
fn byLiteral(context: void, a: LiteralNode, b: LiteralNode) bool {
    _ = context;
    return a.literal < b.literal;
}

fn byFreq(context: void, a: LiteralNode, b: LiteralNode) bool {
    _ = context;
    if (a.freq == b.freq) {
        return a.literal < b.literal;
    }
    return a.freq < b.freq;
}

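// The frequency comparator sorts by frequency first and breaks ties by
// literal value, which keeps `generate` deterministic.
test byFreq {
    const a: LiteralNode = .{ .literal = 3, .freq = 2 };
    const b: LiteralNode = .{ .literal = 7, .freq = 2 };
    const c: LiteralNode = .{ .literal = 1, .freq = 9 };
    try expect(byFreq({}, a, b)); // equal frequencies: lower literal first
    try expect(byFreq({}, a, c)); // lower frequency first
    try expect(!byFreq({}, c, a));
}
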
fn stream(r: *Reader, w: *Writer, limit: std.Io.Limit) Reader.StreamError!usize {
    const c: *Compress = @fieldParentPtr("reader", r);
    switch (c.state) {
        .header => |i| {
            const header = c.hasher.container().header();
            const n = try w.write(header[i..]);
            if (header.len - i - n == 0) {
                c.state = .middle;
            } else {
                c.state.header += n;
            }
            return n;
        },
        .middle => {
            c.input.fillMore() catch |err| switch (err) {
                error.EndOfStream => {
                    c.state = .final;
                    return 0;
                },
                else => |e| return e,
            };
            const buffer_contents = c.input.buffered();
            const min_lookahead = flate.match.min_length + flate.match.max_length;
            const history_plus_lookahead_len = flate.history_len + min_lookahead;
            if (buffer_contents.len < history_plus_lookahead_len) return 0;
            const lookahead = buffer_contents[flate.history_len..];
            const start = w.count;
            const n = c.tokenizeSlice(w, limit, lookahead) catch |err| switch (err) {
                error.WriteFailed => return error.WriteFailed,
            };
            c.hasher.update(lookahead[0..n]);
            c.input.toss(n);
            return w.count - start;
        },
        .final => {
            const buffer_contents = c.input.buffered();
            const start = w.count;
            const n = c.tokenizeSlice(w, limit, buffer_contents) catch |err| switch (err) {
                error.WriteFailed => return error.WriteFailed,
            };
            if (buffer_contents.len - n == 0) {
                c.hasher.update(buffer_contents);
                c.input.tossAll();
                {
                    // In the case of flushing, the last few lookahead buffers were
                    // smaller than the min match length, so only the last literal
                    // can be unwritten.
                    assert(c.prev_match == null);
                    try c.addPrevLiteral();
                    c.prev_literal = null;

                    try c.flushTokens(.final);
                }
                switch (c.hasher) {
                    .gzip => |*gzip| {
                        // GZIP 8 bytes footer
                        // - 4 bytes, CRC32 (CRC-32)
                        // - 4 bytes, ISIZE (Input SIZE) - size of the original (uncompressed) input data modulo 2^32
                        comptime assert(c.footer_buffer.len == 8);
                        std.mem.writeInt(u32, c.footer_buffer[0..4], gzip.final(), .little);
                        std.mem.writeInt(u32, c.footer_buffer[4..8], gzip.bytes_read, .little);
                        c.state = .{ .footer = 0 };
                    },
                    .zlib => |*zlib| {
                        // ZLIB (RFC 1950) is big-endian, unlike GZIP (RFC 1952).
                        // 4 bytes of ADLER32 (Adler-32 checksum).
                        // Checksum value of the uncompressed data (excluding any
                        // dictionary data) computed according to the Adler-32
                        // algorithm.
                        comptime assert(c.footer_buffer.len == 8);
                        std.mem.writeInt(u32, c.footer_buffer[4..8], zlib.final(), .big);
                        c.state = .{ .footer = 4 };
                    },
                    .raw => {
                        c.state = .ended;
                    },
                }
            }
            return w.count - start;
        },
        .ended => return error.EndOfStream,
        .footer => |i| {
            const remaining = c.footer_buffer[i..];
            const n = try w.write(limit.slice(remaining));
            c.state = if (n == remaining.len) .ended else .{ .footer = i + n };
            return n;
        },
    }
}

test "generate a Huffman code from an array of frequencies" {
|
|
var freqs: [19]u16 = [_]u16{
|
|
8, // 0
|
|
1, // 1
|
|
1, // 2
|
|
2, // 3
|
|
5, // 4
|
|
10, // 5
|
|
9, // 6
|
|
1, // 7
|
|
0, // 8
|
|
0, // 9
|
|
0, // 10
|
|
0, // 11
|
|
0, // 12
|
|
0, // 13
|
|
0, // 14
|
|
0, // 15
|
|
1, // 16
|
|
3, // 17
|
|
5, // 18
|
|
};
|
|
|
|
var enc = huffmanEncoder(19);
|
|
enc.generate(freqs[0..], 7);
|
|
|
|
try testing.expectEqual(@as(u32, 141), enc.bitLength(freqs[0..]));
|
|
|
|
try testing.expectEqual(@as(usize, 3), enc.codes[0].len);
|
|
try testing.expectEqual(@as(usize, 6), enc.codes[1].len);
|
|
try testing.expectEqual(@as(usize, 6), enc.codes[2].len);
|
|
try testing.expectEqual(@as(usize, 5), enc.codes[3].len);
|
|
try testing.expectEqual(@as(usize, 3), enc.codes[4].len);
|
|
try testing.expectEqual(@as(usize, 2), enc.codes[5].len);
|
|
try testing.expectEqual(@as(usize, 2), enc.codes[6].len);
|
|
try testing.expectEqual(@as(usize, 6), enc.codes[7].len);
|
|
try testing.expectEqual(@as(usize, 0), enc.codes[8].len);
|
|
try testing.expectEqual(@as(usize, 0), enc.codes[9].len);
|
|
try testing.expectEqual(@as(usize, 0), enc.codes[10].len);
|
|
try testing.expectEqual(@as(usize, 0), enc.codes[11].len);
|
|
try testing.expectEqual(@as(usize, 0), enc.codes[12].len);
|
|
try testing.expectEqual(@as(usize, 0), enc.codes[13].len);
|
|
try testing.expectEqual(@as(usize, 0), enc.codes[14].len);
|
|
try testing.expectEqual(@as(usize, 0), enc.codes[15].len);
|
|
try testing.expectEqual(@as(usize, 6), enc.codes[16].len);
|
|
try testing.expectEqual(@as(usize, 5), enc.codes[17].len);
|
|
try testing.expectEqual(@as(usize, 3), enc.codes[18].len);
|
|
|
|
try testing.expectEqual(@as(u16, 0x0), enc.codes[5].code);
|
|
try testing.expectEqual(@as(u16, 0x2), enc.codes[6].code);
|
|
try testing.expectEqual(@as(u16, 0x1), enc.codes[0].code);
|
|
try testing.expectEqual(@as(u16, 0x5), enc.codes[4].code);
|
|
try testing.expectEqual(@as(u16, 0x3), enc.codes[18].code);
|
|
try testing.expectEqual(@as(u16, 0x7), enc.codes[3].code);
|
|
try testing.expectEqual(@as(u16, 0x17), enc.codes[17].code);
|
|
try testing.expectEqual(@as(u16, 0x0f), enc.codes[1].code);
|
|
try testing.expectEqual(@as(u16, 0x2f), enc.codes[2].code);
|
|
try testing.expectEqual(@as(u16, 0x1f), enc.codes[7].code);
|
|
try testing.expectEqual(@as(u16, 0x3f), enc.codes[16].code);
|
|
}
|
|
|
|
test "generate a Huffman code for the fixed literal table specific to Deflate" {
|
|
const enc = fixedLiteralEncoder();
|
|
for (enc.codes) |c| {
|
|
switch (c.len) {
|
|
7 => {
|
|
const v = @bitReverse(@as(u7, @intCast(c.code)));
|
|
try testing.expect(v <= 0b0010111);
|
|
},
|
|
8 => {
|
|
const v = @bitReverse(@as(u8, @intCast(c.code)));
|
|
try testing.expect((v >= 0b000110000 and v <= 0b10111111) or
|
|
(v >= 0b11000000 and v <= 11000111));
|
|
},
|
|
9 => {
|
|
const v = @bitReverse(@as(u9, @intCast(c.code)));
|
|
try testing.expect(v >= 0b110010000 and v <= 0b111111111);
|
|
},
|
|
else => unreachable,
|
|
}
|
|
}
|
|
}
|
|
|
|
test "generate a Huffman code for the 30 possible relative distances (LZ77 distances) of Deflate" {
|
|
const enc = fixedDistanceEncoder();
|
|
for (enc.codes) |c| {
|
|
const v = @bitReverse(@as(u5, @intCast(c.code)));
|
|
try testing.expect(v <= 29);
|
|
try testing.expect(c.len == 5);
|
|
}
|
|
}
|
|
|
|
// Reverse, bit by bit, an n-bit code.
fn bitReverse(comptime T: type, value: T, n: usize) T {
    const r = @bitReverse(value);
    return r >> @as(math.Log2Int(T), @intCast(@typeInfo(T).int.bits - n));
}

test bitReverse {
    const ReverseBitsTest = struct {
        in: u16,
        bit_count: u5,
        out: u16,
    };

    const reverse_bits_tests = [_]ReverseBitsTest{
        .{ .in = 1, .bit_count = 1, .out = 1 },
        .{ .in = 1, .bit_count = 2, .out = 2 },
        .{ .in = 1, .bit_count = 3, .out = 4 },
        .{ .in = 1, .bit_count = 4, .out = 8 },
        .{ .in = 1, .bit_count = 5, .out = 16 },
        .{ .in = 17, .bit_count = 5, .out = 17 },
        .{ .in = 257, .bit_count = 9, .out = 257 },
        .{ .in = 29, .bit_count = 5, .out = 23 },
    };

    for (reverse_bits_tests) |h| {
        const v = bitReverse(u16, h.in, h.bit_count);
        try std.testing.expectEqual(h.out, v);
    }
}

test "fixedLiteralEncoder codes" {
|
|
var al = std.ArrayList(u8).init(testing.allocator);
|
|
defer al.deinit();
|
|
var bw = std.Io.bitWriter(.little, al.writer());
|
|
|
|
const f = fixedLiteralEncoder();
|
|
for (f.codes) |c| {
|
|
try bw.writeBits(c.code, c.len);
|
|
}
|
|
try testing.expectEqualSlices(u8, &fixed_codes, al.items);
|
|
}
|
|
|
|
pub const fixed_codes = [_]u8{
    0b00001100, 0b10001100, 0b01001100, 0b11001100, 0b00101100, 0b10101100, 0b01101100, 0b11101100,
    0b00011100, 0b10011100, 0b01011100, 0b11011100, 0b00111100, 0b10111100, 0b01111100, 0b11111100,
    0b00000010, 0b10000010, 0b01000010, 0b11000010, 0b00100010, 0b10100010, 0b01100010, 0b11100010,
    0b00010010, 0b10010010, 0b01010010, 0b11010010, 0b00110010, 0b10110010, 0b01110010, 0b11110010,
    0b00001010, 0b10001010, 0b01001010, 0b11001010, 0b00101010, 0b10101010, 0b01101010, 0b11101010,
    0b00011010, 0b10011010, 0b01011010, 0b11011010, 0b00111010, 0b10111010, 0b01111010, 0b11111010,
    0b00000110, 0b10000110, 0b01000110, 0b11000110, 0b00100110, 0b10100110, 0b01100110, 0b11100110,
    0b00010110, 0b10010110, 0b01010110, 0b11010110, 0b00110110, 0b10110110, 0b01110110, 0b11110110,
    0b00001110, 0b10001110, 0b01001110, 0b11001110, 0b00101110, 0b10101110, 0b01101110, 0b11101110,
    0b00011110, 0b10011110, 0b01011110, 0b11011110, 0b00111110, 0b10111110, 0b01111110, 0b11111110,
    0b00000001, 0b10000001, 0b01000001, 0b11000001, 0b00100001, 0b10100001, 0b01100001, 0b11100001,
    0b00010001, 0b10010001, 0b01010001, 0b11010001, 0b00110001, 0b10110001, 0b01110001, 0b11110001,
    0b00001001, 0b10001001, 0b01001001, 0b11001001, 0b00101001, 0b10101001, 0b01101001, 0b11101001,
    0b00011001, 0b10011001, 0b01011001, 0b11011001, 0b00111001, 0b10111001, 0b01111001, 0b11111001,
    0b00000101, 0b10000101, 0b01000101, 0b11000101, 0b00100101, 0b10100101, 0b01100101, 0b11100101,
    0b00010101, 0b10010101, 0b01010101, 0b11010101, 0b00110101, 0b10110101, 0b01110101, 0b11110101,
    0b00001101, 0b10001101, 0b01001101, 0b11001101, 0b00101101, 0b10101101, 0b01101101, 0b11101101,
    0b00011101, 0b10011101, 0b01011101, 0b11011101, 0b00111101, 0b10111101, 0b01111101, 0b11111101,
    0b00010011, 0b00100110, 0b01001110, 0b10011010, 0b00111100, 0b01100101, 0b11101010, 0b10110100,
    0b11101001, 0b00110011, 0b01100110, 0b11001110, 0b10011010, 0b00111101, 0b01100111, 0b11101110,
    0b10111100, 0b11111001, 0b00001011, 0b00010110, 0b00101110, 0b01011010, 0b10111100, 0b01100100,
    0b11101001, 0b10110010, 0b11100101, 0b00101011, 0b01010110, 0b10101110, 0b01011010, 0b10111101,
    0b01100110, 0b11101101, 0b10111010, 0b11110101, 0b00011011, 0b00110110, 0b01101110, 0b11011010,
    0b10111100, 0b01100101, 0b11101011, 0b10110110, 0b11101101, 0b00111011, 0b01110110, 0b11101110,
    0b11011010, 0b10111101, 0b01100111, 0b11101111, 0b10111110, 0b11111101, 0b00000111, 0b00001110,
    0b00011110, 0b00111010, 0b01111100, 0b11100100, 0b11101000, 0b10110001, 0b11100011, 0b00100111,
    0b01001110, 0b10011110, 0b00111010, 0b01111101, 0b11100110, 0b11101100, 0b10111001, 0b11110011,
    0b00010111, 0b00101110, 0b01011110, 0b10111010, 0b01111100, 0b11100101, 0b11101010, 0b10110101,
    0b11101011, 0b00110111, 0b01101110, 0b11011110, 0b10111010, 0b01111101, 0b11100111, 0b11101110,
    0b10111101, 0b11111011, 0b00001111, 0b00011110, 0b00111110, 0b01111010, 0b11111100, 0b11100100,
    0b11101001, 0b10110011, 0b11100111, 0b00101111, 0b01011110, 0b10111110, 0b01111010, 0b11111101,
    0b11100110, 0b11101101, 0b10111011, 0b11110111, 0b00011111, 0b00111110, 0b01111110, 0b11111010,
    0b11111100, 0b11100101, 0b11101011, 0b10110111, 0b11101111, 0b00111111, 0b01111110, 0b11111110,
    0b11111010, 0b11111101, 0b11100111, 0b11101111, 0b10111111, 0b11111111, 0b00000000, 0b00100000,
    0b00001000, 0b00001100, 0b10000001, 0b11000010, 0b11100000, 0b00001000, 0b00100100, 0b00001010,
    0b10001101, 0b11000001, 0b11100010, 0b11110000, 0b00000100, 0b00100010, 0b10001001, 0b01001100,
    0b10100001, 0b11010010, 0b11101000, 0b00000011, 0b10000011, 0b01000011, 0b11000011, 0b00100011,
    0b10100011,
};

test "tokenization" {
|
|
const L = Token.initLiteral;
|
|
const M = Token.initMatch;
|
|
|
|
const cases = [_]struct {
|
|
data: []const u8,
|
|
tokens: []const Token,
|
|
}{
|
|
.{
|
|
.data = "Blah blah blah blah blah!",
|
|
.tokens = &[_]Token{ L('B'), L('l'), L('a'), L('h'), L(' '), L('b'), M(5, 18), L('!') },
|
|
},
|
|
.{
|
|
.data = "ABCDEABCD ABCDEABCD",
|
|
.tokens = &[_]Token{
|
|
L('A'), L('B'), L('C'), L('D'), L('E'), L('A'), L('B'), L('C'), L('D'), L(' '),
|
|
L('A'), M(10, 8),
|
|
},
|
|
},
|
|
};
|
|
|
|
for (cases) |c| {
|
|
inline for (Container.list) |container| { // for each wrapping
|
|
|
|
var cw = std.Io.countingWriter(std.Io.null_writer);
|
|
const cww = cw.writer();
|
|
var df = try Compress(container, @TypeOf(cww), TestTokenWriter).init(cww, .{});
|
|
|
|
_ = try df.write(c.data);
|
|
try df.flush();
|
|
|
|
// df.token_writer.show();
|
|
try expect(df.block_writer.pos == c.tokens.len); // number of tokens written
|
|
try testing.expectEqualSlices(Token, df.block_writer.get(), c.tokens); // tokens match
|
|
|
|
try testing.expectEqual(container.headerSize(), cw.bytes_written);
|
|
try df.finish();
|
|
try testing.expectEqual(container.size(), cw.bytes_written);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Tests that the tokens written are equal to the expected token list.
const TestTokenWriter = struct {
    const Self = @This();

    pos: usize = 0,
    actual: [128]Token = undefined,

    pub fn init(_: anytype) Self {
        return .{};
    }

    pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void {
        for (tokens) |t| {
            self.actual[self.pos] = t;
            self.pos += 1;
        }
    }

    pub fn storedBlock(_: *Self, _: []const u8, _: bool) !void {}

    pub fn get(self: *Self) []Token {
        return self.actual[0..self.pos];
    }

    pub fn show(self: *Self) void {
        std.debug.print("\n", .{});
        for (self.get()) |t| {
            t.show();
        }
    }

    pub fn flush(_: *Self) !void {}
};

test "file tokenization" {
|
|
const levels = [_]Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 };
|
|
const cases = [_]struct {
|
|
data: []const u8, // uncompressed content
|
|
// expected number of tokens producet in deflate tokenization
|
|
tokens_count: [levels.len]usize = .{0} ** levels.len,
|
|
}{
|
|
.{
|
|
.data = @embedFile("testdata/rfc1951.txt"),
|
|
.tokens_count = .{ 7675, 7672, 7599, 7594, 7598, 7599 },
|
|
},
|
|
|
|
.{
|
|
.data = @embedFile("testdata/block_writer/huffman-null-max.input"),
|
|
.tokens_count = .{ 257, 257, 257, 257, 257, 257 },
|
|
},
|
|
.{
|
|
.data = @embedFile("testdata/block_writer/huffman-pi.input"),
|
|
.tokens_count = .{ 2570, 2564, 2564, 2564, 2564, 2564 },
|
|
},
|
|
.{
|
|
.data = @embedFile("testdata/block_writer/huffman-text.input"),
|
|
.tokens_count = .{ 235, 234, 234, 234, 234, 234 },
|
|
},
|
|
.{
|
|
.data = @embedFile("testdata/fuzz/roundtrip1.input"),
|
|
.tokens_count = .{ 333, 331, 331, 331, 331, 331 },
|
|
},
|
|
.{
|
|
.data = @embedFile("testdata/fuzz/roundtrip2.input"),
|
|
.tokens_count = .{ 334, 334, 334, 334, 334, 334 },
|
|
},
|
|
};
|
|
|
|
for (cases) |case| { // for each case
|
|
const data = case.data;
|
|
|
|
for (levels, 0..) |level, i| { // for each compression level
|
|
var original: Reader = .fixed(data);
|
|
|
|
// buffer for decompressed data
|
|
var al = std.ArrayList(u8).init(testing.allocator);
|
|
defer al.deinit();
|
|
const writer = al.writer();
|
|
|
|
// create compressor
|
|
const WriterType = @TypeOf(writer);
|
|
const TokenWriter = TokenDecoder(@TypeOf(writer));
|
|
var cmp = try Compress(.raw, WriterType, TokenWriter).init(writer, .{ .level = level });
|
|
|
|
// Stream uncompressed `original` data to the compressor. It will
|
|
// produce tokens list and pass that list to the TokenDecoder. This
|
|
// TokenDecoder uses CircularBuffer from inflate to convert list of
|
|
// tokens back to the uncompressed stream.
|
|
try cmp.compress(original.reader());
|
|
try cmp.flush();
|
|
const expected_count = case.tokens_count[i];
|
|
const actual = cmp.block_writer.tokens_count;
|
|
if (expected_count == 0) {
|
|
std.debug.print("actual token count {d}\n", .{actual});
|
|
} else {
|
|
try testing.expectEqual(expected_count, actual);
|
|
}
|
|
|
|
try testing.expectEqual(data.len, al.items.len);
|
|
try testing.expectEqualSlices(u8, data, al.items);
|
|
}
|
|
}
|
|
}
|
|
|
|
const TokenDecoder = struct {
    output: *Writer,
    tokens_count: usize,

    pub fn init(output: *Writer) TokenDecoder {
        return .{
            .output = output,
            .tokens_count = 0,
        };
    }

    pub fn write(self: *TokenDecoder, tokens: []const Token, _: bool, _: ?[]const u8) !void {
        self.tokens_count += tokens.len;
        for (tokens) |t| {
            switch (t.kind) {
                .literal => self.hist.write(t.literal()),
                .match => try self.hist.writeMatch(t.length(), t.distance()),
            }
            if (self.hist.free() < 285) try self.flushWin();
        }
        try self.flushWin();
    }

    fn flushWin(self: *TokenDecoder) !void {
        while (true) {
            const buf = self.hist.read();
            if (buf.len == 0) break;
            try self.output.writeAll(buf);
        }
    }
};

test "store simple compressor" {
|
|
const data = "Hello world!";
|
|
const expected = [_]u8{
|
|
0x1, // block type 0, final bit set
|
|
0xc, 0x0, // len = 12
|
|
0xf3, 0xff, // ~len
|
|
'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', //
|
|
//0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21,
|
|
};
|
|
|
|
var fbs: Reader = .fixed(data);
|
|
var al = std.ArrayList(u8).init(testing.allocator);
|
|
defer al.deinit();
|
|
|
|
var cmp = try store.compressor(.raw, al.writer());
|
|
try cmp.compress(&fbs);
|
|
try cmp.finish();
|
|
try testing.expectEqualSlices(u8, &expected, al.items);
|
|
|
|
fbs = .fixed(data);
|
|
try al.resize(0);
|
|
|
|
// huffman only compresoor will also emit store block for this small sample
|
|
var hc = try huffman.compressor(.raw, al.writer());
|
|
try hc.compress(&fbs);
|
|
try hc.finish();
|
|
try testing.expectEqualSlices(u8, &expected, al.items);
|
|
}
|
|
|
|
test "sliding window match" {
|
|
const data = "Blah blah blah blah blah!";
|
|
var win: Writer = .{};
|
|
try expect(win.write(data) == data.len);
|
|
try expect(win.wp == data.len);
|
|
try expect(win.rp == 0);
|
|
|
|
// length between l symbols
|
|
try expect(win.match(1, 6, 0) == 18);
|
|
try expect(win.match(1, 11, 0) == 13);
|
|
try expect(win.match(1, 16, 0) == 8);
|
|
try expect(win.match(1, 21, 0) == 0);
|
|
|
|
// position 15 = "blah blah!"
|
|
// position 20 = "blah!"
|
|
try expect(win.match(15, 20, 0) == 4);
|
|
try expect(win.match(15, 20, 3) == 4);
|
|
try expect(win.match(15, 20, 4) == 0);
|
|
}
|
|
|
|
test "sliding window slide" {
|
|
var win: Writer = .{};
|
|
win.wp = Writer.buffer_len - 11;
|
|
win.rp = Writer.buffer_len - 111;
|
|
win.buffer[win.rp] = 0xab;
|
|
try expect(win.lookahead().len == 100);
|
|
try expect(win.tokensBuffer().?.len == win.rp);
|
|
|
|
const n = win.slide();
|
|
try expect(n == 32757);
|
|
try expect(win.buffer[win.rp] == 0xab);
|
|
try expect(win.rp == Writer.hist_len - 111);
|
|
try expect(win.wp == Writer.hist_len - 11);
|
|
try expect(win.lookahead().len == 100);
|
|
try expect(win.tokensBuffer() == null);
|
|
}
|