zig/lib/std/compress/flate/Compress.zig

//! Default compression algorithm. Has two steps: tokenization and token
//! encoding.
//!
//! Tokenization takes an uncompressed input stream and produces a list of
//! tokens. Each token is either a literal (a byte of data) or a match (a
//! backreference to previous data with a length and a distance). Tokenization
//! accumulates 32K tokens; when the list is full or `flush` is called, the
//! tokens are passed to the `block_writer`. Level defines how hard (and thus
//! how slow) it tries to find a match.
//!
//! The block writer decides which type of deflate block to write (stored,
//! fixed or dynamic) and encodes the tokens to the output byte stream. The
//! client has to call `finish` to write the block with the final bit set.
//!
//! Container defines the type of header and footer, which can be gzip, zlib
//! or raw. They all share the same deflate body; raw has no header or footer,
//! just the deflate body.
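//!
//! For example (this is the expected output of the "tokenization" test below),
//! the input "Blah blah blah blah blah!" tokenizes into the literals
//! 'B', 'l', 'a', 'h', ' ', 'b', a single match of length 18 at distance 5
//! (copying the repeating five-byte "lah b" period), and a final literal '!'.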
//!
//! The compression algorithm is explained in RFC 1951 (slightly edited for this case):
//!
//! The compressor uses a chained hash table `lookup` to find duplicated
//! strings, using a hash function that operates on 4-byte sequences. At any
//! given point during compression, let XYZW be the next 4 input bytes
//! (lookahead) to be examined (not necessarily all different, of course).
//! First, the compressor examines the hash chain for XYZW. If the chain is
//! empty, the compressor simply writes out X as a literal byte and advances
//! one byte in the input. If the hash chain is not empty, indicating that the
//! sequence XYZW (or, if we are unlucky, some other 4 bytes with the same
//! hash function value) has occurred recently, the compressor compares all
//! strings on the XYZW hash chain with the actual input data sequence
//! starting at the current point, and selects the longest match.
//!
//! To improve overall compression, the compressor defers the selection of
//! matches ("lazy matching"): after a match of length N has been found, the
//! compressor searches for a longer match starting at the next input byte. If
//! it finds a longer match, it truncates the previous match to a length of
//! one (thus producing a single literal byte) and then emits the longer
//! match. Otherwise, it emits the original match, and, as described above,
//! advances N bytes before continuing.
//!
//!
//! Allocates statically ~400K (192K lookup, 128K tokens, 64K window).
const builtin = @import("builtin");
const std = @import("std");
const assert = std.debug.assert;
const testing = std.testing;
const expect = testing.expect;
const mem = std.mem;
const math = std.math;
const Writer = std.Io.Writer;
const Compress = @This();
const Token = @import("Token.zig");
const BlockWriter = @import("BlockWriter.zig");
const flate = @import("../flate.zig");
const Container = flate.Container;
const Lookup = @import("Lookup.zig");
const HuffmanEncoder = flate.HuffmanEncoder;
const LiteralNode = HuffmanEncoder.LiteralNode;
lookup: Lookup = .{},
tokens: Tokens = .{},
block_writer: BlockWriter,
level: LevelArgs,
hasher: Container.Hasher,
writer: Writer,
state: State,
// Match and literal at the previous position.
// Used for lazy match finding in processWindow.
prev_match: ?Token = null,
prev_literal: ?u8 = null,
pub const State = enum { header, middle, ended };
/// Trades between speed and compression size.
/// Starts with level 4: in [zlib](https://github.com/madler/zlib/blob/abd3d1a28930f89375d4b41408b39f6c1be157b2/deflate.c#L115C1-L117C43)
/// levels 1-3 use a different, faster algorithm with less compression. That is
/// not implemented here.
pub const Level = enum(u4) {
level_4 = 4,
level_5 = 5,
level_6 = 6,
level_7 = 7,
level_8 = 8,
level_9 = 9,
fast = 0xb,
default = 0xc,
best = 0xd,
};
/// Number of tokens to accumulate in deflate before starting block encoding.
///
/// In zlib this depends on memlevel: 6 + memlevel, where the default memlevel
/// is 8 and the maximum is 9, giving 14 or 15 bits. Here the maximum, 15 bits
/// (32768 tokens), is used.
pub const n_tokens = 1 << 15;
/// Algorithm knobs for each level.
const LevelArgs = struct {
good: u16, // Do fewer lookups if we already have a match of this length.
nice: u16, // Stop looking for a better match once we have one of at least this length.
lazy: u16, // Don't do a lazy match search if we already have a match of at least this length.
chain: u16, // How many lookups for a previous match to perform.
pub fn get(level: Level) LevelArgs {
return switch (level) {
.fast, .level_4 => .{ .good = 4, .lazy = 4, .nice = 16, .chain = 16 },
.level_5 => .{ .good = 8, .lazy = 16, .nice = 32, .chain = 32 },
.default, .level_6 => .{ .good = 8, .lazy = 16, .nice = 128, .chain = 128 },
.level_7 => .{ .good = 8, .lazy = 32, .nice = 128, .chain = 256 },
.level_8 => .{ .good = 32, .lazy = 128, .nice = 258, .chain = 1024 },
.best, .level_9 => .{ .good = 32, .lazy = 258, .nice = 258, .chain = 4096 },
};
}
};
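// Illustrative check, added for documentation purposes: the named aliases map
// to the numbered levels listed in `get` above.
test "LevelArgs aliases" {
    try testing.expectEqual(LevelArgs.get(.level_4), LevelArgs.get(.fast));
    try testing.expectEqual(LevelArgs.get(.level_6), LevelArgs.get(.default));
    try testing.expectEqual(LevelArgs.get(.level_9), LevelArgs.get(.best));
}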
pub const Options = struct {
level: Level = .default,
container: Container = .raw,
};
pub fn init(output: *Writer, buffer: []u8, options: Options) Compress {
return .{
.block_writer = .{
.output = output,
.codegen_freq = undefined,
.literal_freq = undefined,
.distance_freq = undefined,
.codegen = undefined,
.literal_encoding = undefined,
.distance_encoding = undefined,
.codegen_encoding = undefined,
.fixed_literal_encoding = undefined,
.fixed_distance_encoding = undefined,
.huff_distance = undefined,
.fixed_literal_codes = undefined,
.fixed_distance_codes = undefined,
.distance_codes = undefined,
},
.level = .get(options.level),
.hasher = .init(options.container),
.state = .header,
.writer = .{
.buffer = buffer,
.vtable = &.{ .drain = drain },
},
};
}
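// Illustrative usage sketch, kept as a comment because tokenization in `drain`
// below is still TODO. `output`, `buffer` and `data` are placeholder names;
// `buffer` must be large enough to hold the history window plus lookahead.
//
//     var gzip: Compress = .init(output, &buffer, .{ .container = .gzip });
//     try gzip.writer.writeAll(data);
//     try gzip.end();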
// Tokens store
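// With n_tokens = 32768 entries, the list below accounts for the 128K of
// token storage mentioned in the header comment (4 bytes per Token).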
const Tokens = struct {
list: [n_tokens]Token = undefined,
pos: usize = 0,
fn add(self: *Tokens, t: Token) void {
self.list[self.pos] = t;
self.pos += 1;
}
fn full(self: *Tokens) bool {
return self.pos == self.list.len;
}
fn reset(self: *Tokens) void {
self.pos = 0;
}
fn tokens(self: *Tokens) []const Token {
return self.list[0..self.pos];
}
};
fn drain(me: *Writer, data: []const []const u8, splat: usize) Writer.Error!usize {
_ = data;
_ = splat;
const c: *Compress = @fieldParentPtr("writer", me);
const out = c.block_writer.output;
switch (c.state) {
.header => {
c.state = .middle;
const header = c.hasher.container().header();
try out.writeAll(header);
return header.len;
},
.middle => {},
.ended => unreachable,
}
const buffered = me.buffered();
const min_lookahead = flate.match.min_length + flate.match.max_length;
const history_plus_lookahead_len = flate.history_len + min_lookahead;
if (buffered.len < history_plus_lookahead_len) return 0;
const lookahead = buffered[flate.history_len..];
_ = lookahead;
// TODO tokenize
//c.hasher.update(lookahead[0..n]);
@panic("TODO");
}
pub fn end(c: *Compress) !void {
try endUnflushed(c);
try c.block_writer.output.flush();
}
pub fn endUnflushed(c: *Compress) !void {
while (c.writer.end != 0) _ = try drain(&c.writer, &.{""}, 1);
c.state = .ended;
const out = c.block_writer.output;
// TODO flush tokens
switch (c.hasher) {
.gzip => |*gzip| {
// GZIP 8 bytes footer
// - 4 bytes, CRC32 (CRC-32)
// - 4 bytes, ISIZE (Input SIZE) - size of the original (uncompressed) input data modulo 2^32
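// For example, for empty input this footer is eight zero bytes:
// the CRC-32 of nothing is 0 and ISIZE is 0.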
const footer = try out.writableArray(8);
std.mem.writeInt(u32, footer[0..4], gzip.crc.final(), .little);
std.mem.writeInt(u32, footer[4..8], @truncate(gzip.count), .little);
},
.zlib => |*zlib| {
// ZLIB (RFC 1950) is big-endian, unlike GZIP (RFC 1952).
// 4 bytes of ADLER32 (Adler-32 checksum)
// Checksum value of the uncompressed data (excluding any
// dictionary data) computed according to Adler-32
// algorithm.
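// For example, for empty input these four bytes are 0x00 0x00 0x00 0x01,
// since the Adler-32 of nothing is 1.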
std.mem.writeInt(u32, try out.writableArray(4), zlib.final, .big);
},
.raw => {},
}
}
pub const Simple = struct {
/// Note that store blocks are limited to 65535 bytes.
buffer: []u8,
wp: usize,
block_writer: BlockWriter,
hasher: Container.Hasher,
strategy: Strategy,
pub const Strategy = enum { huffman, store };
/// `strategy` selects between Huffman-only and stored blocks (see `flushBuffer`).
pub fn init(out: *Writer, buffer: []u8, container: Container, strategy: Strategy) !Simple {
const self: Simple = .{
.buffer = buffer,
.wp = 0,
.block_writer = .init(out),
.hasher = .init(container),
.strategy = strategy,
};
try container.writeHeader(out);
return self;
}
pub fn flush(self: *Simple) !void {
try self.flushBuffer(false);
try self.block_writer.storedBlock("", false);
try self.block_writer.flush();
}
pub fn finish(self: *Simple) !void {
try self.flushBuffer(true);
try self.block_writer.flush();
try self.hasher.container().writeFooter(&self.hasher, self.block_writer.output);
}
fn flushBuffer(self: *Simple, final: bool) !void {
const buf = self.buffer[0..self.wp];
switch (self.strategy) {
.huffman => try self.block_writer.huffmanBlock(buf, final),
.store => try self.block_writer.storedBlock(buf, final),
}
self.wp = 0;
}
};
test "generate a Huffman code from an array of frequencies" {
var freqs: [19]u16 = [_]u16{
8, // 0
1, // 1
1, // 2
2, // 3
5, // 4
10, // 5
9, // 6
1, // 7
0, // 8
0, // 9
0, // 10
0, // 11
0, // 12
0, // 13
0, // 14
0, // 15
1, // 16
3, // 17
5, // 18
};
var codes: [19]HuffmanEncoder.Code = undefined;
var enc: HuffmanEncoder = .{ .codes = &codes };
enc.generate(freqs[0..], 7);
try testing.expectEqual(@as(u32, 141), enc.bitLength(freqs[0..]));
try testing.expectEqual(@as(usize, 3), enc.codes[0].len);
try testing.expectEqual(@as(usize, 6), enc.codes[1].len);
try testing.expectEqual(@as(usize, 6), enc.codes[2].len);
try testing.expectEqual(@as(usize, 5), enc.codes[3].len);
try testing.expectEqual(@as(usize, 3), enc.codes[4].len);
try testing.expectEqual(@as(usize, 2), enc.codes[5].len);
try testing.expectEqual(@as(usize, 2), enc.codes[6].len);
try testing.expectEqual(@as(usize, 6), enc.codes[7].len);
try testing.expectEqual(@as(usize, 0), enc.codes[8].len);
try testing.expectEqual(@as(usize, 0), enc.codes[9].len);
try testing.expectEqual(@as(usize, 0), enc.codes[10].len);
try testing.expectEqual(@as(usize, 0), enc.codes[11].len);
try testing.expectEqual(@as(usize, 0), enc.codes[12].len);
try testing.expectEqual(@as(usize, 0), enc.codes[13].len);
try testing.expectEqual(@as(usize, 0), enc.codes[14].len);
try testing.expectEqual(@as(usize, 0), enc.codes[15].len);
try testing.expectEqual(@as(usize, 6), enc.codes[16].len);
try testing.expectEqual(@as(usize, 5), enc.codes[17].len);
try testing.expectEqual(@as(usize, 3), enc.codes[18].len);
try testing.expectEqual(@as(u16, 0x0), enc.codes[5].code);
try testing.expectEqual(@as(u16, 0x2), enc.codes[6].code);
try testing.expectEqual(@as(u16, 0x1), enc.codes[0].code);
try testing.expectEqual(@as(u16, 0x5), enc.codes[4].code);
try testing.expectEqual(@as(u16, 0x3), enc.codes[18].code);
try testing.expectEqual(@as(u16, 0x7), enc.codes[3].code);
try testing.expectEqual(@as(u16, 0x17), enc.codes[17].code);
try testing.expectEqual(@as(u16, 0x0f), enc.codes[1].code);
try testing.expectEqual(@as(u16, 0x2f), enc.codes[2].code);
try testing.expectEqual(@as(u16, 0x1f), enc.codes[7].code);
try testing.expectEqual(@as(u16, 0x3f), enc.codes[16].code);
}
test "tokenization" {
const L = Token.initLiteral;
const M = Token.initMatch;
const cases = [_]struct {
data: []const u8,
tokens: []const Token,
}{
.{
.data = "Blah blah blah blah blah!",
.tokens = &[_]Token{ L('B'), L('l'), L('a'), L('h'), L(' '), L('b'), M(5, 18), L('!') },
},
.{
.data = "ABCDEABCD ABCDEABCD",
.tokens = &[_]Token{
L('A'), L('B'), L('C'), L('D'), L('E'), L('A'), L('B'), L('C'), L('D'), L(' '),
L('A'), M(10, 8),
},
},
};
for (cases) |c| {
inline for (Container.list) |container| { // for each wrapping
var cw = std.Io.countingWriter(std.Io.null_writer);
const cww = cw.writer();
var df = try Compress(container, @TypeOf(cww), TestTokenWriter).init(cww, .{});
_ = try df.write(c.data);
try df.flush();
// df.token_writer.show();
try expect(df.block_writer.pos == c.tokens.len); // number of tokens written
try testing.expectEqualSlices(Token, df.block_writer.get(), c.tokens); // tokens match
try testing.expectEqual(container.headerSize(), cw.bytes_written);
try df.finish();
try testing.expectEqual(container.size(), cw.bytes_written);
}
}
}
// Tests that the written tokens are equal to the expected token list.
const TestTokenWriter = struct {
const Self = @This();
pos: usize = 0,
actual: [128]Token = undefined,
pub fn init(_: anytype) Self {
return .{};
}
pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void {
for (tokens) |t| {
self.actual[self.pos] = t;
self.pos += 1;
}
}
pub fn storedBlock(_: *Self, _: []const u8, _: bool) !void {}
pub fn get(self: *Self) []Token {
return self.actual[0..self.pos];
}
pub fn show(self: *Self) void {
std.debug.print("\n", .{});
for (self.get()) |t| {
t.show();
}
}
pub fn flush(_: *Self) !void {}
};
test "file tokenization" {
const levels = [_]Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 };
const cases = [_]struct {
data: []const u8, // uncompressed content
// expected number of tokens produced in deflate tokenization
tokens_count: [levels.len]usize = .{0} ** levels.len,
}{
.{
.data = @embedFile("testdata/rfc1951.txt"),
.tokens_count = .{ 7675, 7672, 7599, 7594, 7598, 7599 },
},
.{
.data = @embedFile("testdata/block_writer/huffman-null-max.input"),
.tokens_count = .{ 257, 257, 257, 257, 257, 257 },
},
.{
.data = @embedFile("testdata/block_writer/huffman-pi.input"),
.tokens_count = .{ 2570, 2564, 2564, 2564, 2564, 2564 },
},
.{
.data = @embedFile("testdata/block_writer/huffman-text.input"),
.tokens_count = .{ 235, 234, 234, 234, 234, 234 },
},
.{
.data = @embedFile("testdata/fuzz/roundtrip1.input"),
.tokens_count = .{ 333, 331, 331, 331, 331, 331 },
},
.{
.data = @embedFile("testdata/fuzz/roundtrip2.input"),
.tokens_count = .{ 334, 334, 334, 334, 334, 334 },
},
};
for (cases) |case| { // for each case
const data = case.data;
for (levels, 0..) |level, i| { // for each compression level
var original: std.Io.Reader = .fixed(data);
// buffer for decompressed data
var al = std.ArrayList(u8).init(testing.allocator);
defer al.deinit();
const writer = al.writer();
// create compressor
const WriterType = @TypeOf(writer);
const TokenWriter = TokenDecoder(@TypeOf(writer));
var cmp = try Compress(.raw, WriterType, TokenWriter).init(writer, .{ .level = level });
// Stream uncompressed `original` data to the compressor. It will
// produce a token list and pass it to the TokenDecoder. The
// TokenDecoder uses the CircularBuffer from inflate to convert the
// token list back into the uncompressed stream.
try cmp.compress(original.reader());
try cmp.flush();
const expected_count = case.tokens_count[i];
const actual = cmp.block_writer.tokens_count;
if (expected_count == 0) {
std.debug.print("actual token count {d}\n", .{actual});
} else {
try testing.expectEqual(expected_count, actual);
}
try testing.expectEqual(data.len, al.items.len);
try testing.expectEqualSlices(u8, data, al.items);
}
}
}
const TokenDecoder = struct {
output: *Writer,
tokens_count: usize,
pub fn init(output: *Writer) TokenDecoder {
return .{
.output = output,
.tokens_count = 0,
};
}
pub fn write(self: *TokenDecoder, tokens: []const Token, _: bool, _: ?[]const u8) !void {
self.tokens_count += tokens.len;
for (tokens) |t| {
switch (t.kind) {
.literal => self.hist.write(t.literal()),
.match => try self.hist.writeMatch(t.length(), t.distance()),
}
if (self.hist.free() < 285) try self.flushWin();
}
try self.flushWin();
}
fn flushWin(self: *TokenDecoder) !void {
while (true) {
const buf = self.hist.read();
if (buf.len == 0) break;
try self.output.writeAll(buf);
}
}
};
test "store simple compressor" {
if (true) return error.SkipZigTest;
//const data = "Hello world!";
//const expected = [_]u8{
// 0x1, // block type 0, final bit set
// 0xc, 0x0, // len = 12
// 0xf3, 0xff, // ~len
// 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', //
// //0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21,
//};
//var fbs: std.Io.Reader = .fixed(data);
//var al = std.ArrayList(u8).init(testing.allocator);
//defer al.deinit();
//var cmp = try store.compressor(.raw, al.writer());
//try cmp.compress(&fbs);
//try cmp.finish();
//try testing.expectEqualSlices(u8, &expected, al.items);
//fbs = .fixed(data);
//try al.resize(0);
//// huffman only compressor will also emit a store block for this small sample
//var hc = try huffman.compressor(.raw, al.writer());
//try hc.compress(&fbs);
//try hc.finish();
//try testing.expectEqualSlices(u8, &expected, al.items);
}
test "sliding window match" {
const data = "Blah blah blah blah blah!";
var win: Writer = .{};
try expect(win.write(data) == data.len);
try expect(win.wp == data.len);
try expect(win.rp == 0);
// match lengths between successive 'l' positions
try expect(win.match(1, 6, 0) == 18);
try expect(win.match(1, 11, 0) == 13);
try expect(win.match(1, 16, 0) == 8);
try expect(win.match(1, 21, 0) == 0);
// position 15 = "blah blah!"
// position 20 = "blah!"
try expect(win.match(15, 20, 0) == 4);
try expect(win.match(15, 20, 3) == 4);
try expect(win.match(15, 20, 4) == 0);
}
test "sliding window slide" {
var win: Writer = .{};
win.wp = Writer.buffer_len - 11;
win.rp = Writer.buffer_len - 111;
win.buffer[win.rp] = 0xab;
try expect(win.lookahead().len == 100);
try expect(win.tokensBuffer().?.len == win.rp);
const n = win.slide();
try expect(n == 32757);
try expect(win.buffer[win.rp] == 0xab);
try expect(win.rp == Writer.hist_len - 111);
try expect(win.wp == Writer.hist_len - 11);
try expect(win.lookahead().len == 100);
try expect(win.tokensBuffer() == null);
}