diff --git a/lib/std/compress.zig b/lib/std/compress.zig index 7a680a5446..199d046af6 100644 --- a/lib/std/compress.zig +++ b/lib/std/compress.zig @@ -1,19 +1,16 @@ //! Compression algorithms. +/// gzip and zlib are here. pub const flate = @import("compress/flate.zig"); -pub const gzip = @import("compress/gzip.zig"); pub const lzma = @import("compress/lzma.zig"); pub const lzma2 = @import("compress/lzma2.zig"); pub const xz = @import("compress/xz.zig"); -pub const zlib = @import("compress/zlib.zig"); pub const zstd = @import("compress/zstd.zig"); test { _ = flate; - _ = gzip; _ = lzma; _ = lzma2; _ = xz; - _ = zlib; _ = zstd; } diff --git a/lib/std/compress/flate.zig b/lib/std/compress/flate.zig index 4dbfe7f75a..324ef816e3 100644 --- a/lib/std/compress/flate.zig +++ b/lib/std/compress/flate.zig @@ -1,4 +1,196 @@ +const builtin = @import("builtin"); const std = @import("../std.zig"); +const testing = std.testing; + +/// Container of the deflate bit stream body. Container adds header before +/// deflate bit stream and footer after. It can be gzip, zlib or raw (no header, +/// no footer, raw bit stream). +/// +/// Zlib format is defined in rfc 1950. Header has 2 bytes and footer 4 bytes +/// adler 32 checksum. +/// +/// Gzip format is defined in rfc 1952. Header has 10+ bytes and footer 4 bytes +/// crc32 checksum and 4 bytes of uncompressed data length. 
+/// +/// +/// rfc 1950: https://datatracker.ietf.org/doc/html/rfc1950#page-4 +/// rfc 1952: https://datatracker.ietf.org/doc/html/rfc1952#page-5 +pub const Container = enum { + raw, // no header or footer + gzip, // gzip header and footer + zlib, // zlib header and footer + + pub fn size(w: Container) usize { + return headerSize(w) + footerSize(w); + } + + pub fn headerSize(w: Container) usize { + return header(w).len; + } + + pub fn footerSize(w: Container) usize { + return switch (w) { + .gzip => 8, + .zlib => 4, + .raw => 0, + }; + } + + pub const list = [_]Container{ .raw, .gzip, .zlib }; + + pub const Error = error{ + BadGzipHeader, + BadZlibHeader, + WrongGzipChecksum, + WrongGzipSize, + WrongZlibChecksum, + }; + + pub fn header(container: Container) []const u8 { + return switch (container) { + // GZIP 10 byte header (https://datatracker.ietf.org/doc/html/rfc1952#page-5): + // - ID1 (IDentification 1), always 0x1f + // - ID2 (IDentification 2), always 0x8b + // - CM (Compression Method), always 8 = deflate + // - FLG (Flags), all set to 0 + // - 4 bytes, MTIME (Modification time), not used, all set to zero + // - XFL (eXtra FLags), all set to zero + // - OS (Operating System), 03 = Unix + .gzip => &[_]u8{ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 }, + // ZLIB has a two-byte header (https://datatracker.ietf.org/doc/html/rfc1950#page-4): + // 1st byte: + // - First four bits is the CINFO (compression info), which is 7 for the default deflate window size. + // - The next four bits is the CM (compression method), which is 8 for deflate. + // 2nd byte: + // - Two bits is the FLEVEL (compression level). Values are: 0=fastest, 1=fast, 2=default, 3=best. + // - The next bit, FDICT, is set if a dictionary is given. + // - The final five FCHECK bits form a mod-31 checksum. 
+ // + // CINFO = 7, CM = 8, FLEVEL = 0b10, FDICT = 0, FCHECK = 0b11100 + .zlib => &[_]u8{ 0x78, 0b10_0_11100 }, + .raw => &{}, + }; + } + + pub fn parseHeader(comptime wrap: Container, reader: *std.io.BufferedReader) !void { + switch (wrap) { + .gzip => try parseGzipHeader(reader), + .zlib => try parseZlibHeader(reader), + .raw => {}, + } + } + + fn parseGzipHeader(reader: *std.io.BufferedReader) !void { + const magic1 = try reader.read(u8); + const magic2 = try reader.read(u8); + const method = try reader.read(u8); + const flags = try reader.read(u8); + try reader.skipBytes(6); // mtime(4), xflags, os + if (magic1 != 0x1f or magic2 != 0x8b or method != 0x08) + return error.BadGzipHeader; + // Flags description: https://www.rfc-editor.org/rfc/rfc1952.html#page-5 + if (flags != 0) { + if (flags & 0b0000_0100 != 0) { // FEXTRA + const extra_len = try reader.read(u16); + try reader.skipBytes(extra_len); + } + if (flags & 0b0000_1000 != 0) { // FNAME + try reader.skipStringZ(); + } + if (flags & 0b0001_0000 != 0) { // FCOMMENT + try reader.skipStringZ(); + } + if (flags & 0b0000_0010 != 0) { // FHCRC + try reader.skipBytes(2); + } + } + } + + fn parseZlibHeader(reader: *std.io.BufferedReader) !void { + const cm = try reader.read(u4); + const cinfo = try reader.read(u4); + _ = try reader.read(u8); + if (cm != 8 or cinfo > 7) { + return error.BadZlibHeader; + } + } + + pub fn parseFooter(comptime wrap: Container, hasher: *Hasher(wrap), reader: *std.io.BufferedReader) !void { + switch (wrap) { + .gzip => { + try reader.fill(0); + if (try reader.read(u32) != hasher.chksum()) return error.WrongGzipChecksum; + if (try reader.read(u32) != hasher.bytesRead()) return error.WrongGzipSize; + }, + .zlib => { + const chksum: u32 = @byteSwap(hasher.chksum()); + if (try reader.read(u32) != chksum) return error.WrongZlibChecksum; + }, + .raw => {}, + } + } + + pub const Hasher = union(Container) { + gzip: struct { + crc: std.hash.Crc32 = .init(), + count: usize = 0, + }, + zlib: 
std.hash.Adler32, + raw: void, + + pub fn init(containter: Container) Hasher { + return switch (containter) { + .gzip => .{ .gzip = .{} }, + .zlib => .{ .zlib = .init() }, + .raw => {}, + }; + } + + pub fn container(h: Hasher) Container { + return h; + } + + pub fn update(h: *Hasher, buf: []const u8) void { + switch (h.*) { + .raw => {}, + .gzip => |*gzip| { + gzip.update(buf); + gzip.count += buf.len; + }, + .zlib => |*zlib| { + zlib.update(buf); + }, + inline .gzip, .zlib => |*x| x.update(buf), + } + } + + pub fn writeFooter(hasher: *Hasher, writer: *std.io.BufferedWriter) std.io.Writer.Error!void { + var bits: [4]u8 = undefined; + switch (hasher.*) { + .gzip => |*gzip| { + // GZIP 8 bytes footer + // - 4 bytes, CRC32 (CRC-32) + // - 4 bytes, ISIZE (Input SIZE) - size of the original (uncompressed) input data modulo 2^32 + std.mem.writeInt(u32, &bits, gzip.final(), .little); + try writer.writeAll(&bits); + + std.mem.writeInt(u32, &bits, gzip.bytes_read, .little); + try writer.writeAll(&bits); + }, + .zlib => |*zlib| { + // ZLIB (RFC 1950) is big-endian, unlike GZIP (RFC 1952). + // 4 bytes of ADLER32 (Adler-32 checksum) + // Checksum value of the uncompressed data (excluding any + // dictionary data) computed according to Adler-32 + // algorithm. + std.mem.writeInt(u32, &bits, zlib.final, .big); + try writer.writeAll(&bits); + }, + .raw => {}, + } + } + }; +}; /// When decompressing, the output buffer is used as the history window, so /// less than this may result in failure to decompress streams that were @@ -7,66 +199,48 @@ pub const max_window_len = 1 << 16; /// Deflate is a lossless data compression file format that uses a combination /// of LZ77 and Huffman coding. -pub const deflate = @import("flate/deflate.zig"); +pub const Compress = @import("flate/Compress.zig"); /// Inflate is the decoding process that takes a Deflate bitstream for /// decompression and correctly produces the original full-size data or file. 
-pub const inflate = @import("flate/inflate.zig"); - -/// Decompress compressed data from reader and write plain data to the writer. -pub fn decompress(reader: *std.io.BufferedReader, writer: *std.io.BufferedWriter) std.io.Writer.Error!void { - try inflate.decompress(.raw, reader, writer); -} - -pub const Decompressor = inflate.Decompressor(.raw); - -/// Compression level, trades between speed and compression size. -pub const Options = deflate.Options; - -/// Compress plain data from reader and write compressed data to the writer. -pub fn compress(reader: *std.io.BufferedReader, writer: *std.io.BufferedWriter, options: Options) std.io.Writer.Error!void { - try deflate.compress(.raw, reader, writer, options); -} - -pub const Compressor = deflate.Compressor(.raw); +pub const Decompress = @import("flate/Decompress.zig"); /// Huffman only compression. Without Lempel-Ziv match searching. Faster /// compression, less memory requirements but bigger compressed sizes. pub const huffman = struct { - pub fn compress(reader: *std.io.BufferedReader, writer: *std.io.BufferedWriter) std.io.Writer.Error!void { - try deflate.huffman.compress(.raw, reader, writer); - } + // The odd order in which the codegen code sizes are written. + pub const codegen_order = [_]u32{ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 }; + // The number of codegen codes. + pub const codegen_code_count = 19; - pub const Compressor = deflate.huffman.Compressor(.raw); + // The largest distance code. + pub const distance_code_count = 30; + + // Maximum number of literals. + pub const max_num_lit = 286; + + // Max number of frequencies used for a Huffman Code + // Possible lengths are codegen_code_count (19), distance_code_count (30) and max_num_lit (286). + // The largest of these is max_num_lit. + pub const max_num_frequencies = max_num_lit; + + // Biggest block size for uncompressed block. + pub const max_store_block_size = 65535; + // The special code used to mark the end of a block. 
+ pub const end_block_marker = 256; }; -// No compression store only. Compressed size is slightly bigger than plain. -pub const store = struct { - pub fn compress(reader: *std.io.BufferedReader, writer: *std.io.BufferedWriter) std.io.Writer.Error!void { - try deflate.store.compress(.raw, reader, writer); - } - - pub const Compressor = deflate.store.Compressor(.raw); -}; - -const builtin = @import("builtin"); -const testing = std.testing; -const fixedBufferStream = std.io.fixedBufferStream; -const print = std.debug.print; -/// Container defines header/footer around deflate bit stream. Gzip and zlib -/// compression algorithms are containers around deflate bit stream body. -const Container = @import("flate/container.zig").Container; - test { - _ = deflate; - _ = inflate; + _ = Compress; + _ = Decompress; } test "compress/decompress" { + const print = std.debug.print; var cmp_buf: [64 * 1024]u8 = undefined; // compressed data buffer var dcm_buf: [64 * 1024]u8 = undefined; // decompressed data buffer - const levels = [_]deflate.Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 }; + const levels = [_]Compress.Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 }; const cases = [_]struct { data: []const u8, // uncompressed content // compressed data sizes per level 4-9 @@ -113,9 +287,11 @@ test "compress/decompress" { // compress original stream to compressed stream { - var original = fixedBufferStream(data); - var compressed = fixedBufferStream(&cmp_buf); - try deflate.compress(container, original.reader(), compressed.writer(), .{ .level = level }); + var original: std.io.BufferedReader = undefined; + original.initFixed(data); + var compressed: std.io.BufferedWriter = undefined; + compressed.initFixed(&cmp_buf); + try Compress.pump(container, original.reader(), &compressed, .{ .level = level }); if (compressed_size == 0) { if (container == .gzip) print("case {d} gzip level {} compressed size: {d}\n", .{ case_no, level, compressed.pos }); @@ 
-125,16 +301,19 @@ test "compress/decompress" { } // decompress compressed stream to decompressed stream { - var compressed = fixedBufferStream(cmp_buf[0..compressed_size]); - var decompressed = fixedBufferStream(&dcm_buf); - try inflate.decompress(container, compressed.reader(), decompressed.writer()); + var compressed: std.io.BufferedReader = undefined; + compressed.initFixed(cmp_buf[0..compressed_size]); + var decompressed: std.io.BufferedWriter = undefined; + decompressed.initFixed(&dcm_buf); + try Decompress.pump(container, &compressed, &decompressed); try testing.expectEqualSlices(u8, data, decompressed.getWritten()); } // compressor writer interface { - var compressed = fixedBufferStream(&cmp_buf); - var cmp = try deflate.compressor(container, compressed.writer(), .{ .level = level }); + var compressed: std.io.BufferedWriter = undefined; + compressed.initFixed(&cmp_buf); + var cmp = try Compress.init(container, &compressed, .{ .level = level }); var cmp_wrt = cmp.writer(); try cmp_wrt.writeAll(data); try cmp.finish(); @@ -143,8 +322,9 @@ test "compress/decompress" { } // decompressor reader interface { - var compressed = fixedBufferStream(cmp_buf[0..compressed_size]); - var dcm = inflate.decompressor(container, compressed.reader()); + var compressed: std.io.BufferedReader = undefined; + compressed.initFixed(cmp_buf[0..compressed_size]); + var dcm = Decompress.pump(container, &compressed); var dcm_rdr = dcm.reader(); const n = try dcm_rdr.readAll(&dcm_buf); try testing.expectEqual(data.len, n); @@ -162,9 +342,11 @@ test "compress/decompress" { // compress original stream to compressed stream { - var original = fixedBufferStream(data); - var compressed = fixedBufferStream(&cmp_buf); - var cmp = try deflate.huffman.compressor(container, compressed.writer()); + var original: std.io.BufferedReader = undefined; + original.initFixed(data); + var compressed: std.io.BufferedWriter = undefined; + compressed.initFixed(&cmp_buf); + var cmp = try 
Compress.Huffman.init(container, &compressed); try cmp.compress(original.reader()); try cmp.finish(); if (compressed_size == 0) { @@ -176,9 +358,11 @@ test "compress/decompress" { } // decompress compressed stream to decompressed stream { - var compressed = fixedBufferStream(cmp_buf[0..compressed_size]); - var decompressed = fixedBufferStream(&dcm_buf); - try inflate.decompress(container, compressed.reader(), decompressed.writer()); + var compressed: std.io.BufferedReader = undefined; + compressed.initFixed(cmp_buf[0..compressed_size]); + var decompressed: std.io.BufferedWriter = undefined; + decompressed.initFixed(&dcm_buf); + try Decompress.pump(container, &compressed, &decompressed); try testing.expectEqualSlices(u8, data, decompressed.getWritten()); } } @@ -194,9 +378,11 @@ test "compress/decompress" { // compress original stream to compressed stream { - var original = fixedBufferStream(data); - var compressed = fixedBufferStream(&cmp_buf); - var cmp = try deflate.store.compressor(container, compressed.writer()); + var original: std.io.BufferedReader = undefined; + original.initFixed(data); + var compressed: std.io.BufferedWriter = undefined; + compressed.initFixed(&cmp_buf); + var cmp = try Compress.SimpleCompressor(.store, container).init(&compressed); try cmp.compress(original.reader()); try cmp.finish(); if (compressed_size == 0) { @@ -209,9 +395,11 @@ test "compress/decompress" { } // decompress compressed stream to decompressed stream { - var compressed = fixedBufferStream(cmp_buf[0..compressed_size]); - var decompressed = fixedBufferStream(&dcm_buf); - try inflate.decompress(container, compressed.reader(), decompressed.writer()); + var compressed: std.io.BufferedReader = undefined; + compressed.initFixed(cmp_buf[0..compressed_size]); + var decompressed: std.io.BufferedWriter = undefined; + decompressed.initFixed(&dcm_buf); + try Decompress.pump(container, &compressed, &decompressed); try testing.expectEqualSlices(u8, data, decompressed.getWritten()); } } 
@@ -220,11 +408,13 @@ test "compress/decompress" { } fn testDecompress(comptime container: Container, compressed: []const u8, expected_plain: []const u8) !void { - var in = fixedBufferStream(compressed); - var out = std.ArrayList(u8).init(testing.allocator); + var in: std.io.BufferedReader = undefined; + in.initFixed(compressed); + var out: std.io.AllocatingWriter = undefined; + out.init(testing.allocator); defer out.deinit(); - try inflate.decompress(container, in.reader(), out.writer()); + try Decompress.pump(container, &in, &out.buffered_writer); try testing.expectEqualSlices(u8, expected_plain, out.items); } @@ -337,23 +527,24 @@ test "public interface" { 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen } ++ plain_data; - // gzip header/footer + deflate block - const gzip_data = - [_]u8{ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 } ++ // gzip header (10 bytes) - deflate_block ++ - [_]u8{ 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00 }; // gzip footer checksum (4 byte), size (4 bytes) + //// gzip header/footer + deflate block + //const gzip_data = + // [_]u8{ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 } ++ // gzip header (10 bytes) + // deflate_block ++ + // [_]u8{ 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00 }; // gzip footer checksum (4 byte), size (4 bytes) - // zlib header/footer + deflate block - const zlib_data = [_]u8{ 0x78, 0b10_0_11100 } ++ // zlib header (2 bytes)} - deflate_block ++ - [_]u8{ 0x1c, 0xf2, 0x04, 0x47 }; // zlib footer: checksum + //// zlib header/footer + deflate block + //const zlib_data = [_]u8{ 0x78, 0b10_0_11100 } ++ // zlib header (2 bytes)} + // deflate_block ++ + // [_]u8{ 0x1c, 0xf2, 0x04, 0x47 }; // zlib footer: checksum - const gzip = @import("gzip.zig"); - const zlib = @import("zlib.zig"); + // TODO + //const gzip = @import("gzip.zig"); + //const zlib = @import("zlib.zig"); const flate = @This(); - try testInterface(gzip, &gzip_data, &plain_data); - 
try testInterface(zlib, &zlib_data, &plain_data); + //try testInterface(gzip, &gzip_data, &plain_data); + //try testInterface(zlib, &zlib_data, &plain_data); try testInterface(flate, &deflate_block, &plain_data); } @@ -361,95 +552,181 @@ fn testInterface(comptime pkg: type, gzip_data: []const u8, plain_data: []const var buffer1: [64]u8 = undefined; var buffer2: [64]u8 = undefined; - var compressed = fixedBufferStream(&buffer1); - var plain = fixedBufferStream(&buffer2); - // decompress { - var in = fixedBufferStream(gzip_data); - try pkg.decompress(in.reader(), plain.writer()); + var plain: std.io.BufferedWriter = undefined; + plain.initFixed(&buffer2); + + var in: std.io.BufferedReader = undefined; + in.initFixed(gzip_data); + try pkg.decompress(&in, &plain); try testing.expectEqualSlices(u8, plain_data, plain.getWritten()); } - plain.reset(); - compressed.reset(); // compress/decompress { - var in = fixedBufferStream(plain_data); - try pkg.compress(in.reader(), compressed.writer(), .{}); - compressed.reset(); - try pkg.decompress(compressed.reader(), plain.writer()); + var plain: std.io.BufferedWriter = undefined; + plain.initFixed(&buffer2); + var compressed: std.io.BufferedWriter = undefined; + compressed.initFixed(&buffer1); + + var in: std.io.BufferedReader = undefined; + in.initFixed(plain_data); + try pkg.compress(&in, &compressed, .{}); + + var compressed_br: std.io.BufferedReader = undefined; + compressed_br.initFixed(&buffer1); + try pkg.decompress(&compressed_br, &plain); try testing.expectEqualSlices(u8, plain_data, plain.getWritten()); } - plain.reset(); - compressed.reset(); // compressor/decompressor { - var in = fixedBufferStream(plain_data); - var cmp = try pkg.compressor(compressed.writer(), .{}); - try cmp.compress(in.reader()); + var plain: std.io.BufferedWriter = undefined; + plain.initFixed(&buffer2); + var compressed: std.io.BufferedWriter = undefined; + compressed.initFixed(&buffer1); + + var in: std.io.BufferedReader = undefined; + 
in.initFixed(plain_data); + var cmp = try pkg.compressor(&compressed, .{}); + try cmp.compress(&in); try cmp.finish(); - compressed.reset(); - var dcp = pkg.decompressor(compressed.reader()); - try dcp.decompress(plain.writer()); + var compressed_br: std.io.BufferedReader = undefined; + compressed_br.initFixed(&buffer1); + var dcp = pkg.decompressor(&compressed_br); + try dcp.decompress(&plain); try testing.expectEqualSlices(u8, plain_data, plain.getWritten()); } - plain.reset(); - compressed.reset(); // huffman { // huffman compress/decompress { - var in = fixedBufferStream(plain_data); - try pkg.huffman.compress(in.reader(), compressed.writer()); - compressed.reset(); - try pkg.decompress(compressed.reader(), plain.writer()); + var plain: std.io.BufferedWriter = undefined; + plain.initFixed(&buffer2); + var compressed: std.io.BufferedWriter = undefined; + compressed.initFixed(&buffer1); + + var in: std.io.BufferedReader = undefined; + in.initFixed(plain_data); + try pkg.huffman.compress(&in, &compressed); + + var compressed_br: std.io.BufferedReader = undefined; + compressed_br.initFixed(&buffer1); + try pkg.decompress(&compressed_br, &plain); try testing.expectEqualSlices(u8, plain_data, plain.getWritten()); } - plain.reset(); - compressed.reset(); // huffman compressor/decompressor { - var in = fixedBufferStream(plain_data); - var cmp = try pkg.huffman.compressor(compressed.writer()); - try cmp.compress(in.reader()); + var plain: std.io.BufferedWriter = undefined; + plain.initFixed(&buffer2); + var compressed: std.io.BufferedWriter = undefined; + compressed.initFixed(&buffer1); + + var in: std.io.BufferedReader = undefined; + in.initFixed(plain_data); + var cmp = try pkg.huffman.compressor(&compressed); + try cmp.compress(&in); try cmp.finish(); - compressed.reset(); - try pkg.decompress(compressed.reader(), plain.writer()); + var compressed_br: std.io.BufferedReader = undefined; + compressed_br.initFixed(&buffer1); + try pkg.decompress(&compressed_br, &plain); 
try testing.expectEqualSlices(u8, plain_data, plain.getWritten()); } } - plain.reset(); - compressed.reset(); // store { // store compress/decompress { - var in = fixedBufferStream(plain_data); - try pkg.store.compress(in.reader(), compressed.writer()); - compressed.reset(); - try pkg.decompress(compressed.reader(), plain.writer()); + var plain: std.io.BufferedWriter = undefined; + plain.initFixed(&buffer2); + var compressed: std.io.BufferedWriter = undefined; + compressed.initFixed(&buffer1); + + var in: std.io.BufferedReader = undefined; + in.initFixed(plain_data); + try pkg.store.compress(&in, &compressed); + + var compressed_br: std.io.BufferedReader = undefined; + compressed_br.initFixed(&buffer1); + try pkg.decompress(&compressed_br, &plain); try testing.expectEqualSlices(u8, plain_data, plain.getWritten()); } - plain.reset(); - compressed.reset(); // store compressor/decompressor { - var in = fixedBufferStream(plain_data); - var cmp = try pkg.store.compressor(compressed.writer()); - try cmp.compress(in.reader()); + var plain: std.io.BufferedWriter = undefined; + plain.initFixed(&buffer2); + var compressed: std.io.BufferedWriter = undefined; + compressed.initFixed(&buffer1); + + var in: std.io.BufferedReader = undefined; + in.initFixed(plain_data); + var cmp = try pkg.store.compressor(&compressed); + try cmp.compress(&in); try cmp.finish(); - compressed.reset(); - try pkg.decompress(compressed.reader(), plain.writer()); + var compressed_br: std.io.BufferedReader = undefined; + compressed_br.initFixed(&buffer1); + try pkg.decompress(&compressed_br, &plain); try testing.expectEqualSlices(u8, plain_data, plain.getWritten()); } } } + +pub const match = struct { + pub const base_length = 3; // smallest match length per the RFC section 3.2.5 + pub const min_length = 4; // min length used in this algorithm + pub const max_length = 258; + + pub const min_distance = 1; + pub const max_distance = 32768; +}; + +pub const history = struct { + pub const len = 
match.max_distance; +}; + +pub const lookup = struct { + pub const bits = 15; + pub const len = 1 << bits; + pub const shift = 32 - bits; +}; + +test "zlib should not overshoot" { + // Compressed zlib data with extra 4 bytes at the end. + const data = [_]u8{ + 0x78, 0x9c, 0x73, 0xce, 0x2f, 0xa8, 0x2c, 0xca, 0x4c, 0xcf, 0x28, 0x51, 0x08, 0xcf, 0xcc, 0xc9, + 0x49, 0xcd, 0x55, 0x28, 0x4b, 0xcc, 0x53, 0x08, 0x4e, 0xce, 0x48, 0xcc, 0xcc, 0xd6, 0x51, 0x08, + 0xce, 0xcc, 0x4b, 0x4f, 0x2c, 0xc8, 0x2f, 0x4a, 0x55, 0x30, 0xb4, 0xb4, 0x34, 0xd5, 0xb5, 0x34, + 0x03, 0x00, 0x8b, 0x61, 0x0f, 0xa4, 0x52, 0x5a, 0x94, 0x12, + }; + + var stream = std.io.fixedBufferStream(data[0..]); + const reader = stream.reader(); + + var dcp = Decompress.init(reader); + var out: [128]u8 = undefined; + + // Decompress + var n = try dcp.reader().readAll(out[0..]); + + // Expected decompressed data + try std.testing.expectEqual(46, n); + try std.testing.expectEqualStrings("Copyright Willem van Schaik, Singapore 1995-96", out[0..n]); + + // Decompressor don't overshoot underlying reader. + // It is leaving it at the end of compressed data chunk. + try std.testing.expectEqual(data.len - 4, stream.getPos()); + try std.testing.expectEqual(0, dcp.unreadBytes()); + + // 4 bytes after compressed chunk are available in reader. + n = try reader.readAll(out[0..]); + try std.testing.expectEqual(n, 4); + try std.testing.expectEqualSlices(u8, data[data.len - 4 .. data.len], out[0..n]); +} diff --git a/lib/std/compress/flate/BitWriter.zig b/lib/std/compress/flate/BitWriter.zig deleted file mode 100644 index bcc85268d1..0000000000 --- a/lib/std/compress/flate/BitWriter.zig +++ /dev/null @@ -1,93 +0,0 @@ -//! Bit writer for use in deflate (compression). -//! -//! Has internal bits buffer of 64 bits and internal bytes buffer of 248 bytes. -//! When we accumulate 48 bits 6 bytes are moved to the bytes buffer. When we -//! accumulate 240 bytes they are flushed to the underlying inner_writer. 
- -const std = @import("std"); -const assert = std.debug.assert; -const BitWriter = @This(); - -// buffer_flush_size indicates the buffer size -// after which bytes are flushed to the writer. -// Should preferably be a multiple of 6, since -// we accumulate 6 bytes between writes to the buffer. -const buffer_flush_size = 240; - -// buffer_size is the actual output byte buffer size. -// It must have additional headroom for a flush -// which can contain up to 8 bytes. -const buffer_size = buffer_flush_size + 8; - -inner_writer: *std.io.BufferedWriter, - -// Data waiting to be written is bytes[0 .. nbytes] -// and then the low nbits of bits. Data is always written -// sequentially into the bytes array. -bits: u64 = 0, -nbits: u32 = 0, // number of bits -bytes: [buffer_size]u8 = undefined, -nbytes: u32 = 0, // number of bytes - -const Self = @This(); - -pub fn init(bw: *std.io.BufferedWriter) Self { - return .{ .inner_writer = bw }; -} - -pub fn setWriter(self: *Self, new_writer: *std.io.BufferedWriter) void { - self.inner_writer = new_writer; -} - -pub fn flush(self: *Self) std.io.Writer.Error!void { - var n = self.nbytes; - while (self.nbits != 0) { - self.bytes[n] = @as(u8, @truncate(self.bits)); - self.bits >>= 8; - if (self.nbits > 8) { // Avoid underflow - self.nbits -= 8; - } else { - self.nbits = 0; - } - n += 1; - } - self.bits = 0; - _ = try self.inner_writer.write(self.bytes[0..n]); - self.nbytes = 0; -} - -pub fn writeBits(self: *Self, b: u32, nb: u32) std.io.Writer.Error!void { - self.bits |= @as(u64, @intCast(b)) << @as(u6, @intCast(self.nbits)); - self.nbits += nb; - if (self.nbits < 48) - return; - - var n = self.nbytes; - std.mem.writeInt(u64, self.bytes[n..][0..8], self.bits, .little); - n += 6; - if (n >= buffer_flush_size) { - _ = try self.inner_writer.write(self.bytes[0..n]); - n = 0; - } - self.nbytes = n; - self.bits >>= 48; - self.nbits -= 48; -} - -pub fn writeBytes(self: *Self, bytes: []const u8) std.io.Writer.Error!void { - var n = 
self.nbytes; - if (self.nbits & 7 != 0) { - return error.UnfinishedBits; - } - while (self.nbits != 0) { - self.bytes[n] = @as(u8, @truncate(self.bits)); - self.bits >>= 8; - self.nbits -= 8; - n += 1; - } - if (n != 0) { - _ = try self.inner_writer.write(self.bytes[0..n]); - } - self.nbytes = 0; - _ = try self.inner_writer.write(bytes); -} diff --git a/lib/std/compress/flate/BlockWriter.zig b/lib/std/compress/flate/BlockWriter.zig index c6183ad820..21d56575e7 100644 --- a/lib/std/compress/flate/BlockWriter.zig +++ b/lib/std/compress/flate/BlockWriter.zig @@ -4,35 +4,34 @@ const std = @import("std"); const io = std.io; const assert = std.debug.assert; -const hc = @import("huffman_encoder.zig"); -const consts = @import("consts.zig").huffman; -const Token = @import("Token.zig"); -const BitWriter = @import("BitWriter.zig"); const BlockWriter = @This(); +const flate = @import("../flate.zig"); +const Compress = flate.Compress; +const huffman = flate.huffman; +const Token = @import("Token.zig"); -const codegen_order = consts.codegen_order; +const codegen_order = huffman.codegen_order; const end_code_mark = 255; -const Self = @This(); -bit_writer: BitWriter, +output: *std.io.BufferedWriter, -codegen_freq: [consts.codegen_code_count]u16 = undefined, -literal_freq: [consts.max_num_lit]u16 = undefined, -distance_freq: [consts.distance_code_count]u16 = undefined, -codegen: [consts.max_num_lit + consts.distance_code_count + 1]u8 = undefined, -literal_encoding: hc.LiteralEncoder = .{}, -distance_encoding: hc.DistanceEncoder = .{}, -codegen_encoding: hc.CodegenEncoder = .{}, -fixed_literal_encoding: hc.LiteralEncoder, -fixed_distance_encoding: hc.DistanceEncoder, -huff_distance: hc.DistanceEncoder, +codegen_freq: [huffman.codegen_code_count]u16 = undefined, +literal_freq: [huffman.max_num_lit]u16 = undefined, +distance_freq: [huffman.distance_code_count]u16 = undefined, +codegen: [huffman.max_num_lit + huffman.distance_code_count + 1]u8 = undefined, +literal_encoding: 
Compress.LiteralEncoder = .{}, +distance_encoding: Compress.DistanceEncoder = .{}, +codegen_encoding: Compress.CodegenEncoder = .{}, +fixed_literal_encoding: Compress.LiteralEncoder, +fixed_distance_encoding: Compress.DistanceEncoder, +huff_distance: Compress.DistanceEncoder, -pub fn init(writer: *std.io.BufferedWriter) Self { +pub fn init(output: *std.io.BufferedWriter) BlockWriter { return .{ - .bit_writer = BitWriter.init(writer), - .fixed_literal_encoding = hc.fixedLiteralEncoder(), - .fixed_distance_encoding = hc.fixedDistanceEncoder(), - .huff_distance = hc.huffmanDistanceEncoder(), + .output = output, + .fixed_literal_encoding = Compress.fixedLiteralEncoder(), + .fixed_distance_encoding = Compress.fixedDistanceEncoder(), + .huff_distance = Compress.huffmanDistanceEncoder(), }; } @@ -42,15 +41,15 @@ pub fn init(writer: *std.io.BufferedWriter) Self { /// That is after final block; when last byte could be incomplete or /// after stored block; which is aligned to the byte boundary (it has x /// padding bits after first 3 bits). 
-pub fn flush(self: *Self) std.io.Writer.Error!void { +pub fn flush(self: *BlockWriter) std.io.Writer.Error!void { try self.bit_writer.flush(); } -pub fn setWriter(self: *Self, new_writer: *std.io.BufferedWriter) void { +pub fn setWriter(self: *BlockWriter, new_writer: *std.io.BufferedWriter) void { self.bit_writer.setWriter(new_writer); } -fn writeCode(self: *Self, c: hc.HuffCode) std.io.Writer.Error!void { +fn writeCode(self: *BlockWriter, c: Compress.HuffCode) std.io.Writer.Error!void { try self.bit_writer.writeBits(c.code, c.len); } @@ -68,11 +67,11 @@ fn writeCode(self: *Self, c: hc.HuffCode) std.io.Writer.Error!void { // lit_enc: The literal encoder to use // dist_enc: The distance encoder to use fn generateCodegen( - self: *Self, + self: *BlockWriter, num_literals: u32, num_distances: u32, - lit_enc: *hc.LiteralEncoder, - dist_enc: *hc.DistanceEncoder, + lit_enc: *Compress.LiteralEncoder, + dist_enc: *Compress.DistanceEncoder, ) void { for (self.codegen_freq, 0..) |_, i| { self.codegen_freq[i] = 0; @@ -169,9 +168,9 @@ const DynamicSize = struct { // dynamicSize returns the size of dynamically encoded data in bits. fn dynamicSize( - self: *Self, - lit_enc: *hc.LiteralEncoder, // literal encoder - dist_enc: *hc.DistanceEncoder, // distance encoder + self: *BlockWriter, + lit_enc: *Compress.LiteralEncoder, // literal encoder + dist_enc: *Compress.DistanceEncoder, // distance encoder extra_bits: u32, ) DynamicSize { var num_codegens = self.codegen_freq.len; @@ -195,7 +194,7 @@ fn dynamicSize( } // fixedSize returns the size of dynamically encoded data in bits. 
-fn fixedSize(self: *Self, extra_bits: u32) u32 { +fn fixedSize(self: *BlockWriter, extra_bits: u32) u32 { return 3 + self.fixed_literal_encoding.bitLength(&self.literal_freq) + self.fixed_distance_encoding.bitLength(&self.distance_freq) + @@ -214,7 +213,7 @@ fn storedSizeFits(in: ?[]const u8) StoredSize { if (in == null) { return .{ .size = 0, .storable = false }; } - if (in.?.len <= consts.max_store_block_size) { + if (in.?.len <= huffman.max_store_block_size) { return .{ .size = @as(u32, @intCast((in.?.len + 5) * 8)), .storable = true }; } return .{ .size = 0, .storable = false }; @@ -227,7 +226,7 @@ fn storedSizeFits(in: ?[]const u8) StoredSize { // num_codegens: The number of codegens used in codegen // eof: Is it the end-of-file? (end of stream) fn dynamicHeader( - self: *Self, + self: *BlockWriter, num_literals: u32, num_distances: u32, num_codegens: u32, @@ -272,7 +271,7 @@ fn dynamicHeader( } } -fn storedHeader(self: *Self, length: usize, eof: bool) std.io.Writer.Error!void { +fn storedHeader(self: *BlockWriter, length: usize, eof: bool) std.io.Writer.Error!void { assert(length <= 65535); const flag: u32 = if (eof) 1 else 0; try self.bit_writer.writeBits(flag, 3); @@ -282,7 +281,7 @@ fn storedHeader(self: *Self, length: usize, eof: bool) std.io.Writer.Error!void try self.bit_writer.writeBits(~l, 16); } -fn fixedHeader(self: *Self, eof: bool) std.io.Writer.Error!void { +fn fixedHeader(self: *BlockWriter, eof: bool) std.io.Writer.Error!void { // Indicate that we are a fixed Huffman block var value: u32 = 2; if (eof) { @@ -296,7 +295,7 @@ fn fixedHeader(self: *Self, eof: bool) std.io.Writer.Error!void { // is larger than the original bytes, the data will be written as a // stored block. // If the input is null, the tokens will always be Huffman encoded. 
-pub fn write(self: *Self, tokens: []const Token, eof: bool, input: ?[]const u8) std.io.Writer.Error!void { +pub fn write(self: *BlockWriter, tokens: []const Token, eof: bool, input: ?[]const u8) std.io.Writer.Error!void { const lit_and_dist = self.indexTokens(tokens); const num_literals = lit_and_dist.num_literals; const num_distances = lit_and_dist.num_distances; @@ -374,7 +373,7 @@ pub fn write(self: *Self, tokens: []const Token, eof: bool, input: ?[]const u8) try self.writeTokens(tokens, &literal_encoding.codes, &distance_encoding.codes); } -pub fn storedBlock(self: *Self, input: []const u8, eof: bool) std.io.Writer.Error!void { +pub fn storedBlock(self: *BlockWriter, input: []const u8, eof: bool) std.io.Writer.Error!void { try self.storedHeader(input.len, eof); try self.bit_writer.writeBytes(input); } @@ -385,7 +384,7 @@ pub fn storedBlock(self: *Self, input: []const u8, eof: bool) std.io.Writer.Erro // If input is supplied and the compression savings are below 1/16th of the // input size the block is stored. fn dynamicBlock( - self: *Self, + self: *BlockWriter, tokens: []const Token, eof: bool, input: ?[]const u8, @@ -433,7 +432,7 @@ const TotalIndexedTokens = struct { // literal_freq and distance_freq, and generates literal_encoding // and distance_encoding. // The number of literal and distance tokens is returned. 
-fn indexTokens(self: *Self, tokens: []const Token) TotalIndexedTokens { +fn indexTokens(self: *BlockWriter, tokens: []const Token) TotalIndexedTokens { var num_literals: u32 = 0; var num_distances: u32 = 0; @@ -453,7 +452,7 @@ fn indexTokens(self: *Self, tokens: []const Token) TotalIndexedTokens { self.distance_freq[t.distanceCode()] += 1; } // add end_block_marker token at the end - self.literal_freq[consts.end_block_marker] += 1; + self.literal_freq[huffman.end_block_marker] += 1; // get the number of literals num_literals = @as(u32, @intCast(self.literal_freq.len)); @@ -482,10 +481,10 @@ fn indexTokens(self: *Self, tokens: []const Token) TotalIndexedTokens { // Writes a slice of tokens to the output followed by and end_block_marker. // codes for literal and distance encoding must be supplied. fn writeTokens( - self: *Self, + self: *BlockWriter, tokens: []const Token, - le_codes: []hc.HuffCode, - oe_codes: []hc.HuffCode, + le_codes: []Compress.HuffCode, + oe_codes: []Compress.HuffCode, ) std.io.Writer.Error!void { for (tokens) |t| { if (t.kind == Token.Kind.literal) { @@ -508,18 +507,18 @@ fn writeTokens( } } // add end_block_marker at the end - try self.writeCode(le_codes[consts.end_block_marker]); + try self.writeCode(le_codes[huffman.end_block_marker]); } // Encodes a block of bytes as either Huffman encoded literals or uncompressed bytes // if the results only gains very little from compression. 
-pub fn huffmanBlock(self: *Self, input: []const u8, eof: bool) std.io.Writer.Error!void { +pub fn huffmanBlock(self: *BlockWriter, input: []const u8, eof: bool) std.io.Writer.Error!void { // Add everything as literals histogram(input, &self.literal_freq); - self.literal_freq[consts.end_block_marker] = 1; + self.literal_freq[huffman.end_block_marker] = 1; - const num_literals = consts.end_block_marker + 1; + const num_literals = huffman.end_block_marker + 1; self.distance_freq[0] = 1; const num_distances = 1; @@ -560,7 +559,7 @@ pub fn huffmanBlock(self: *Self, input: []const u8, eof: bool) std.io.Writer.Err const c = encoding[t]; try self.bit_writer.writeBits(c.code, c.len); } - try self.writeCode(encoding[consts.end_block_marker]); + try self.writeCode(encoding[huffman.end_block_marker]); } // histogram accumulates a histogram of b in h. diff --git a/lib/std/compress/flate/CircularBuffer.zig b/lib/std/compress/flate/CircularBuffer.zig deleted file mode 100644 index 552d364894..0000000000 --- a/lib/std/compress/flate/CircularBuffer.zig +++ /dev/null @@ -1,240 +0,0 @@ -//! 64K buffer of uncompressed data created in inflate (decompression). Has enough -//! history to support writing match; copying length of bytes -//! from the position distance backward from current. -//! -//! Reads can return less than available bytes if they are spread across -//! different circles. So reads should repeat until get required number of bytes -//! or until returned slice is zero length. -//! -//! Note on deflate limits: -//! * non-compressible block is limited to 65,535 bytes. -//! * backward pointer is limited in distance to 32K bytes and in length to 258 bytes. -//! -//! Whole non-compressed block can be written without overlap. We always have -//! history of up to 64K, more then 32K needed. -//! 
-const std = @import("std"); -const assert = std.debug.assert; -const testing = std.testing; - -const consts = @import("consts.zig").match; - -const mask = 0xffff; // 64K - 1 -const buffer_len = mask + 1; // 64K buffer - -const Self = @This(); - -buffer: [buffer_len]u8 = undefined, -wp: usize = 0, // write position -rp: usize = 0, // read position - -fn writeAll(self: *Self, buf: []const u8) void { - for (buf) |c| self.write(c); -} - -/// Write literal. -pub fn write(self: *Self, b: u8) void { - assert(self.wp - self.rp < mask); - self.buffer[self.wp & mask] = b; - self.wp += 1; -} - -/// Write match (back-reference to the same data slice) starting at `distance` -/// back from current write position, and `length` of bytes. -pub fn writeMatch(self: *Self, length: u16, distance: u16) !void { - if (self.wp < distance or - length < consts.base_length or length > consts.max_length or - distance < consts.min_distance or distance > consts.max_distance) - { - return error.InvalidMatch; - } - assert(self.wp - self.rp < mask); - - var from: usize = self.wp - distance & mask; - const from_end: usize = from + length; - var to: usize = self.wp & mask; - const to_end: usize = to + length; - - self.wp += length; - - // Fast path using memcpy - if (from_end < buffer_len and to_end < buffer_len) // start and end at the same circle - { - var cur_len = distance; - var remaining_len = length; - while (cur_len < remaining_len) { - @memcpy(self.buffer[to..][0..cur_len], self.buffer[from..][0..cur_len]); - to += cur_len; - remaining_len -= cur_len; - cur_len = cur_len * 2; - } - @memcpy(self.buffer[to..][0..remaining_len], self.buffer[from..][0..remaining_len]); - return; - } - - // Slow byte by byte - while (to < to_end) { - self.buffer[to & mask] = self.buffer[from & mask]; - to += 1; - from += 1; - } -} - -/// Returns writable part of the internal buffer of size `n` at most. Advances -/// write pointer, assumes that returned buffer will be filled with data. 
-pub fn getWritable(self: *Self, n: usize) []u8 { - const wp = self.wp & mask; - const len = @min(n, buffer_len - wp); - self.wp += len; - return self.buffer[wp .. wp + len]; -} - -/// Read available data. Can return part of the available data if it is -/// spread across two circles. So read until this returns zero length. -pub fn read(self: *Self) []const u8 { - return self.readAtMost(buffer_len); -} - -/// Read part of available data. Can return less than max even if there are -/// more than max decoded data. -pub fn readAtMost(self: *Self, limit: usize) []const u8 { - const rb = self.readBlock(if (limit == 0) buffer_len else limit); - defer self.rp += rb.len; - return self.buffer[rb.head..rb.tail]; -} - -const ReadBlock = struct { - head: usize, - tail: usize, - len: usize, -}; - -/// Returns position of continuous read block data. -fn readBlock(self: *Self, max: usize) ReadBlock { - const r = self.rp & mask; - const w = self.wp & mask; - const n = @min( - max, - if (w >= r) w - r else buffer_len - r, - ); - return .{ - .head = r, - .tail = r + n, - .len = n, - }; -} - -/// Number of free bytes for write. -pub fn free(self: *Self) usize { - return buffer_len - (self.wp - self.rp); -} - -/// Full if largest match can't fit. 258 is largest match length. That much -/// bytes can be produced in single decode step. 
-pub fn full(self: *Self) bool { - return self.free() < 258 + 1; -} - -// example from: https://youtu.be/SJPvNi4HrWQ?t=3558 -test writeMatch { - var cb: Self = .{}; - - cb.writeAll("a salad; "); - try cb.writeMatch(5, 9); - try cb.writeMatch(3, 3); - - try testing.expectEqualStrings("a salad; a salsal", cb.read()); -} - -test "writeMatch overlap" { - var cb: Self = .{}; - - cb.writeAll("a b c "); - try cb.writeMatch(8, 4); - cb.write('d'); - - try testing.expectEqualStrings("a b c b c b c d", cb.read()); -} - -test readAtMost { - var cb: Self = .{}; - - cb.writeAll("0123456789"); - try cb.writeMatch(50, 10); - - try testing.expectEqualStrings("0123456789" ** 6, cb.buffer[cb.rp..cb.wp]); - for (0..6) |i| { - try testing.expectEqual(i * 10, cb.rp); - try testing.expectEqualStrings("0123456789", cb.readAtMost(10)); - } - try testing.expectEqualStrings("", cb.readAtMost(10)); - try testing.expectEqualStrings("", cb.read()); -} - -test Self { - var cb: Self = .{}; - - const data = "0123456789abcdef" ** (1024 / 16); - cb.writeAll(data); - try testing.expectEqual(@as(usize, 0), cb.rp); - try testing.expectEqual(@as(usize, 1024), cb.wp); - try testing.expectEqual(@as(usize, 1024 * 63), cb.free()); - - for (0..62 * 4) |_| - try cb.writeMatch(256, 1024); // write 62K - - try testing.expectEqual(@as(usize, 0), cb.rp); - try testing.expectEqual(@as(usize, 63 * 1024), cb.wp); - try testing.expectEqual(@as(usize, 1024), cb.free()); - - cb.writeAll(data[0..200]); - _ = cb.readAtMost(1024); // make some space - cb.writeAll(data); // overflows write position - try testing.expectEqual(@as(usize, 200 + 65536), cb.wp); - try testing.expectEqual(@as(usize, 1024), cb.rp); - try testing.expectEqual(@as(usize, 1024 - 200), cb.free()); - - const rb = cb.readBlock(Self.buffer_len); - try testing.expectEqual(@as(usize, 65536 - 1024), rb.len); - try testing.expectEqual(@as(usize, 1024), rb.head); - try testing.expectEqual(@as(usize, 65536), rb.tail); - - try testing.expectEqual(@as(usize, 
65536 - 1024), cb.read().len); // read to the end of the buffer - try testing.expectEqual(@as(usize, 200 + 65536), cb.wp); - try testing.expectEqual(@as(usize, 65536), cb.rp); - try testing.expectEqual(@as(usize, 65536 - 200), cb.free()); - - try testing.expectEqual(@as(usize, 200), cb.read().len); // read the rest -} - -test "write overlap" { - var cb: Self = .{}; - cb.wp = cb.buffer.len - 15; - cb.rp = cb.wp; - - cb.writeAll("0123456789"); - cb.writeAll("abcdefghij"); - - try testing.expectEqual(cb.buffer.len + 5, cb.wp); - try testing.expectEqual(cb.buffer.len - 15, cb.rp); - - try testing.expectEqualStrings("0123456789abcde", cb.read()); - try testing.expectEqualStrings("fghij", cb.read()); - - try testing.expect(cb.wp == cb.rp); -} - -test "writeMatch/read overlap" { - var cb: Self = .{}; - cb.wp = cb.buffer.len - 15; - cb.rp = cb.wp; - - cb.writeAll("0123456789"); - try cb.writeMatch(15, 5); - - try testing.expectEqualStrings("012345678956789", cb.read()); - try testing.expectEqualStrings("5678956789", cb.read()); - - try cb.writeMatch(20, 25); - try testing.expectEqualStrings("01234567895678956789", cb.read()); -} diff --git a/lib/std/compress/flate/Compress.zig b/lib/std/compress/flate/Compress.zig new file mode 100644 index 0000000000..22a88d2f60 --- /dev/null +++ b/lib/std/compress/flate/Compress.zig @@ -0,0 +1,1230 @@ +//! Default compression algorithm. Has two steps: tokenization and token +//! encoding. +//! +//! Tokenization takes uncompressed input stream and produces list of tokens. +//! Each token can be literal (byte of data) or match (backrefernce to previous +//! data with length and distance). Tokenization accumulators 32K tokens, when +//! full or `flush` is called tokens are passed to the `block_writer`. Level +//! defines how hard (how slow) it tries to find match. +//! +//! Block writer will decide which type of deflate block to write (stored, fixed, +//! dynamic) and encode tokens to the output byte stream. Client has to call +//! 
`finish` to write block with the final bit set.
+//!
+//! Container defines the type of header and footer, which can be gzip, zlib or raw.
+//! They all share the same deflate body. Raw has no header or footer, just the
+//! deflate body.
+//!
+//! Compression algorithm explained in rfc-1951 (slightly edited for this case):
+//!
+//! The compressor uses a chained hash table `lookup` to find duplicated
+//! strings, using a hash function that operates on 4-byte sequences. At any
+//! given point during compression, let XYZW be the next 4 input bytes
+//! (lookahead) to be examined (not necessarily all different, of course).
+//! First, the compressor examines the hash chain for XYZW. If the chain is
+//! empty, the compressor simply writes out X as a literal byte and advances
+//! one byte in the input. If the hash chain is not empty, indicating that the
+//! sequence XYZW (or, if we are unlucky, some other 4 bytes with the same
+//! hash function value) has occurred recently, the compressor compares all
+//! strings on the XYZW hash chain with the actual input data sequence
+//! starting at the current point, and selects the longest match.
+//!
+//! To improve overall compression, the compressor defers the selection of
+//! matches ("lazy matching"): after a match of length N has been found, the
+//! compressor searches for a longer match starting at the next input byte. If
+//! it finds a longer match, it truncates the previous match to a length of
+//! one (thus producing a single literal byte) and then emits the longer
+//! match. Otherwise, it emits the original match, and, as described above,
+//! advances N bytes before continuing.
+//!
+//!
+//! Allocates statically ~400K (192K lookup, 128K tokens, 64K window). 
+const builtin = @import("builtin"); +const std = @import("std"); +const io = std.io; +const assert = std.debug.assert; +const testing = std.testing; +const expect = testing.expect; +const mem = std.mem; +const math = std.math; + +const Compress = @This(); +const Token = @import("Token.zig"); +const BlockWriter = @import("BlockWriter.zig"); +const Container = std.compress.flate.Container; +const Lookup = @import("Lookup.zig"); +const huffman = std.compress.flate.huffman; + +pub const Options = struct { + level: Level = .default, + container: Container = .raw, +}; + +/// Trades between speed and compression size. +/// Starts with level 4: in [zlib](https://github.com/madler/zlib/blob/abd3d1a28930f89375d4b41408b39f6c1be157b2/deflate.c#L115C1-L117C43) +/// levels 1-3 are using different algorithm to perform faster but with less +/// compression. That is not implemented here. +pub const Level = enum(u4) { + level_4 = 4, + level_5 = 5, + level_6 = 6, + level_7 = 7, + level_8 = 8, + level_9 = 9, + + fast = 0xb, + default = 0xc, + best = 0xd, +}; + +// Number of tokens to accumulate in deflate before starting block encoding. +// +// In zlib this depends on memlevel: 6 + memlevel, where default memlevel is +// 8 and max 9 that gives 14 or 15 bits. +pub const n_tokens = 1 << 15; + +/// Algorithm knobs for each level. +const LevelArgs = struct { + good: u16, // Do less lookups if we already have match of this length. + nice: u16, // Stop looking for better match if we found match with at least this length. + lazy: u16, // Don't do lazy match find if got match with at least this length. + chain: u16, // How many lookups for previous match to perform. 
+ + pub fn get(level: Level) LevelArgs { + return switch (level) { + .fast, .level_4 => .{ .good = 4, .lazy = 4, .nice = 16, .chain = 16 }, + .level_5 => .{ .good = 8, .lazy = 16, .nice = 32, .chain = 32 }, + .default, .level_6 => .{ .good = 8, .lazy = 16, .nice = 128, .chain = 128 }, + .level_7 => .{ .good = 8, .lazy = 32, .nice = 128, .chain = 256 }, + .level_8 => .{ .good = 32, .lazy = 128, .nice = 258, .chain = 1024 }, + .best, .level_9 => .{ .good = 32, .lazy = 258, .nice = 258, .chain = 4096 }, + }; + } +}; + +lookup: Lookup = .{}, +tokens: Tokens = .{}, +output: *std.io.BufferedWriter, +block_writer: BlockWriter, +level: LevelArgs, +hasher: Container.Hasher, + +// Match and literal at the previous position. +// Used for lazy match finding in processWindow. +prev_match: ?Token = null, +prev_literal: ?u8 = null, + +pub fn init(output: *std.io.BufferedWriter, options: Options) std.io.Writer.Error!Compress { + try output.writeAll(options.container.header(output)); + return .{ + .output = output, + .block_writer = .init(output), + .level = .get(options.level), + .hasher = .init(options.container), + }; +} + +const FlushOption = enum { none, flush, final }; + +// Process data in window and create tokens. If token buffer is full +// flush tokens to the token writer. In the case of `flush` or `final` +// option it will process all data from the window. In the `none` case +// it will preserve some data for the next match. +fn tokenize(self: *Compress, flush_opt: FlushOption) !void { + // flush - process all data from window + const should_flush = (flush_opt != .none); + + // While there is data in active lookahead buffer. + while (self.win.activeLookahead(should_flush)) |lh| { + var step: u16 = 1; // 1 in the case of literal, match length otherwise + const pos: u16 = self.win.pos(); + const literal = lh[0]; // literal at current position + const min_len: u16 = if (self.prev_match) |m| m.length() else 0; + + // Try to find match at least min_len long. 
+ if (self.findMatch(pos, lh, min_len)) |match| { + // Found better match than previous. + try self.addPrevLiteral(); + + // Is found match length good enough? + if (match.length() >= self.level.lazy) { + // Don't try to lazy find better match, use this. + step = try self.addMatch(match); + } else { + // Store this match. + self.prev_literal = literal; + self.prev_match = match; + } + } else { + // There is no better match at current pos then it was previous. + // Write previous match or literal. + if (self.prev_match) |m| { + // Write match from previous position. + step = try self.addMatch(m) - 1; // we already advanced 1 from previous position + } else { + // No match at previous position. + // Write previous literal if any, and remember this literal. + try self.addPrevLiteral(); + self.prev_literal = literal; + } + } + // Advance window and add hashes. + self.windowAdvance(step, lh, pos); + } + + if (should_flush) { + // In the case of flushing, last few lookahead buffers were smaller then min match len. + // So only last literal can be unwritten. + assert(self.prev_match == null); + try self.addPrevLiteral(); + self.prev_literal = null; + + try self.flushTokens(flush_opt); + } +} + +fn windowAdvance(self: *Compress, step: u16, lh: []const u8, pos: u16) void { + // current position is already added in findMatch + self.lookup.bulkAdd(lh[1..], step - 1, pos + 1); + self.win.advance(step); +} + +// Add previous literal (if any) to the tokens list. +fn addPrevLiteral(self: *Compress) !void { + if (self.prev_literal) |l| try self.addToken(Token.initLiteral(l)); +} + +// Add match to the tokens list, reset prev pointers. +// Returns length of the added match. 
+fn addMatch(self: *Compress, m: Token) !u16 { + try self.addToken(m); + self.prev_literal = null; + self.prev_match = null; + return m.length(); +} + +fn addToken(self: *Compress, token: Token) !void { + self.tokens.add(token); + if (self.tokens.full()) try self.flushTokens(.none); +} + +// Finds largest match in the history window with the data at current pos. +fn findMatch(self: *Compress, pos: u16, lh: []const u8, min_len: u16) ?Token { + var len: u16 = min_len; + // Previous location with the same hash (same 4 bytes). + var prev_pos = self.lookup.add(lh, pos); + // Last found match. + var match: ?Token = null; + + // How much back-references to try, performance knob. + var chain: usize = self.level.chain; + if (len >= self.level.good) { + // If we've got a match that's good enough, only look in 1/4 the chain. + chain >>= 2; + } + + // Hot path loop! + while (prev_pos > 0 and chain > 0) : (chain -= 1) { + const distance = pos - prev_pos; + if (distance > std.compress.flate.match.max_distance) + break; + + const new_len = self.win.match(prev_pos, pos, len); + if (new_len > len) { + match = Token.initMatch(@intCast(distance), new_len); + if (new_len >= self.level.nice) { + // The match is good enough that we don't try to find a better one. + return match; + } + len = new_len; + } + prev_pos = self.lookup.prev(prev_pos); + } + + return match; +} + +fn flushTokens(self: *Compress, flush_opt: FlushOption) !void { + // Pass tokens to the token writer + try self.block_writer.write(self.tokens.tokens(), flush_opt == .final, self.win.tokensBuffer()); + // Stored block ensures byte alignment. + // It has 3 bits (final, block_type) and then padding until byte boundary. + // After that everything is aligned to the boundary in the stored block. + // Empty stored block is Ob000 + (0-7) bits of padding + 0x00 0x00 0xFF 0xFF. + // Last 4 bytes are byte aligned. 
+ if (flush_opt == .flush) { + try self.block_writer.storedBlock("", false); + } + if (flush_opt != .none) { + // Safe to call only when byte aligned or it is OK to add + // padding bits (on last byte of the final block). + try self.block_writer.flush(); + } + // Reset internal tokens store. + self.tokens.reset(); + // Notify win that tokens are flushed. + self.win.flush(); +} + +// Slide win and if needed lookup tables. +fn slide(self: *Compress) void { + const n = self.win.slide(); + self.lookup.slide(n); +} + +/// Compresses as much data as possible, stops when the reader becomes +/// empty. It will introduce some output latency (reading input without +/// producing all output) because some data are still in internal +/// buffers. +/// +/// It is up to the caller to call flush (if needed) or finish (required) +/// when is need to output any pending data or complete stream. +/// +pub fn compress(self: *Compress, reader: anytype) !void { + while (true) { + // Fill window from reader + const buf = self.win.writable(); + if (buf.len == 0) { + try self.tokenize(.none); + self.slide(); + continue; + } + const n = try reader.readAll(buf); + self.hasher.update(buf[0..n]); + self.win.written(n); + // Process window + try self.tokenize(.none); + // Exit when no more data in reader + if (n < buf.len) break; + } +} + +/// Flushes internal buffers to the output writer. Outputs empty stored +/// block to sync bit stream to the byte boundary, so that the +/// decompressor can get all input data available so far. +/// +/// It is useful mainly in compressed network protocols, to ensure that +/// deflate bit stream can be used as byte stream. May degrade +/// compression so it should be used only when necessary. +/// +/// Completes the current deflate block and follows it with an empty +/// stored block that is three zero bits plus filler bits to the next +/// byte, followed by four bytes (00 00 ff ff). 
+/// +pub fn flush(self: *Compress) !void { + try self.tokenize(.flush); +} + +/// Completes deflate bit stream by writing any pending data as deflate +/// final deflate block. HAS to be called once all data are written to +/// the compressor as a signal that next block has to have final bit +/// set. +/// +pub fn finish(self: *Compress) !void { + try self.tokenize(.final); + try self.hasher.writeFooter(self.output); +} + +/// Use another writer while preserving history. Most probably flush +/// should be called on old writer before setting new. +pub fn setWriter(self: *Compress, new_writer: *std.io.BufferedWriter) void { + self.block_writer.setWriter(new_writer); + self.output = new_writer; +} + +// Tokens store +const Tokens = struct { + list: [n_tokens]Token = undefined, + pos: usize = 0, + + fn add(self: *Tokens, t: Token) void { + self.list[self.pos] = t; + self.pos += 1; + } + + fn full(self: *Tokens) bool { + return self.pos == self.list.len; + } + + fn reset(self: *Tokens) void { + self.pos = 0; + } + + fn tokens(self: *Tokens) []const Token { + return self.list[0..self.pos]; + } +}; + +/// Creates huffman only deflate blocks. Disables Lempel-Ziv match searching and +/// only performs Huffman entropy encoding. Results in faster compression, much +/// less memory requirements during compression but bigger compressed sizes. +pub const Huffman = SimpleCompressor(.huffman, .raw); + +/// Creates store blocks only. Data are not compressed only packed into deflate +/// store blocks. That adds 9 bytes of header for each block. Max stored block +/// size is 64K. Block is emitted when flush is called on on finish. 
+pub const store = struct { + pub fn compress(comptime container: Container, reader: anytype, writer: anytype) !void { + var c = try store.compressor(container, writer); + try c.compress(reader); + try c.finish(); + } + + pub fn Compressor(comptime container: Container, comptime WriterType: type) type { + return SimpleCompressor(.store, container, WriterType); + } + + pub fn compressor(comptime container: Container, writer: anytype) !store.Compressor(container, @TypeOf(writer)) { + return try store.Compressor(container, @TypeOf(writer)).init(writer); + } +}; + +const SimpleCompressorKind = enum { + huffman, + store, +}; + +fn simpleCompressor( + comptime kind: SimpleCompressorKind, + comptime container: Container, + writer: anytype, +) !SimpleCompressor(kind, container, @TypeOf(writer)) { + return try SimpleCompressor(kind, container, @TypeOf(writer)).init(writer); +} + +fn SimpleCompressor( + comptime kind: SimpleCompressorKind, + comptime container: Container, + comptime WriterType: type, +) type { + const BlockWriterType = BlockWriter(WriterType); + return struct { + buffer: [65535]u8 = undefined, // because store blocks are limited to 65535 bytes + wp: usize = 0, + + output: WriterType, + block_writer: BlockWriterType, + hasher: container.Hasher() = .{}, + + const Self = @This(); + + pub fn init(output: WriterType) !Self { + const self = Self{ + .output = output, + .block_writer = BlockWriterType.init(output), + }; + try container.writeHeader(self.output); + return self; + } + + pub fn flush(self: *Self) !void { + try self.flushBuffer(false); + try self.block_writer.storedBlock("", false); + try self.block_writer.flush(); + } + + pub fn finish(self: *Self) !void { + try self.flushBuffer(true); + try self.block_writer.flush(); + try container.writeFooter(&self.hasher, self.output); + } + + fn flushBuffer(self: *Self, final: bool) !void { + const buf = self.buffer[0..self.wp]; + switch (kind) { + .huffman => try self.block_writer.huffmanBlock(buf, final), + 
.store => try self.block_writer.storedBlock(buf, final), + } + self.wp = 0; + } + + // Writes all data from the input reader of uncompressed data. + // It is up to the caller to call flush or finish if there is need to + // output compressed blocks. + pub fn compress(self: *Self, reader: anytype) !void { + while (true) { + // read from rdr into buffer + const buf = self.buffer[self.wp..]; + if (buf.len == 0) { + try self.flushBuffer(false); + continue; + } + const n = try reader.readAll(buf); + self.hasher.update(buf[0..n]); + self.wp += n; + if (n < buf.len) break; // no more data in reader + } + } + }; +} + +const LiteralNode = struct { + literal: u16, + freq: u16, +}; + +// Describes the state of the constructed tree for a given depth. +const LevelInfo = struct { + // Our level. for better printing + level: u32, + + // The frequency of the last node at this level + last_freq: u32, + + // The frequency of the next character to add to this level + next_char_freq: u32, + + // The frequency of the next pair (from level below) to add to this level. + // Only valid if the "needed" value of the next lower level is 0. + next_pair_freq: u32, + + // The number of chains remaining to generate for this level before moving + // up to the next level + needed: u32, +}; + +// hcode is a huffman code with a bit code and bit length. +pub const HuffCode = struct { + code: u16 = 0, + len: u16 = 0, + + // set sets the code and length of an hcode. + fn set(self: *HuffCode, code: u16, length: u16) void { + self.len = length; + self.code = code; + } +}; + +pub fn HuffmanEncoder(comptime size: usize) type { + return struct { + codes: [size]HuffCode = undefined, + // Reusable buffer with the longest possible frequency table. 
+ freq_cache: [huffman.max_num_frequencies + 1]LiteralNode = undefined, + bit_count: [17]u32 = undefined, + lns: []LiteralNode = undefined, // sorted by literal, stored to avoid repeated allocation in generate + lfs: []LiteralNode = undefined, // sorted by frequency, stored to avoid repeated allocation in generate + + const Self = @This(); + + // Update this Huffman Code object to be the minimum code for the specified frequency count. + // + // freq An array of frequencies, in which frequency[i] gives the frequency of literal i. + // max_bits The maximum number of bits to use for any literal. + pub fn generate(self: *Self, freq: []u16, max_bits: u32) void { + var list = self.freq_cache[0 .. freq.len + 1]; + // Number of non-zero literals + var count: u32 = 0; + // Set list to be the set of all non-zero literals and their frequencies + for (freq, 0..) |f, i| { + if (f != 0) { + list[count] = LiteralNode{ .literal = @as(u16, @intCast(i)), .freq = f }; + count += 1; + } else { + list[count] = LiteralNode{ .literal = 0x00, .freq = 0 }; + self.codes[i].len = 0; + } + } + list[freq.len] = LiteralNode{ .literal = 0x00, .freq = 0 }; + + list = list[0..count]; + if (count <= 2) { + // Handle the small cases here, because they are awkward for the general case code. With + // two or fewer literals, everything has bit length 1. + for (list, 0..) |node, i| { + // "list" is in order of increasing literal value. + self.codes[node.literal].set(@as(u16, @intCast(i)), 1); + } + return; + } + self.lfs = list; + mem.sort(LiteralNode, self.lfs, {}, byFreq); + + // Get the number of literals for each bit count + const bit_count = self.bitCounts(list, max_bits); + // And do the assignment + self.assignEncodingAndSize(bit_count, list); + } + + pub fn bitLength(self: *Self, freq: []u16) u32 { + var total: u32 = 0; + for (freq, 0..) 
|f, i| { + if (f != 0) { + total += @as(u32, @intCast(f)) * @as(u32, @intCast(self.codes[i].len)); + } + } + return total; + } + + // Return the number of literals assigned to each bit size in the Huffman encoding + // + // This method is only called when list.len >= 3 + // The cases of 0, 1, and 2 literals are handled by special case code. + // + // list: An array of the literals with non-zero frequencies + // and their associated frequencies. The array is in order of increasing + // frequency, and has as its last element a special element with frequency + // `math.maxInt(i32)` + // + // max_bits: The maximum number of bits that should be used to encode any literal. + // Must be less than 16. + // + // Returns an integer array in which array[i] indicates the number of literals + // that should be encoded in i bits. + fn bitCounts(self: *Self, list: []LiteralNode, max_bits_to_use: usize) []u32 { + var max_bits = max_bits_to_use; + const n = list.len; + const max_bits_limit = 16; + + assert(max_bits < max_bits_limit); + + // The tree can't have greater depth than n - 1, no matter what. This + // saves a little bit of work in some small cases + max_bits = @min(max_bits, n - 1); + + // Create information about each of the levels. + // A bogus "Level 0" whose sole purpose is so that + // level1.prev.needed == 0. This makes level1.next_pair_freq + // be a legitimate value that never gets chosen. + var levels: [max_bits_limit]LevelInfo = mem.zeroes([max_bits_limit]LevelInfo); + // leaf_counts[i] counts the number of literals at the left + // of ancestors of the rightmost node at level i. + // leaf_counts[i][j] is the number of literals at the left + // of the level j ancestor. + var leaf_counts: [max_bits_limit][max_bits_limit]u32 = mem.zeroes([max_bits_limit][max_bits_limit]u32); + + { + var level = @as(u32, 1); + while (level <= max_bits) : (level += 1) { + // For every level, the first two items are the first two characters. 
+ // We initialize the levels as if we had already figured this out. + levels[level] = LevelInfo{ + .level = level, + .last_freq = list[1].freq, + .next_char_freq = list[2].freq, + .next_pair_freq = list[0].freq + list[1].freq, + .needed = 0, + }; + leaf_counts[level][level] = 2; + if (level == 1) { + levels[level].next_pair_freq = math.maxInt(i32); + } + } + } + + // We need a total of 2*n - 2 items at top level and have already generated 2. + levels[max_bits].needed = 2 * @as(u32, @intCast(n)) - 4; + + { + var level = max_bits; + while (true) { + var l = &levels[level]; + if (l.next_pair_freq == math.maxInt(i32) and l.next_char_freq == math.maxInt(i32)) { + // We've run out of both leaves and pairs. + // End all calculations for this level. + // To make sure we never come back to this level or any lower level, + // set next_pair_freq impossibly large. + l.needed = 0; + levels[level + 1].next_pair_freq = math.maxInt(i32); + level += 1; + continue; + } + + const prev_freq = l.last_freq; + if (l.next_char_freq < l.next_pair_freq) { + // The next item on this row is a leaf node. + const next = leaf_counts[level][level] + 1; + l.last_freq = l.next_char_freq; + // Lower leaf_counts are the same of the previous node. + leaf_counts[level][level] = next; + if (next >= list.len) { + l.next_char_freq = maxNode().freq; + } else { + l.next_char_freq = list[next].freq; + } + } else { + // The next item on this row is a pair from the previous row. + // next_pair_freq isn't valid until we generate two + // more values in the level below + l.last_freq = l.next_pair_freq; + // Take leaf counts from the lower level, except counts[level] remains the same. + @memcpy(leaf_counts[level][0..level], leaf_counts[level - 1][0..level]); + levels[l.level - 1].needed = 2; + } + + l.needed -= 1; + if (l.needed == 0) { + // We've done everything we need to do for this level. + // Continue calculating one level up. 
Fill in next_pair_freq + // of that level with the sum of the two nodes we've just calculated on + // this level. + if (l.level == max_bits) { + // All done! + break; + } + levels[l.level + 1].next_pair_freq = prev_freq + l.last_freq; + level += 1; + } else { + // If we stole from below, move down temporarily to replenish it. + while (levels[level - 1].needed > 0) { + level -= 1; + if (level == 0) { + break; + } + } + } + } + } + + // Somethings is wrong if at the end, the top level is null or hasn't used + // all of the leaves. + assert(leaf_counts[max_bits][max_bits] == n); + + var bit_count = self.bit_count[0 .. max_bits + 1]; + var bits: u32 = 1; + const counts = &leaf_counts[max_bits]; + { + var level = max_bits; + while (level > 0) : (level -= 1) { + // counts[level] gives the number of literals requiring at least "bits" + // bits to encode. + bit_count[bits] = counts[level] - counts[level - 1]; + bits += 1; + if (level == 0) { + break; + } + } + } + return bit_count; + } + + // Look at the leaves and assign them a bit count and an encoding as specified + // in RFC 1951 3.2.2 + fn assignEncodingAndSize(self: *Self, bit_count: []u32, list_arg: []LiteralNode) void { + var code = @as(u16, 0); + var list = list_arg; + + for (bit_count, 0..) |bits, n| { + code <<= 1; + if (n == 0 or bits == 0) { + continue; + } + // The literals list[list.len-bits] .. list[list.len-bits] + // are encoded using "bits" bits, and get the values + // code, code + 1, .... The code values are + // assigned in literal order (not frequency order). + const chunk = list[list.len - @as(u32, @intCast(bits)) ..]; + + self.lns = chunk; + mem.sort(LiteralNode, self.lns, {}, byLiteral); + + for (chunk) |node| { + self.codes[node.literal] = HuffCode{ + .code = bitReverse(u16, code, @as(u5, @intCast(n))), + .len = @as(u16, @intCast(n)), + }; + code += 1; + } + list = list[0 .. 
list.len - @as(u32, @intCast(bits))]; + } + } + }; +} + +fn maxNode() LiteralNode { + return LiteralNode{ + .literal = math.maxInt(u16), + .freq = math.maxInt(u16), + }; +} + +pub fn huffmanEncoder(comptime size: u32) HuffmanEncoder(size) { + return .{}; +} + +pub const LiteralEncoder = HuffmanEncoder(huffman.max_num_frequencies); +pub const DistanceEncoder = HuffmanEncoder(huffman.distance_code_count); +pub const CodegenEncoder = HuffmanEncoder(19); + +// Generates a HuffmanCode corresponding to the fixed literal table +pub fn fixedLiteralEncoder() LiteralEncoder { + var h: LiteralEncoder = undefined; + var ch: u16 = 0; + + while (ch < huffman.max_num_frequencies) : (ch += 1) { + var bits: u16 = undefined; + var size: u16 = undefined; + switch (ch) { + 0...143 => { + // size 8, 000110000 .. 10111111 + bits = ch + 48; + size = 8; + }, + 144...255 => { + // size 9, 110010000 .. 111111111 + bits = ch + 400 - 144; + size = 9; + }, + 256...279 => { + // size 7, 0000000 .. 0010111 + bits = ch - 256; + size = 7; + }, + else => { + // size 8, 11000000 .. 11000111 + bits = ch + 192 - 280; + size = 8; + }, + } + h.codes[ch] = HuffCode{ .code = bitReverse(u16, bits, @as(u5, @intCast(size))), .len = size }; + } + return h; +} + +pub fn fixedDistanceEncoder() DistanceEncoder { + var h: DistanceEncoder = undefined; + for (h.codes, 0..) |_, ch| { + h.codes[ch] = HuffCode{ .code = bitReverse(u16, @as(u16, @intCast(ch)), 5), .len = 5 }; + } + return h; +} + +pub fn huffmanDistanceEncoder() DistanceEncoder { + var distance_freq = [1]u16{0} ** huffman.distance_code_count; + distance_freq[0] = 1; + // huff_distance is a static distance encoder used for huffman only encoding. + // It can be reused since we will not be encoding distance values. 
+ var h: DistanceEncoder = .{}; + h.generate(distance_freq[0..], 15); + return h; +} + +fn byLiteral(context: void, a: LiteralNode, b: LiteralNode) bool { + _ = context; + return a.literal < b.literal; +} + +fn byFreq(context: void, a: LiteralNode, b: LiteralNode) bool { + _ = context; + if (a.freq == b.freq) { + return a.literal < b.literal; + } + return a.freq < b.freq; +} + +test "generate a Huffman code from an array of frequencies" { + var freqs: [19]u16 = [_]u16{ + 8, // 0 + 1, // 1 + 1, // 2 + 2, // 3 + 5, // 4 + 10, // 5 + 9, // 6 + 1, // 7 + 0, // 8 + 0, // 9 + 0, // 10 + 0, // 11 + 0, // 12 + 0, // 13 + 0, // 14 + 0, // 15 + 1, // 16 + 3, // 17 + 5, // 18 + }; + + var enc = huffmanEncoder(19); + enc.generate(freqs[0..], 7); + + try testing.expectEqual(@as(u32, 141), enc.bitLength(freqs[0..])); + + try testing.expectEqual(@as(usize, 3), enc.codes[0].len); + try testing.expectEqual(@as(usize, 6), enc.codes[1].len); + try testing.expectEqual(@as(usize, 6), enc.codes[2].len); + try testing.expectEqual(@as(usize, 5), enc.codes[3].len); + try testing.expectEqual(@as(usize, 3), enc.codes[4].len); + try testing.expectEqual(@as(usize, 2), enc.codes[5].len); + try testing.expectEqual(@as(usize, 2), enc.codes[6].len); + try testing.expectEqual(@as(usize, 6), enc.codes[7].len); + try testing.expectEqual(@as(usize, 0), enc.codes[8].len); + try testing.expectEqual(@as(usize, 0), enc.codes[9].len); + try testing.expectEqual(@as(usize, 0), enc.codes[10].len); + try testing.expectEqual(@as(usize, 0), enc.codes[11].len); + try testing.expectEqual(@as(usize, 0), enc.codes[12].len); + try testing.expectEqual(@as(usize, 0), enc.codes[13].len); + try testing.expectEqual(@as(usize, 0), enc.codes[14].len); + try testing.expectEqual(@as(usize, 0), enc.codes[15].len); + try testing.expectEqual(@as(usize, 6), enc.codes[16].len); + try testing.expectEqual(@as(usize, 5), enc.codes[17].len); + try testing.expectEqual(@as(usize, 3), enc.codes[18].len); + + try 
testing.expectEqual(@as(u16, 0x0), enc.codes[5].code); + try testing.expectEqual(@as(u16, 0x2), enc.codes[6].code); + try testing.expectEqual(@as(u16, 0x1), enc.codes[0].code); + try testing.expectEqual(@as(u16, 0x5), enc.codes[4].code); + try testing.expectEqual(@as(u16, 0x3), enc.codes[18].code); + try testing.expectEqual(@as(u16, 0x7), enc.codes[3].code); + try testing.expectEqual(@as(u16, 0x17), enc.codes[17].code); + try testing.expectEqual(@as(u16, 0x0f), enc.codes[1].code); + try testing.expectEqual(@as(u16, 0x2f), enc.codes[2].code); + try testing.expectEqual(@as(u16, 0x1f), enc.codes[7].code); + try testing.expectEqual(@as(u16, 0x3f), enc.codes[16].code); +} + +test "generate a Huffman code for the fixed literal table specific to Deflate" { + const enc = fixedLiteralEncoder(); + for (enc.codes) |c| { + switch (c.len) { + 7 => { + const v = @bitReverse(@as(u7, @intCast(c.code))); + try testing.expect(v <= 0b0010111); + }, + 8 => { + const v = @bitReverse(@as(u8, @intCast(c.code))); + try testing.expect((v >= 0b000110000 and v <= 0b10111111) or + (v >= 0b11000000 and v <= 0b11000111)); + }, + 9 => { + const v = @bitReverse(@as(u9, @intCast(c.code))); + try testing.expect(v >= 0b110010000 and v <= 0b111111111); + }, + else => unreachable, + } + } +} + +test "generate a Huffman code for the 30 possible relative distances (LZ77 distances) of Deflate" { + const enc = fixedDistanceEncoder(); + for (enc.codes) |c| { + const v = @bitReverse(@as(u5, @intCast(c.code))); + try testing.expect(v <= 29); + try testing.expect(c.len == 5); + } +} + +// Reverse bit-by-bit an N-bit code. 
+fn bitReverse(comptime T: type, value: T, n: usize) T { + const r = @bitReverse(value); + return r >> @as(math.Log2Int(T), @intCast(@typeInfo(T).int.bits - n)); +} + +test bitReverse { + const ReverseBitsTest = struct { + in: u16, + bit_count: u5, + out: u16, + }; + + const reverse_bits_tests = [_]ReverseBitsTest{ + .{ .in = 1, .bit_count = 1, .out = 1 }, + .{ .in = 1, .bit_count = 2, .out = 2 }, + .{ .in = 1, .bit_count = 3, .out = 4 }, + .{ .in = 1, .bit_count = 4, .out = 8 }, + .{ .in = 1, .bit_count = 5, .out = 16 }, + .{ .in = 17, .bit_count = 5, .out = 17 }, + .{ .in = 257, .bit_count = 9, .out = 257 }, + .{ .in = 29, .bit_count = 5, .out = 23 }, + }; + + for (reverse_bits_tests) |h| { + const v = bitReverse(u16, h.in, h.bit_count); + try std.testing.expectEqual(h.out, v); + } +} + +test "fixedLiteralEncoder codes" { + var al = std.ArrayList(u8).init(testing.allocator); + defer al.deinit(); + var bw = std.io.bitWriter(.little, al.writer()); + + const f = fixedLiteralEncoder(); + for (f.codes) |c| { + try bw.writeBits(c.code, c.len); + } + try testing.expectEqualSlices(u8, &fixed_codes, al.items); +} + +pub const fixed_codes = [_]u8{ + 0b00001100, 0b10001100, 0b01001100, 0b11001100, 0b00101100, 0b10101100, 0b01101100, 0b11101100, + 0b00011100, 0b10011100, 0b01011100, 0b11011100, 0b00111100, 0b10111100, 0b01111100, 0b11111100, + 0b00000010, 0b10000010, 0b01000010, 0b11000010, 0b00100010, 0b10100010, 0b01100010, 0b11100010, + 0b00010010, 0b10010010, 0b01010010, 0b11010010, 0b00110010, 0b10110010, 0b01110010, 0b11110010, + 0b00001010, 0b10001010, 0b01001010, 0b11001010, 0b00101010, 0b10101010, 0b01101010, 0b11101010, + 0b00011010, 0b10011010, 0b01011010, 0b11011010, 0b00111010, 0b10111010, 0b01111010, 0b11111010, + 0b00000110, 0b10000110, 0b01000110, 0b11000110, 0b00100110, 0b10100110, 0b01100110, 0b11100110, + 0b00010110, 0b10010110, 0b01010110, 0b11010110, 0b00110110, 0b10110110, 0b01110110, 0b11110110, + 0b00001110, 0b10001110, 0b01001110, 0b11001110, 
0b00101110, 0b10101110, 0b01101110, 0b11101110, + 0b00011110, 0b10011110, 0b01011110, 0b11011110, 0b00111110, 0b10111110, 0b01111110, 0b11111110, + 0b00000001, 0b10000001, 0b01000001, 0b11000001, 0b00100001, 0b10100001, 0b01100001, 0b11100001, + 0b00010001, 0b10010001, 0b01010001, 0b11010001, 0b00110001, 0b10110001, 0b01110001, 0b11110001, + 0b00001001, 0b10001001, 0b01001001, 0b11001001, 0b00101001, 0b10101001, 0b01101001, 0b11101001, + 0b00011001, 0b10011001, 0b01011001, 0b11011001, 0b00111001, 0b10111001, 0b01111001, 0b11111001, + 0b00000101, 0b10000101, 0b01000101, 0b11000101, 0b00100101, 0b10100101, 0b01100101, 0b11100101, + 0b00010101, 0b10010101, 0b01010101, 0b11010101, 0b00110101, 0b10110101, 0b01110101, 0b11110101, + 0b00001101, 0b10001101, 0b01001101, 0b11001101, 0b00101101, 0b10101101, 0b01101101, 0b11101101, + 0b00011101, 0b10011101, 0b01011101, 0b11011101, 0b00111101, 0b10111101, 0b01111101, 0b11111101, + 0b00010011, 0b00100110, 0b01001110, 0b10011010, 0b00111100, 0b01100101, 0b11101010, 0b10110100, + 0b11101001, 0b00110011, 0b01100110, 0b11001110, 0b10011010, 0b00111101, 0b01100111, 0b11101110, + 0b10111100, 0b11111001, 0b00001011, 0b00010110, 0b00101110, 0b01011010, 0b10111100, 0b01100100, + 0b11101001, 0b10110010, 0b11100101, 0b00101011, 0b01010110, 0b10101110, 0b01011010, 0b10111101, + 0b01100110, 0b11101101, 0b10111010, 0b11110101, 0b00011011, 0b00110110, 0b01101110, 0b11011010, + 0b10111100, 0b01100101, 0b11101011, 0b10110110, 0b11101101, 0b00111011, 0b01110110, 0b11101110, + 0b11011010, 0b10111101, 0b01100111, 0b11101111, 0b10111110, 0b11111101, 0b00000111, 0b00001110, + 0b00011110, 0b00111010, 0b01111100, 0b11100100, 0b11101000, 0b10110001, 0b11100011, 0b00100111, + 0b01001110, 0b10011110, 0b00111010, 0b01111101, 0b11100110, 0b11101100, 0b10111001, 0b11110011, + 0b00010111, 0b00101110, 0b01011110, 0b10111010, 0b01111100, 0b11100101, 0b11101010, 0b10110101, + 0b11101011, 0b00110111, 0b01101110, 0b11011110, 0b10111010, 0b01111101, 0b11100111, 
0b11101110, + 0b10111101, 0b11111011, 0b00001111, 0b00011110, 0b00111110, 0b01111010, 0b11111100, 0b11100100, + 0b11101001, 0b10110011, 0b11100111, 0b00101111, 0b01011110, 0b10111110, 0b01111010, 0b11111101, + 0b11100110, 0b11101101, 0b10111011, 0b11110111, 0b00011111, 0b00111110, 0b01111110, 0b11111010, + 0b11111100, 0b11100101, 0b11101011, 0b10110111, 0b11101111, 0b00111111, 0b01111110, 0b11111110, + 0b11111010, 0b11111101, 0b11100111, 0b11101111, 0b10111111, 0b11111111, 0b00000000, 0b00100000, + 0b00001000, 0b00001100, 0b10000001, 0b11000010, 0b11100000, 0b00001000, 0b00100100, 0b00001010, + 0b10001101, 0b11000001, 0b11100010, 0b11110000, 0b00000100, 0b00100010, 0b10001001, 0b01001100, + 0b10100001, 0b11010010, 0b11101000, 0b00000011, 0b10000011, 0b01000011, 0b11000011, 0b00100011, + 0b10100011, +}; + +test "tokenization" { + const L = Token.initLiteral; + const M = Token.initMatch; + + const cases = [_]struct { + data: []const u8, + tokens: []const Token, + }{ + .{ + .data = "Blah blah blah blah blah!", + .tokens = &[_]Token{ L('B'), L('l'), L('a'), L('h'), L(' '), L('b'), M(5, 18), L('!') }, + }, + .{ + .data = "ABCDEABCD ABCDEABCD", + .tokens = &[_]Token{ + L('A'), L('B'), L('C'), L('D'), L('E'), L('A'), L('B'), L('C'), L('D'), L(' '), + L('A'), M(10, 8), + }, + }, + }; + + for (cases) |c| { + inline for (Container.list) |container| { // for each wrapping + + var cw = io.countingWriter(io.null_writer); + const cww = cw.writer(); + var df = try Compress(container, @TypeOf(cww), TestTokenWriter).init(cww, .{}); + + _ = try df.write(c.data); + try df.flush(); + + // df.token_writer.show(); + try expect(df.block_writer.pos == c.tokens.len); // number of tokens written + try testing.expectEqualSlices(Token, df.block_writer.get(), c.tokens); // tokens match + + try testing.expectEqual(container.headerSize(), cw.bytes_written); + try df.finish(); + try testing.expectEqual(container.size(), cw.bytes_written); + } + } +} + +// Tests that tokens written are equal to 
expected token list. +const TestTokenWriter = struct { + const Self = @This(); + + pos: usize = 0, + actual: [128]Token = undefined, + + pub fn init(_: anytype) Self { + return .{}; + } + pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void { + for (tokens) |t| { + self.actual[self.pos] = t; + self.pos += 1; + } + } + + pub fn storedBlock(_: *Self, _: []const u8, _: bool) !void {} + + pub fn get(self: *Self) []Token { + return self.actual[0..self.pos]; + } + + pub fn show(self: *Self) void { + std.debug.print("\n", .{}); + for (self.get()) |t| { + t.show(); + } + } + + pub fn flush(_: *Self) !void {} +}; + +test "file tokenization" { + const levels = [_]Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 }; + const cases = [_]struct { + data: []const u8, // uncompressed content + // expected number of tokens producet in deflate tokenization + tokens_count: [levels.len]usize = .{0} ** levels.len, + }{ + .{ + .data = @embedFile("testdata/rfc1951.txt"), + .tokens_count = .{ 7675, 7672, 7599, 7594, 7598, 7599 }, + }, + + .{ + .data = @embedFile("testdata/block_writer/huffman-null-max.input"), + .tokens_count = .{ 257, 257, 257, 257, 257, 257 }, + }, + .{ + .data = @embedFile("testdata/block_writer/huffman-pi.input"), + .tokens_count = .{ 2570, 2564, 2564, 2564, 2564, 2564 }, + }, + .{ + .data = @embedFile("testdata/block_writer/huffman-text.input"), + .tokens_count = .{ 235, 234, 234, 234, 234, 234 }, + }, + .{ + .data = @embedFile("testdata/fuzz/roundtrip1.input"), + .tokens_count = .{ 333, 331, 331, 331, 331, 331 }, + }, + .{ + .data = @embedFile("testdata/fuzz/roundtrip2.input"), + .tokens_count = .{ 334, 334, 334, 334, 334, 334 }, + }, + }; + + for (cases) |case| { // for each case + const data = case.data; + + for (levels, 0..) 
|level, i| { // for each compression level + var original = io.fixedBufferStream(data); + + // buffer for decompressed data + var al = std.ArrayList(u8).init(testing.allocator); + defer al.deinit(); + const writer = al.writer(); + + // create compressor + const WriterType = @TypeOf(writer); + const TokenWriter = TokenDecoder(@TypeOf(writer)); + var cmp = try Compress(.raw, WriterType, TokenWriter).init(writer, .{ .level = level }); + + // Stream uncompressed `original` data to the compressor. It will + // produce tokens list and pass that list to the TokenDecoder. This + // TokenDecoder uses CircularBuffer from inflate to convert list of + // tokens back to the uncompressed stream. + try cmp.compress(original.reader()); + try cmp.flush(); + const expected_count = case.tokens_count[i]; + const actual = cmp.block_writer.tokens_count; + if (expected_count == 0) { + std.debug.print("actual token count {d}\n", .{actual}); + } else { + try testing.expectEqual(expected_count, actual); + } + + try testing.expectEqual(data.len, al.items.len); + try testing.expectEqualSlices(u8, data, al.items); + } + } +} + +const TokenDecoder = struct { + output: *std.io.BufferedWriter, + tokens_count: usize, + + pub fn init(output: *std.io.BufferedWriter) TokenDecoder { + return .{ + .output = output, + .tokens_count = 0, + }; + } + + pub fn write(self: *TokenDecoder, tokens: []const Token, _: bool, _: ?[]const u8) !void { + self.tokens_count += tokens.len; + for (tokens) |t| { + switch (t.kind) { + .literal => self.hist.write(t.literal()), + .match => try self.hist.writeMatch(t.length(), t.distance()), + } + if (self.hist.free() < 285) try self.flushWin(); + } + try self.flushWin(); + } + + fn flushWin(self: *TokenDecoder) !void { + while (true) { + const buf = self.hist.read(); + if (buf.len == 0) break; + try self.output.writeAll(buf); + } + } +}; + +test "store simple compressor" { + const data = "Hello world!"; + const expected = [_]u8{ + 0x1, // block type 0, final bit set + 0xc, 
0x0, // len = 12 + 0xf3, 0xff, // ~len + 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', // + //0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21, + }; + + var fbs = std.io.fixedBufferStream(data); + var al = std.ArrayList(u8).init(testing.allocator); + defer al.deinit(); + + var cmp = try store.compressor(.raw, al.writer()); + try cmp.compress(fbs.reader()); + try cmp.finish(); + try testing.expectEqualSlices(u8, &expected, al.items); + + fbs.reset(); + try al.resize(0); + + // huffman only compressor will also emit store block for this small sample + var hc = try huffman.compressor(.raw, al.writer()); + try hc.compress(fbs.reader()); + try hc.finish(); + try testing.expectEqualSlices(u8, &expected, al.items); +} + +test "sliding window match" { + const data = "Blah blah blah blah blah!"; + var win: std.io.BufferedWriter = .{}; + try expect(win.write(data) == data.len); + try expect(win.wp == data.len); + try expect(win.rp == 0); + + // length between l symbols + try expect(win.match(1, 6, 0) == 18); + try expect(win.match(1, 11, 0) == 13); + try expect(win.match(1, 16, 0) == 8); + try expect(win.match(1, 21, 0) == 0); + + // position 15 = "blah blah!" + // position 20 = "blah!" 
+ try expect(win.match(15, 20, 0) == 4); + try expect(win.match(15, 20, 3) == 4); + try expect(win.match(15, 20, 4) == 0); +} + +test "sliding window slide" { + var win: std.io.BufferedWriter = .{}; + win.wp = std.io.BufferedWriter.buffer_len - 11; + win.rp = std.io.BufferedWriter.buffer_len - 111; + win.buffer[win.rp] = 0xab; + try expect(win.lookahead().len == 100); + try expect(win.tokensBuffer().?.len == win.rp); + + const n = win.slide(); + try expect(n == 32757); + try expect(win.buffer[win.rp] == 0xab); + try expect(win.rp == std.io.BufferedWriter.hist_len - 111); + try expect(win.wp == std.io.BufferedWriter.hist_len - 11); + try expect(win.lookahead().len == 100); + try expect(win.tokensBuffer() == null); +} diff --git a/lib/std/compress/flate/Decompress.zig b/lib/std/compress/flate/Decompress.zig new file mode 100644 index 0000000000..76a4566d76 --- /dev/null +++ b/lib/std/compress/flate/Decompress.zig @@ -0,0 +1,891 @@ +//! Inflate decompresses deflate bit stream. Reads compressed data from reader +//! provided in init. Decompressed data are stored in internal hist buffer and +//! can be accesses iterable `next` or reader interface. +//! +//! Container defines header/footer wrapper around deflate bit stream. Can be +//! gzip or zlib. +//! +//! Deflate bit stream consists of multiple blocks. Block can be one of three types: +//! * stored, non compressed, max 64k in size +//! * fixed, huffman codes are predefined +//! * dynamic, huffman code tables are encoded at the block start +//! +//! `step` function runs decoder until internal `hist` buffer is full. Client than needs to read +//! that data in order to proceed with decoding. +//! +//! Allocates 74.5K of internal buffers, most important are: +//! * 64K for history (CircularBuffer) +//! 
* ~10K huffman decoders (Literal and DistanceDecoder) + +const std = @import("../../std.zig"); +const flate = std.compress.flate; +const Container = flate.Container; +const Token = @import("Token.zig"); +const testing = std.testing; + +input: *std.io.BufferedReader, +// Hashes, produces checksum, of uncompressed data for gzip/zlib footer. +hasher: Container.Hasher(), + +// dynamic block huffman code decoders +lit_dec: LiteralDecoder, +dst_dec: DistanceDecoder, + +// current read state +bfinal: u1, +block_type: u2, +state: ReadState, + +read_err: Error!void, + +const ReadState = enum { + protocol_header, + block_header, + block, + protocol_footer, + end, +}; + +const Decompress = @This(); + +pub const Error = Container.Error || error{ + InvalidCode, + InvalidMatch, + InvalidBlockType, + WrongStoredBlockNlen, + InvalidDynamicBlockHeader, + EndOfStream, + ReadFailed, + OversubscribedHuffmanTree, + IncompleteHuffmanTree, + MissingEndOfBlockCode, +}; + +pub fn init(input: *std.io.BufferedReader) Decompress { + return .{ + .input = input, + .hasher = .{}, + .lit_dec = .{}, + .dst_dec = .{}, + .bfinal = 0, + .block_type = 0b11, + .state = .protocol_header, + .read_err = {}, + }; +} + +fn blockHeader(self: *Decompress) Error!void { + self.bfinal = try self.bits.read(u1); + self.block_type = try self.bits.read(u2); +} + +fn storedBlock(self: *Decompress) !bool { + self.bits.alignToByte(); // skip padding until byte boundary + // everything after this is byte aligned in stored block + var len = try self.bits.read(u16); + const nlen = try self.bits.read(u16); + if (len != ~nlen) return error.WrongStoredBlockNlen; + + while (len > 0) { + const buf = self.hist.getWritable(len); + try self.bits.readAll(buf); + len -= @intCast(buf.len); + } + return true; +} + +fn fixedBlock(self: *Decompress) !bool { + while (!self.hist.full()) { + const code = try self.bits.readFixedCode(); + switch (code) { + 0...255 => self.hist.write(@intCast(code)), + 256 => return true, // end of block + 
257...285 => try self.fixedDistanceCode(@intCast(code - 257)), + else => return error.InvalidCode, + } + } + return false; +} + +// Handles fixed block non literal (length) code. +// Length code is followed by 5 bits of distance code. +fn fixedDistanceCode(self: *Decompress, code: u8) !void { + try self.bits.fill(5 + 5 + 13); + const length = try self.decodeLength(code); + const distance = try self.decodeDistance(try self.bits.readF(u5, .{ + .buffered = true, + .reverse = true, + })); + try self.hist.writeMatch(length, distance); +} + +fn decodeLength(self: *Decompress, code: u8) !u16 { + if (code > 28) return error.InvalidCode; + const ml = Token.matchLength(code); + return if (ml.extra_bits == 0) // 0 - 5 extra bits + ml.base + else + ml.base + try self.bits.readN(ml.extra_bits, .{ .buffered = true }); +} + +fn decodeDistance(self: *Decompress, code: u8) !u16 { + if (code > 29) return error.InvalidCode; + const md = Token.matchDistance(code); + return if (md.extra_bits == 0) // 0 - 13 extra bits + md.base + else + md.base + try self.bits.readN(md.extra_bits, .{ .buffered = true }); +} + +fn dynamicBlockHeader(self: *Decompress) !void { + const hlit: u16 = @as(u16, try self.bits.read(u5)) + 257; // number of ll code entries present - 257 + const hdist: u16 = @as(u16, try self.bits.read(u5)) + 1; // number of distance code entries - 1 + const hclen: u8 = @as(u8, try self.bits.read(u4)) + 4; // hclen + 4 code lengths are encoded + + if (hlit > 286 or hdist > 30) + return error.InvalidDynamicBlockHeader; + + // lengths for code lengths + var cl_lens = [_]u4{0} ** 19; + for (0..hclen) |i| { + cl_lens[flate.huffman.codegen_order[i]] = try self.bits.read(u3); + } + var cl_dec: CodegenDecoder = .{}; + try cl_dec.generate(&cl_lens); + + // decoded code lengths + var dec_lens = [_]u4{0} ** (286 + 30); + var pos: usize = 0; + while (pos < hlit + hdist) { + const sym = try cl_dec.find(try self.bits.peekF(u7, .{ .reverse = true })); + try self.bits.shift(sym.code_bits); + pos 
+= try self.dynamicCodeLength(sym.symbol, &dec_lens, pos); + } + if (pos > hlit + hdist) { + return error.InvalidDynamicBlockHeader; + } + + // literal code lengths to literal decoder + try self.lit_dec.generate(dec_lens[0..hlit]); + + // distance code lengths to distance decoder + try self.dst_dec.generate(dec_lens[hlit .. hlit + hdist]); +} + +// Decode code length symbol to code length. Writes decoded length into +// lens slice starting at position pos. Returns number of positions +// advanced. +fn dynamicCodeLength(self: *Decompress, code: u16, lens: []u4, pos: usize) !usize { + if (pos >= lens.len) + return error.InvalidDynamicBlockHeader; + + switch (code) { + 0...15 => { + // Represent code lengths of 0 - 15 + lens[pos] = @intCast(code); + return 1; + }, + 16 => { + // Copy the previous code length 3 - 6 times. + // The next 2 bits indicate repeat length + const n: u8 = @as(u8, try self.bits.read(u2)) + 3; + if (pos == 0 or pos + n > lens.len) + return error.InvalidDynamicBlockHeader; + for (0..n) |i| { + lens[pos + i] = lens[pos + i - 1]; + } + return n; + }, + // Repeat a code length of 0 for 3 - 10 times. (3 bits of length) + 17 => return @as(u8, try self.bits.read(u3)) + 3, + // Repeat a code length of 0 for 11 - 138 times (7 bits of length) + 18 => return @as(u8, try self.bits.read(u7)) + 11, + else => return error.InvalidDynamicBlockHeader, + } +} + +// In larger archives most blocks are usually dynamic, so decompression +// performance depends on this function. +fn dynamicBlock(self: *Decompress) !bool { + // Hot path loop! 
+ while (!self.hist.full()) { + // optimization so other bit reads can be buffered (avoiding one `if` in hot path) + try self.bits.fill(15); + const sym = try self.decodeSymbol(&self.lit_dec); + + switch (sym.kind) { + .literal => self.hist.write(sym.symbol), + .match => { + // Decode match backreference + try self.bits.fill(5 + 15 + 13); + const length = try self.decodeLength(sym.symbol); + const dsm = try self.decodeSymbol(&self.dst_dec); + const distance = try self.decodeDistance(dsm.symbol); + try self.hist.writeMatch(length, distance); + }, + .end_of_block => return true, + } + } + return false; +} + +// Peek 15 bits from bits reader (maximum code len is 15 bits). Use +// decoder to find symbol for that code. We then know how many bits is +// used. Shift bit reader for that much bits, those bits are used. And +// return symbol. +fn decodeSymbol(self: *Decompress, decoder: anytype) !Symbol { + const sym = try decoder.find(try self.bits.peekF(u15, .{ .buffered = true, .reverse = true })); + try self.bits.shift(sym.code_bits); + return sym; +} + +fn step(self: *Decompress) !void { + switch (self.state) { + .protocol_header => { + try self.hasher.container().parseHeader(&self.bits); + self.state = .block_header; + }, + .block_header => { + try self.blockHeader(); + self.state = .block; + if (self.block_type == 2) try self.dynamicBlockHeader(); + }, + .block => { + const done = switch (self.block_type) { + 0 => try self.storedBlock(), + 1 => try self.fixedBlock(), + 2 => try self.dynamicBlock(), + else => return error.InvalidBlockType, + }; + if (done) { + self.state = if (self.bfinal == 1) .protocol_footer else .block_header; + } + }, + .protocol_footer => { + self.bits.alignToByte(); + try self.hasher.container().parseFooter(&self.hasher, &self.bits); + self.state = .end; + }, + .end => {}, + } +} + +/// Replaces the inner reader with new reader. 
+pub fn setReader(self: *Decompress, new_reader: *std.io.BufferedReader) void { + self.bits.forward_reader = new_reader; + if (self.state == .end or self.state == .protocol_footer) { + self.state = .protocol_header; + } +} + +// Reads all compressed data from the internal reader and outputs plain +// (uncompressed) data to the provided writer. +pub fn decompress(self: *Decompress, writer: *std.io.BufferedWriter) !void { + while (try self.next()) |buf| { + try writer.writeAll(buf); + } +} + +/// Returns the number of bytes that have been read from the internal +/// reader but not yet consumed by the decompressor. +pub fn unreadBytes(self: Decompress) usize { + // There can be no error here: the denominator is not zero, and + // overflow is not possible since the type is unsigned. + return std.math.divCeil(usize, self.bits.nbits, 8) catch unreachable; +} + +// Iterator interface + +/// Can be used in iterator like loop without memcpy to another buffer: +/// while (try inflate.next()) |buf| { ... } +pub fn next(self: *Decompress) Error!?[]const u8 { + const out = try self.get(0); + if (out.len == 0) return null; + return out; +} + +/// Returns decompressed data from internal sliding window buffer. +/// Returned buffer can be any length between 0 and `limit` bytes. 0 +/// returned bytes means end of stream reached. With limit=0 returns as +/// much data as it can. It never will be more than 65536 bytes, which is +/// size of internal buffer. 
+/// TODO merge this logic into readerRead and readerReadVec +pub fn get(self: *Decompress, limit: usize) Error![]const u8 { + while (true) { + const out = self.hist.readAtMost(limit); + if (out.len > 0) { + self.hasher.update(out); + return out; + } + if (self.state == .end) return out; + try self.step(); + } +} + +fn readerRead( + context: ?*anyopaque, + bw: *std.io.BufferedWriter, + limit: std.io.Reader.Limit, +) std.io.Reader.RwError!usize { + const self: *Decompress = @alignCast(@ptrCast(context)); + const out = try bw.writableSliceGreedy(1); + const in = self.get(limit.minInt(out.len)) catch |err| switch (err) { + error.EndOfStream => return error.EndOfStream, + error.ReadFailed => return error.ReadFailed, + else => |e| { + self.read_err = e; + return error.ReadFailed; + }, + }; + if (in.len == 0) return error.EndOfStream; + @memcpy(out[0..in.len], in); + bw.advance(in.len); + return in.len; +} + +fn readerReadVec(context: ?*anyopaque, data: []const []u8) std.io.Reader.Error!usize { + const self: *Decompress = @alignCast(@ptrCast(context)); + return readVec(self, data) catch |err| switch (err) { + error.EndOfStream => return error.EndOfStream, + error.ReadFailed => return error.ReadFailed, + else => |e| { + self.read_err = e; + return error.ReadFailed; + }, + }; +} + +fn readerDiscard(context: ?*anyopaque, limit: std.io.Reader.Limit) std.io.Reader.Error!usize { + _ = context; + _ = limit; + @panic("TODO"); +} + +pub fn readVec(self: *Decompress, data: []const []u8) Error!usize { + for (data) |out| { + if (out.len == 0) continue; + const in = try self.get(out.len); + @memcpy(out[0..in.len], in); + if (in.len == 0) return error.EndOfStream; + return in.len; + } + return 0; +} + +pub fn reader(self: *Decompress) std.io.Reader { + return .{ + .context = self, + .vtable = &.{ + .read = readerRead, + .readVec = readerReadVec, + .discard = readerDiscard, + }, + }; +} + +pub fn readable(self: *Decompress, buffer: []u8) std.io.BufferedReader { + return 
reader(self).buffered(buffer); +} + +pub const Symbol = packed struct { + pub const Kind = enum(u2) { + literal, + end_of_block, + match, + }; + + symbol: u8 = 0, // symbol from alphabet + code_bits: u4 = 0, // number of bits in code 0-15 + kind: Kind = .literal, + + code: u16 = 0, // huffman code of the symbol + next: u16 = 0, // pointer to the next symbol in linked list + // it is safe to use 0 as null pointer, when sorted 0 has shortest code and fits into lookup + + // Sorting less than function. + pub fn asc(_: void, a: Symbol, b: Symbol) bool { + if (a.code_bits == b.code_bits) { + if (a.kind == b.kind) { + return a.symbol < b.symbol; + } + return @intFromEnum(a.kind) < @intFromEnum(b.kind); + } + return a.code_bits < b.code_bits; + } +}; + +pub const LiteralDecoder = HuffmanDecoder(286, 15, 9); +pub const DistanceDecoder = HuffmanDecoder(30, 15, 9); +pub const CodegenDecoder = HuffmanDecoder(19, 7, 7); + +/// Creates huffman tree codes from list of code lengths (in `build`). +/// +/// `find` then finds symbol for code bits. Code can be any length between 1 and +/// 15 bits. When calling `find` we don't know how many bits will be used to +/// find symbol. When symbol is returned it has code_bits field which defines +/// how much we should advance in bit stream. +/// +/// Lookup table is used to map 15 bit int to symbol. Same symbol is written +/// many times in this table; 32K places for 286 (at most) symbols. +/// Small lookup table is optimization for faster search. +/// It is variation of the algorithm explained in [zlib](https://github.com/madler/zlib/blob/643e17b7498d12ab8d15565662880579692f769d/doc/algorithm.txt#L92) +/// with difference that we here use statically allocated arrays. 
+///
+fn HuffmanDecoder(
+    comptime alphabet_size: u16,
+    comptime max_code_bits: u4,
+    comptime lookup_bits: u4,
+) type {
+    const lookup_shift = max_code_bits - lookup_bits;
+
+    return struct {
+        // all symbols in alphabet, sorted by code_len, symbol
+        symbols: [alphabet_size]Symbol = undefined,
+        // lookup table code -> symbol
+        lookup: [1 << lookup_bits]Symbol = undefined,
+
+        const Self = @This();
+
+        /// Generates symbols and lookup tables from list of code lens for each symbol.
+        pub fn generate(self: *Self, lens: []const u4) !void {
+            try checkCompleteness(lens);
+
+            // init alphabet with code_bits
+            for (self.symbols, 0..) |_, i| {
+                const cb: u4 = if (i < lens.len) lens[i] else 0;
+                self.symbols[i] = if (i < 256)
+                    .{ .kind = .literal, .symbol = @intCast(i), .code_bits = cb }
+                else if (i == 256)
+                    .{ .kind = .end_of_block, .symbol = 0xff, .code_bits = cb }
+                else
+                    .{ .kind = .match, .symbol = @intCast(i - 257), .code_bits = cb };
+            }
+            std.sort.heap(Symbol, &self.symbols, {}, Symbol.asc);
+
+            // reset lookup table
+            for (0..self.lookup.len) |i| {
+                self.lookup[i] = .{};
+            }
+
+            // assign code to symbols
+            // reference: https://youtu.be/9_YEGLe33NA?list=PLU4IQLU9e_OrY8oASHx0u3IXAL9TOdidm&t=2639
+            var code: u16 = 0;
+            var idx: u16 = 0;
+            for (&self.symbols, 0..) 
|*sym, pos| { + if (sym.code_bits == 0) continue; // skip unused + sym.code = code; + + const next_code = code + (@as(u16, 1) << (max_code_bits - sym.code_bits)); + const next_idx = next_code >> lookup_shift; + + if (next_idx > self.lookup.len or idx >= self.lookup.len) break; + if (sym.code_bits <= lookup_bits) { + // fill small lookup table + for (idx..next_idx) |j| + self.lookup[j] = sym.*; + } else { + // insert into linked table starting at root + const root = &self.lookup[idx]; + const root_next = root.next; + root.next = @intCast(pos); + sym.next = root_next; + } + + idx = next_idx; + code = next_code; + } + } + + /// Given the list of code lengths check that it represents a canonical + /// Huffman code for n symbols. + /// + /// Reference: https://github.com/madler/zlib/blob/5c42a230b7b468dff011f444161c0145b5efae59/contrib/puff/puff.c#L340 + fn checkCompleteness(lens: []const u4) !void { + if (alphabet_size == 286) + if (lens[256] == 0) return error.MissingEndOfBlockCode; + + var count = [_]u16{0} ** (@as(usize, max_code_bits) + 1); + var max: usize = 0; + for (lens) |n| { + if (n == 0) continue; + if (n > max) max = n; + count[n] += 1; + } + if (max == 0) // empty tree + return; + + // check for an over-subscribed or incomplete set of lengths + var left: usize = 1; // one possible code of zero length + for (1..count.len) |len| { + left <<= 1; // one more bit, double codes left + if (count[len] > left) + return error.OversubscribedHuffmanTree; + left -= count[len]; // deduct count from possible codes + } + if (left > 0) { // left > 0 means incomplete + // incomplete code ok only for single length 1 code + if (max_code_bits > 7 and max == count[0] + count[1]) return; + return error.IncompleteHuffmanTree; + } + } + + /// Finds symbol for lookup table code. 
+        pub fn find(self: *Self, code: u16) !Symbol {
+            // try to find in lookup table
+            const idx = code >> lookup_shift;
+            const sym = self.lookup[idx];
+            if (sym.code_bits != 0) return sym;
+            // if not, use linked list of symbols with same prefix
+            return self.findLinked(code, sym.next);
+        }
+
+        inline fn findLinked(self: *Self, code: u16, start: u16) !Symbol {
+            var pos = start;
+            while (pos > 0) {
+                const sym = self.symbols[pos];
+                const shift = max_code_bits - sym.code_bits;
+                // compare code_bits number of upper bits
+                if ((code ^ sym.code) >> shift == 0) return sym;
+                pos = sym.next;
+            }
+            return error.InvalidCode;
+        }
+    };
+}
+
+test "init/find" {
+    // example data from: https://youtu.be/SJPvNi4HrWQ?t=8423
+    const code_lens = [_]u4{ 4, 3, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 2 };
+    var h: CodegenDecoder = .{};
+    try h.generate(&code_lens);
+
+    const expected = [_]struct {
+        sym: Symbol,
+        code: u16,
+    }{
+        .{
+            .code = 0b00_00000,
+            .sym = .{ .symbol = 3, .code_bits = 2 },
+        },
+        .{
+            .code = 0b01_00000,
+            .sym = .{ .symbol = 18, .code_bits = 2 },
+        },
+        .{
+            .code = 0b100_0000,
+            .sym = .{ .symbol = 1, .code_bits = 3 },
+        },
+        .{
+            .code = 0b101_0000,
+            .sym = .{ .symbol = 4, .code_bits = 3 },
+        },
+        .{
+            .code = 0b110_0000,
+            .sym = .{ .symbol = 17, .code_bits = 3 },
+        },
+        .{
+            .code = 0b1110_000,
+            .sym = .{ .symbol = 0, .code_bits = 4 },
+        },
+        .{
+            .code = 0b1111_000,
+            .sym = .{ .symbol = 16, .code_bits = 4 },
+        },
+    };
+
+    // unused symbols
+    for (0..12) |i| {
+        try testing.expectEqual(0, h.symbols[i].code_bits);
+    }
+    // used, from index 12
+    for (expected, 12..) |e, i| {
+        try testing.expectEqual(e.sym.symbol, h.symbols[i].symbol);
+        try testing.expectEqual(e.sym.code_bits, h.symbols[i].code_bits);
+        const sym_from_code = try h.find(e.code);
+        try testing.expectEqual(e.sym.symbol, sym_from_code.symbol);
+    }
+
+    // All possible codes for each symbol.
+    // Lookup table has 128 elements, to cover all possible 7 bit codes. 
+ for (0b0000_000..0b0100_000) |c| // 0..32 (32) + try testing.expectEqual(3, (try h.find(@intCast(c))).symbol); + + for (0b0100_000..0b1000_000) |c| // 32..64 (32) + try testing.expectEqual(18, (try h.find(@intCast(c))).symbol); + + for (0b1000_000..0b1010_000) |c| // 64..80 (16) + try testing.expectEqual(1, (try h.find(@intCast(c))).symbol); + + for (0b1010_000..0b1100_000) |c| // 80..96 (16) + try testing.expectEqual(4, (try h.find(@intCast(c))).symbol); + + for (0b1100_000..0b1110_000) |c| // 96..112 (16) + try testing.expectEqual(17, (try h.find(@intCast(c))).symbol); + + for (0b1110_000..0b1111_000) |c| // 112..120 (8) + try testing.expectEqual(0, (try h.find(@intCast(c))).symbol); + + for (0b1111_000..0b1_0000_000) |c| // 120...128 (8) + try testing.expectEqual(16, (try h.find(@intCast(c))).symbol); +} + +test "encode/decode literals" { + const LiteralEncoder = std.compress.flate.Compress.LiteralEncoder; + + for (1..286) |j| { // for all different number of codes + var enc: LiteralEncoder = .{}; + // create frequencies + var freq = [_]u16{0} ** 286; + freq[256] = 1; // ensure we have end of block code + for (&freq, 1..) |*f, i| { + if (i % j == 0) + f.* = @intCast(i); + } + + // encoder from frequencies + enc.generate(&freq, 15); + + // get code_lens from encoder + var code_lens = [_]u4{0} ** 286; + for (code_lens, 0..) 
|_, i| { + code_lens[i] = @intCast(enc.codes[i].len); + } + // generate decoder from code lens + var dec: LiteralDecoder = .{}; + try dec.generate(&code_lens); + + // expect decoder code to match original encoder code + for (dec.symbols) |s| { + if (s.code_bits == 0) continue; + const c_code: u16 = @bitReverse(@as(u15, @intCast(s.code))); + const symbol: u16 = switch (s.kind) { + .literal => s.symbol, + .end_of_block => 256, + .match => @as(u16, s.symbol) + 257, + }; + + const c = enc.codes[symbol]; + try testing.expect(c.code == c_code); + } + + // find each symbol by code + for (enc.codes) |c| { + if (c.len == 0) continue; + + const s_code: u15 = @bitReverse(@as(u15, @intCast(c.code))); + const s = try dec.find(s_code); + try testing.expect(s.code == s_code); + try testing.expect(s.code_bits == c.len); + } + } +} + +test "decompress" { + const cases = [_]struct { + in: []const u8, + out: []const u8, + }{ + // non compressed block (type 0) + .{ + .in = &[_]u8{ + 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen + 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data + }, + .out = "Hello world\n", + }, + // fixed code block (type 1) + .{ + .in = &[_]u8{ + 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1 + 0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00, + }, + .out = "Hello world\n", + }, + // dynamic block (type 2) + .{ + .in = &[_]u8{ + 0x3d, 0xc6, 0x39, 0x11, 0x00, 0x00, 0x0c, 0x02, // deflate data block type 2 + 0x30, 0x2b, 0xb5, 0x52, 0x1e, 0xff, 0x96, 0x38, + 0x16, 0x96, 0x5c, 0x1e, 0x94, 0xcb, 0x6d, 0x01, + }, + .out = "ABCDEABCD ABCDEABCD", + }, + }; + for (cases) |c| { + var fb = std.io.fixedBufferStream(c.in); + var al = std.ArrayList(u8).init(testing.allocator); + defer al.deinit(); + + try decompress(.raw, fb.reader(), al.writer()); + try testing.expectEqualStrings(c.out, al.items); + } +} + +test "gzip decompress" { + const cases = [_]struct { + in: []const u8, + 
out: []const u8, + }{ + // non compressed block (type 0) + .{ + .in = &[_]u8{ + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, // gzip header (10 bytes) + 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen + 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data + 0xd5, 0xe0, 0x39, 0xb7, // gzip footer: checksum + 0x0c, 0x00, 0x00, 0x00, // gzip footer: size + }, + .out = "Hello world\n", + }, + // fixed code block (type 1) + .{ + .in = &[_]u8{ + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x03, // gzip header (10 bytes) + 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1 + 0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00, + 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00, // gzip footer (chksum, len) + }, + .out = "Hello world\n", + }, + // dynamic block (type 2) + .{ + .in = &[_]u8{ + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, // gzip header (10 bytes) + 0x3d, 0xc6, 0x39, 0x11, 0x00, 0x00, 0x0c, 0x02, // deflate data block type 2 + 0x30, 0x2b, 0xb5, 0x52, 0x1e, 0xff, 0x96, 0x38, + 0x16, 0x96, 0x5c, 0x1e, 0x94, 0xcb, 0x6d, 0x01, + 0x17, 0x1c, 0x39, 0xb4, 0x13, 0x00, 0x00, 0x00, // gzip footer (chksum, len) + }, + .out = "ABCDEABCD ABCDEABCD", + }, + // gzip header with name + .{ + .in = &[_]u8{ + 0x1f, 0x8b, 0x08, 0x08, 0xe5, 0x70, 0xb1, 0x65, 0x00, 0x03, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x2e, + 0x74, 0x78, 0x74, 0x00, 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, 0x2f, 0xca, 0x49, 0xe1, + 0x02, 0x00, 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00, + }, + .out = "Hello world\n", + }, + }; + for (cases) |c| { + var fb = std.io.fixedBufferStream(c.in); + var al = std.ArrayList(u8).init(testing.allocator); + defer al.deinit(); + + try decompress(.gzip, fb.reader(), al.writer()); + try testing.expectEqualStrings(c.out, al.items); + } +} + +test "zlib decompress" { + const cases = [_]struct { + in: []const u8, + out: []const u8, + }{ + // non 
compressed block (type 0) + .{ + .in = &[_]u8{ + 0x78, 0b10_0_11100, // zlib header (2 bytes) + 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen + 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data + 0x1c, 0xf2, 0x04, 0x47, // zlib footer: checksum + }, + .out = "Hello world\n", + }, + }; + for (cases) |c| { + var fb = std.io.fixedBufferStream(c.in); + var al = std.ArrayList(u8).init(testing.allocator); + defer al.deinit(); + + try decompress(.zlib, fb.reader(), al.writer()); + try testing.expectEqualStrings(c.out, al.items); + } +} + +test "fuzzing tests" { + const cases = [_]struct { + input: []const u8, + out: []const u8 = "", + err: ?anyerror = null, + }{ + .{ .input = "deflate-stream", .out = @embedFile("testdata/fuzz/deflate-stream.expect") }, // 0 + .{ .input = "empty-distance-alphabet01" }, + .{ .input = "empty-distance-alphabet02" }, + .{ .input = "end-of-stream", .err = error.EndOfStream }, + .{ .input = "invalid-distance", .err = error.InvalidMatch }, + .{ .input = "invalid-tree01", .err = error.IncompleteHuffmanTree }, // 5 + .{ .input = "invalid-tree02", .err = error.IncompleteHuffmanTree }, + .{ .input = "invalid-tree03", .err = error.IncompleteHuffmanTree }, + .{ .input = "lengths-overflow", .err = error.InvalidDynamicBlockHeader }, + .{ .input = "out-of-codes", .err = error.InvalidCode }, + .{ .input = "puff01", .err = error.WrongStoredBlockNlen }, // 10 + .{ .input = "puff02", .err = error.EndOfStream }, + .{ .input = "puff03", .out = &[_]u8{0xa} }, + .{ .input = "puff04", .err = error.InvalidCode }, + .{ .input = "puff05", .err = error.EndOfStream }, + .{ .input = "puff06", .err = error.EndOfStream }, + .{ .input = "puff08", .err = error.InvalidCode }, + .{ .input = "puff09", .out = "P" }, + .{ .input = "puff10", .err = error.InvalidCode }, + .{ .input = "puff11", .err = error.InvalidMatch }, + .{ .input = "puff12", .err = error.InvalidDynamicBlockHeader }, // 20 + .{ 
.input = "puff13", .err = error.IncompleteHuffmanTree }, + .{ .input = "puff14", .err = error.EndOfStream }, + .{ .input = "puff15", .err = error.IncompleteHuffmanTree }, + .{ .input = "puff16", .err = error.InvalidDynamicBlockHeader }, + .{ .input = "puff17", .err = error.MissingEndOfBlockCode }, // 25 + .{ .input = "fuzz1", .err = error.InvalidDynamicBlockHeader }, + .{ .input = "fuzz2", .err = error.InvalidDynamicBlockHeader }, + .{ .input = "fuzz3", .err = error.InvalidMatch }, + .{ .input = "fuzz4", .err = error.OversubscribedHuffmanTree }, + .{ .input = "puff18", .err = error.OversubscribedHuffmanTree }, // 30 + .{ .input = "puff19", .err = error.OversubscribedHuffmanTree }, + .{ .input = "puff20", .err = error.OversubscribedHuffmanTree }, + .{ .input = "puff21", .err = error.OversubscribedHuffmanTree }, + .{ .input = "puff22", .err = error.OversubscribedHuffmanTree }, + .{ .input = "puff23", .err = error.OversubscribedHuffmanTree }, // 35 + .{ .input = "puff24", .err = error.IncompleteHuffmanTree }, + .{ .input = "puff25", .err = error.OversubscribedHuffmanTree }, + .{ .input = "puff26", .err = error.InvalidDynamicBlockHeader }, + .{ .input = "puff27", .err = error.InvalidDynamicBlockHeader }, + }; + + inline for (cases, 0..) 
|c, case_no| { + var in = std.io.fixedBufferStream(@embedFile("testdata/fuzz/" ++ c.input ++ ".input")); + var out = std.ArrayList(u8).init(testing.allocator); + defer out.deinit(); + errdefer std.debug.print("test case failed {}\n", .{case_no}); + + if (c.err) |expected_err| { + try testing.expectError(expected_err, decompress(.raw, in.reader(), out.writer())); + } else { + try decompress(.raw, in.reader(), out.writer()); + try testing.expectEqualStrings(c.out, out.items); + } + } +} + +test "bug 18966" { + const input = @embedFile("testdata/fuzz/bug_18966.input"); + const expect = @embedFile("testdata/fuzz/bug_18966.expect"); + + var in = std.io.fixedBufferStream(input); + var out = std.ArrayList(u8).init(testing.allocator); + defer out.deinit(); + + try decompress(.gzip, in.reader(), out.writer()); + try testing.expectEqualStrings(expect, out.items); +} + +test "bug 19895" { + const input = &[_]u8{ + 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen + 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data + }; + var in = std.io.fixedBufferStream(input); + var decomp = Decompress.init(.raw, in.reader()); + var buf: [0]u8 = undefined; + try testing.expectEqual(0, try decomp.read(&buf)); +} diff --git a/lib/std/compress/flate/Lookup.zig b/lib/std/compress/flate/Lookup.zig index 90d0341bca..b3660e8877 100644 --- a/lib/std/compress/flate/Lookup.zig +++ b/lib/std/compress/flate/Lookup.zig @@ -5,22 +5,22 @@ const std = @import("std"); const testing = std.testing; const expect = testing.expect; -const consts = @import("consts.zig"); +const flate = @import("../flate.zig"); -const Self = @This(); +const Lookup = @This(); const prime4 = 0x9E3779B1; // 4 bytes prime number 2654435761 -const chain_len = 2 * consts.history.len; +const chain_len = 2 * flate.history.len; // Maps hash => first position -head: [consts.lookup.len]u16 = [_]u16{0} ** consts.lookup.len, +head: [flate.lookup.len]u16 = [_]u16{0} ** 
flate.lookup.len, // Maps position => previous positions for the same hash value chain: [chain_len]u16 = [_]u16{0} ** (chain_len), // Calculates hash of the 4 bytes from data. // Inserts `pos` position of that hash in the lookup tables. // Returns previous location with the same hash value. -pub fn add(self: *Self, data: []const u8, pos: u16) u16 { +pub fn add(self: *Lookup, data: []const u8, pos: u16) u16 { if (data.len < 4) return 0; const h = hash(data[0..4]); return self.set(h, pos); @@ -28,11 +28,11 @@ pub fn add(self: *Self, data: []const u8, pos: u16) u16 { // Returns previous location with the same hash value given the current // position. -pub fn prev(self: *Self, pos: u16) u16 { +pub fn prev(self: *Lookup, pos: u16) u16 { return self.chain[pos]; } -fn set(self: *Self, h: u32, pos: u16) u16 { +fn set(self: *Lookup, h: u32, pos: u16) u16 { const p = self.head[h]; self.head[h] = pos; self.chain[pos] = p; @@ -40,7 +40,7 @@ fn set(self: *Self, h: u32, pos: u16) u16 { } // Slide all positions in head and chain for `n` -pub fn slide(self: *Self, n: u16) void { +pub fn slide(self: *Lookup, n: u16) void { for (&self.head) |*v| { v.* -|= n; } @@ -52,8 +52,8 @@ pub fn slide(self: *Self, n: u16) void { // Add `len` 4 bytes hashes from `data` into lookup. // Position of the first byte is `pos`. -pub fn bulkAdd(self: *Self, data: []const u8, len: u16, pos: u16) void { - if (len == 0 or data.len < consts.match.min_length) { +pub fn bulkAdd(self: *Lookup, data: []const u8, len: u16, pos: u16) void { + if (len == 0 or data.len < flate.match.min_length) { return; } var hb = @@ -80,7 +80,7 @@ fn hash(b: *const [4]u8) u32 { } fn hashu(v: u32) u32 { - return @intCast((v *% prime4) >> consts.lookup.shift); + return @intCast((v *% prime4) >> flate.lookup.shift); } test add { @@ -91,7 +91,7 @@ test add { 0x01, 0x02, 0x03, }; - var h: Self = .{}; + var h: Lookup = .{}; for (data, 0..) 
|_, i| { const p = h.add(data[i..], @intCast(i)); if (i >= 8 and i < 24) { @@ -101,7 +101,7 @@ test add { } } - const v = Self.hash(data[2 .. 2 + 4]); + const v = Lookup.hash(data[2 .. 2 + 4]); try expect(h.head[v] == 2 + 16); try expect(h.chain[2 + 16] == 2 + 8); try expect(h.chain[2 + 8] == 2); @@ -111,13 +111,13 @@ test bulkAdd { const data = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."; // one by one - var h: Self = .{}; + var h: Lookup = .{}; for (data, 0..) |_, i| { _ = h.add(data[i..], @intCast(i)); } // in bulk - var bh: Self = .{}; + var bh: Lookup = .{}; bh.bulkAdd(data, data.len, 0); try testing.expectEqualSlices(u16, &h.head, &bh.head); diff --git a/lib/std/compress/flate/SlidingWindow.zig b/lib/std/compress/flate/SlidingWindow.zig deleted file mode 100644 index ece907c32f..0000000000 --- a/lib/std/compress/flate/SlidingWindow.zig +++ /dev/null @@ -1,160 +0,0 @@ -//! Used in deflate (compression), holds uncompressed data form which Tokens are -//! produces. In combination with Lookup it is used to find matches in history data. -//! -const std = @import("std"); -const consts = @import("consts.zig"); - -const expect = testing.expect; -const assert = std.debug.assert; -const testing = std.testing; - -const hist_len = consts.history.len; -const buffer_len = 2 * hist_len; -const min_lookahead = consts.match.min_length + consts.match.max_length; -const max_rp = buffer_len - min_lookahead; - -const Self = @This(); - -buffer: [buffer_len]u8 = undefined, -wp: usize = 0, // write position -rp: usize = 0, // read position -fp: isize = 0, // last flush position, tokens are build from fp..rp - -/// Returns number of bytes written, or 0 if buffer is full and need to slide. -pub fn write(self: *Self, buf: []const u8) usize { - if (self.rp >= max_rp) return 0; // need to slide - - const n = @min(buf.len, buffer_len - self.wp); - @memcpy(self.buffer[self.wp .. self.wp + n], buf[0..n]); - self.wp += n; - return n; -} - -/// Slide buffer for hist_len. 
-/// Drops old history, preserves between hist_len and hist_len - min_lookahead. -/// Returns number of bytes removed. -pub fn slide(self: *Self) u16 { - assert(self.rp >= max_rp and self.wp >= self.rp); - const n = self.wp - hist_len; - @memcpy(self.buffer[0..n], self.buffer[hist_len..self.wp]); - self.rp -= hist_len; - self.wp -= hist_len; - self.fp -= hist_len; - return @intCast(n); -} - -/// Data from the current position (read position). Those part of the buffer is -/// not converted to tokens yet. -fn lookahead(self: *Self) []const u8 { - assert(self.wp >= self.rp); - return self.buffer[self.rp..self.wp]; -} - -/// Returns part of the lookahead buffer. If should_flush is set no lookahead is -/// preserved otherwise preserves enough data for the longest match. Returns -/// null if there is not enough data. -pub fn activeLookahead(self: *Self, should_flush: bool) ?[]const u8 { - const min: usize = if (should_flush) 0 else min_lookahead; - const lh = self.lookahead(); - return if (lh.len > min) lh else null; -} - -/// Advances read position, shrinks lookahead. -pub fn advance(self: *Self, n: u16) void { - assert(self.wp >= self.rp + n); - self.rp += n; -} - -/// Returns writable part of the buffer, where new uncompressed data can be -/// written. -pub fn writable(self: *Self) []u8 { - return self.buffer[self.wp..]; -} - -/// Notification of what part of writable buffer is filled with data. -pub fn written(self: *Self, n: usize) void { - self.wp += n; -} - -/// Finds match length between previous and current position. -/// Used in hot path! 
-pub fn match(self: *Self, prev_pos: u16, curr_pos: u16, min_len: u16) u16 { - const max_len: usize = @min(self.wp - curr_pos, consts.match.max_length); - // lookahead buffers from previous and current positions - const prev_lh = self.buffer[prev_pos..][0..max_len]; - const curr_lh = self.buffer[curr_pos..][0..max_len]; - - // If we already have match (min_len > 0), - // test the first byte above previous len a[min_len] != b[min_len] - // and then all the bytes from that position to zero. - // That is likely positions to find difference than looping from first bytes. - var i: usize = min_len; - if (i > 0) { - if (max_len <= i) return 0; - while (true) { - if (prev_lh[i] != curr_lh[i]) return 0; - if (i == 0) break; - i -= 1; - } - i = min_len; - } - while (i < max_len) : (i += 1) - if (prev_lh[i] != curr_lh[i]) break; - return if (i >= consts.match.min_length) @intCast(i) else 0; -} - -/// Current position of non-compressed data. Data before rp are already converted -/// to tokens. -pub fn pos(self: *Self) u16 { - return @intCast(self.rp); -} - -/// Notification that token list is cleared. -pub fn flush(self: *Self) void { - self.fp = @intCast(self.rp); -} - -/// Part of the buffer since last flush or null if there was slide in between (so -/// fp becomes negative). -pub fn tokensBuffer(self: *Self) ?[]const u8 { - assert(self.fp <= self.rp); - if (self.fp < 0) return null; - return self.buffer[@intCast(self.fp)..self.rp]; -} - -test match { - const data = "Blah blah blah blah blah!"; - var win: Self = .{}; - try expect(win.write(data) == data.len); - try expect(win.wp == data.len); - try expect(win.rp == 0); - - // length between l symbols - try expect(win.match(1, 6, 0) == 18); - try expect(win.match(1, 11, 0) == 13); - try expect(win.match(1, 16, 0) == 8); - try expect(win.match(1, 21, 0) == 0); - - // position 15 = "blah blah!" - // position 20 = "blah!" 
- try expect(win.match(15, 20, 0) == 4); - try expect(win.match(15, 20, 3) == 4); - try expect(win.match(15, 20, 4) == 0); -} - -test slide { - var win: Self = .{}; - win.wp = Self.buffer_len - 11; - win.rp = Self.buffer_len - 111; - win.buffer[win.rp] = 0xab; - try expect(win.lookahead().len == 100); - try expect(win.tokensBuffer().?.len == win.rp); - - const n = win.slide(); - try expect(n == 32757); - try expect(win.buffer[win.rp] == 0xab); - try expect(win.rp == Self.hist_len - 111); - try expect(win.wp == Self.hist_len - 11); - try expect(win.lookahead().len == 100); - try expect(win.tokensBuffer() == null); -} diff --git a/lib/std/compress/flate/Token.zig b/lib/std/compress/flate/Token.zig index a9641f6adc..293a786cef 100644 --- a/lib/std/compress/flate/Token.zig +++ b/lib/std/compress/flate/Token.zig @@ -6,7 +6,7 @@ const std = @import("std"); const assert = std.debug.assert; const print = std.debug.print; const expect = std.testing.expect; -const consts = @import("consts.zig").match; +const match = std.compress.flate.match; const Token = @This(); @@ -26,11 +26,11 @@ pub fn literal(t: Token) u8 { } pub fn distance(t: Token) u16 { - return @as(u16, t.dist) + consts.min_distance; + return @as(u16, t.dist) + match.min_distance; } pub fn length(t: Token) u16 { - return @as(u16, t.len_lit) + consts.base_length; + return @as(u16, t.len_lit) + match.base_length; } pub fn initLiteral(lit: u8) Token { @@ -40,12 +40,12 @@ pub fn initLiteral(lit: u8) Token { // distance range 1 - 32768, stored in dist as 0 - 32767 (u15) // length range 3 - 258, stored in len_lit as 0 - 255 (u8) pub fn initMatch(dist: u16, len: u16) Token { - assert(len >= consts.min_length and len <= consts.max_length); - assert(dist >= consts.min_distance and dist <= consts.max_distance); + assert(len >= match.min_length and len <= match.max_length); + assert(dist >= match.min_distance and dist <= match.max_distance); return .{ .kind = .match, - .dist = @intCast(dist - consts.min_distance), - .len_lit 
= @intCast(len - consts.base_length), + .dist = @intCast(dist - match.min_distance), + .len_lit = @intCast(len - match.base_length), }; } diff --git a/lib/std/compress/flate/consts.zig b/lib/std/compress/flate/consts.zig deleted file mode 100644 index b17083461b..0000000000 --- a/lib/std/compress/flate/consts.zig +++ /dev/null @@ -1,49 +0,0 @@ -pub const deflate = struct { - // Number of tokens to accumulate in deflate before starting block encoding. - // - // In zlib this depends on memlevel: 6 + memlevel, where default memlevel is - // 8 and max 9 that gives 14 or 15 bits. - pub const tokens = 1 << 15; -}; - -pub const match = struct { - pub const base_length = 3; // smallest match length per the RFC section 3.2.5 - pub const min_length = 4; // min length used in this algorithm - pub const max_length = 258; - - pub const min_distance = 1; - pub const max_distance = 32768; -}; - -pub const history = struct { - pub const len = match.max_distance; -}; - -pub const lookup = struct { - pub const bits = 15; - pub const len = 1 << bits; - pub const shift = 32 - bits; -}; - -pub const huffman = struct { - // The odd order in which the codegen code sizes are written. - pub const codegen_order = [_]u32{ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 }; - // The number of codegen codes. - pub const codegen_code_count = 19; - - // The largest distance code. - pub const distance_code_count = 30; - - // Maximum number of literals. - pub const max_num_lit = 286; - - // Max number of frequencies used for a Huffman Code - // Possible lengths are codegen_code_count (19), distance_code_count (30) and max_num_lit (286). - // The largest of these is max_num_lit. - pub const max_num_frequencies = max_num_lit; - - // Biggest block size for uncompressed block. - pub const max_store_block_size = 65535; - // The special code used to mark the end of a block. 
- pub const end_block_marker = 256; -}; diff --git a/lib/std/compress/flate/container.zig b/lib/std/compress/flate/container.zig deleted file mode 100644 index fe6dec446d..0000000000 --- a/lib/std/compress/flate/container.zig +++ /dev/null @@ -1,208 +0,0 @@ -//! Container of the deflate bit stream body. Container adds header before -//! deflate bit stream and footer after. It can bi gzip, zlib or raw (no header, -//! no footer, raw bit stream). -//! -//! Zlib format is defined in rfc 1950. Header has 2 bytes and footer 4 bytes -//! addler 32 checksum. -//! -//! Gzip format is defined in rfc 1952. Header has 10+ bytes and footer 4 bytes -//! crc32 checksum and 4 bytes of uncompressed data length. -//! -//! -//! rfc 1950: https://datatracker.ietf.org/doc/html/rfc1950#page-4 -//! rfc 1952: https://datatracker.ietf.org/doc/html/rfc1952#page-5 -//! - -const std = @import("std"); - -pub const Container = enum { - raw, // no header or footer - gzip, // gzip header and footer - zlib, // zlib header and footer - - pub fn size(w: Container) usize { - return headerSize(w) + footerSize(w); - } - - pub fn headerSize(w: Container) usize { - return switch (w) { - .gzip => 10, - .zlib => 2, - .raw => 0, - }; - } - - pub fn footerSize(w: Container) usize { - return switch (w) { - .gzip => 8, - .zlib => 4, - .raw => 0, - }; - } - - pub const list = [_]Container{ .raw, .gzip, .zlib }; - - pub const Error = error{ - BadGzipHeader, - BadZlibHeader, - WrongGzipChecksum, - WrongGzipSize, - WrongZlibChecksum, - }; - - pub fn writeHeader(comptime wrap: Container, writer: anytype) !void { - switch (wrap) { - .gzip => { - // GZIP 10 byte header (https://datatracker.ietf.org/doc/html/rfc1952#page-5): - // - ID1 (IDentification 1), always 0x1f - // - ID2 (IDentification 2), always 0x8b - // - CM (Compression Method), always 8 = deflate - // - FLG (Flags), all set to 0 - // - 4 bytes, MTIME (Modification time), not used, all set to zero - // - XFL (eXtra FLags), all set to zero - // - OS 
(Operating System), 03 = Unix - const gzipHeader = [_]u8{ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 }; - try writer.writeAll(&gzipHeader); - }, - .zlib => { - // ZLIB has a two-byte header (https://datatracker.ietf.org/doc/html/rfc1950#page-4): - // 1st byte: - // - First four bits is the CINFO (compression info), which is 7 for the default deflate window size. - // - The next four bits is the CM (compression method), which is 8 for deflate. - // 2nd byte: - // - Two bits is the FLEVEL (compression level). Values are: 0=fastest, 1=fast, 2=default, 3=best. - // - The next bit, FDICT, is set if a dictionary is given. - // - The final five FCHECK bits form a mod-31 checksum. - // - // CINFO = 7, CM = 8, FLEVEL = 0b10, FDICT = 0, FCHECK = 0b11100 - const zlibHeader = [_]u8{ 0x78, 0b10_0_11100 }; - try writer.writeAll(&zlibHeader); - }, - .raw => {}, - } - } - - pub fn writeFooter(comptime wrap: Container, hasher: *Hasher(wrap), writer: anytype) !void { - var bits: [4]u8 = undefined; - switch (wrap) { - .gzip => { - // GZIP 8 bytes footer - // - 4 bytes, CRC32 (CRC-32) - // - 4 bytes, ISIZE (Input SIZE) - size of the original (uncompressed) input data modulo 2^32 - std.mem.writeInt(u32, &bits, hasher.chksum(), .little); - try writer.writeAll(&bits); - - std.mem.writeInt(u32, &bits, hasher.bytesRead(), .little); - try writer.writeAll(&bits); - }, - .zlib => { - // ZLIB (RFC 1950) is big-endian, unlike GZIP (RFC 1952). - // 4 bytes of ADLER32 (Adler-32 checksum) - // Checksum value of the uncompressed data (excluding any - // dictionary data) computed according to Adler-32 - // algorithm. 
- std.mem.writeInt(u32, &bits, hasher.chksum(), .big); - try writer.writeAll(&bits); - }, - .raw => {}, - } - } - - pub fn parseHeader(comptime wrap: Container, reader: anytype) !void { - switch (wrap) { - .gzip => try parseGzipHeader(reader), - .zlib => try parseZlibHeader(reader), - .raw => {}, - } - } - - fn parseGzipHeader(reader: anytype) !void { - const magic1 = try reader.read(u8); - const magic2 = try reader.read(u8); - const method = try reader.read(u8); - const flags = try reader.read(u8); - try reader.skipBytes(6); // mtime(4), xflags, os - if (magic1 != 0x1f or magic2 != 0x8b or method != 0x08) - return error.BadGzipHeader; - // Flags description: https://www.rfc-editor.org/rfc/rfc1952.html#page-5 - if (flags != 0) { - if (flags & 0b0000_0100 != 0) { // FEXTRA - const extra_len = try reader.read(u16); - try reader.skipBytes(extra_len); - } - if (flags & 0b0000_1000 != 0) { // FNAME - try reader.skipStringZ(); - } - if (flags & 0b0001_0000 != 0) { // FCOMMENT - try reader.skipStringZ(); - } - if (flags & 0b0000_0010 != 0) { // FHCRC - try reader.skipBytes(2); - } - } - } - - fn parseZlibHeader(reader: anytype) !void { - const cm = try reader.read(u4); - const cinfo = try reader.read(u4); - _ = try reader.read(u8); - if (cm != 8 or cinfo > 7) { - return error.BadZlibHeader; - } - } - - pub fn parseFooter(comptime wrap: Container, hasher: *Hasher(wrap), reader: anytype) !void { - switch (wrap) { - .gzip => { - try reader.fill(0); - if (try reader.read(u32) != hasher.chksum()) return error.WrongGzipChecksum; - if (try reader.read(u32) != hasher.bytesRead()) return error.WrongGzipSize; - }, - .zlib => { - const chksum: u32 = @byteSwap(hasher.chksum()); - if (try reader.read(u32) != chksum) return error.WrongZlibChecksum; - }, - .raw => {}, - } - } - - pub fn Hasher(comptime wrap: Container) type { - const HasherType = switch (wrap) { - .gzip => std.hash.Crc32, - .zlib => std.hash.Adler32, - .raw => struct { - pub fn init() @This() { - return .{}; - } - }, - 
}; - - return struct { - hasher: HasherType = HasherType.init(), - bytes: usize = 0, - - const Self = @This(); - - pub fn update(self: *Self, buf: []const u8) void { - switch (wrap) { - .raw => {}, - else => { - self.hasher.update(buf); - self.bytes += buf.len; - }, - } - } - - pub fn chksum(self: *Self) u32 { - switch (wrap) { - .raw => return 0, - else => return self.hasher.final(), - } - } - - pub fn bytesRead(self: *Self) u32 { - return @truncate(self.bytes); - } - }; - } -}; diff --git a/lib/std/compress/flate/deflate.zig b/lib/std/compress/flate/deflate.zig deleted file mode 100644 index e01d9ad647..0000000000 --- a/lib/std/compress/flate/deflate.zig +++ /dev/null @@ -1,740 +0,0 @@ -const std = @import("std"); -const io = std.io; -const assert = std.debug.assert; -const testing = std.testing; -const expect = testing.expect; -const print = std.debug.print; - -const Token = @import("Token.zig"); -const consts = @import("consts.zig"); -const BlockWriter = @import("BlockWriter.zig"); -const Container = @import("container.zig").Container; -const SlidingWindow = @import("SlidingWindow.zig"); -const Lookup = @import("Lookup.zig"); - -pub const Options = struct { - level: Level = .default, -}; - -/// Trades between speed and compression size. -/// Starts with level 4: in [zlib](https://github.com/madler/zlib/blob/abd3d1a28930f89375d4b41408b39f6c1be157b2/deflate.c#L115C1-L117C43) -/// levels 1-3 are using different algorithm to perform faster but with less -/// compression. That is not implemented here. -pub const Level = enum(u4) { - // zig fmt: off - fast = 0xb, level_4 = 4, - level_5 = 5, - default = 0xc, level_6 = 6, - level_7 = 7, - level_8 = 8, - best = 0xd, level_9 = 9, - // zig fmt: on -}; - -/// Algorithm knobs for each level. -const LevelArgs = struct { - good: u16, // Do less lookups if we already have match of this length. - nice: u16, // Stop looking for better match if we found match with at least this length. 
- lazy: u16, // Don't do lazy match find if got match with at least this length. - chain: u16, // How many lookups for previous match to perform. - - pub fn get(level: Level) LevelArgs { - // zig fmt: off - return switch (level) { - .fast, .level_4 => .{ .good = 4, .lazy = 4, .nice = 16, .chain = 16 }, - .level_5 => .{ .good = 8, .lazy = 16, .nice = 32, .chain = 32 }, - .default, .level_6 => .{ .good = 8, .lazy = 16, .nice = 128, .chain = 128 }, - .level_7 => .{ .good = 8, .lazy = 32, .nice = 128, .chain = 256 }, - .level_8 => .{ .good = 32, .lazy = 128, .nice = 258, .chain = 1024 }, - .best, .level_9 => .{ .good = 32, .lazy = 258, .nice = 258, .chain = 4096 }, - }; - // zig fmt: on - } -}; - -/// Compress plain data from reader into compressed stream written to writer. -pub fn compress( - comptime container: Container, - reader: *std.io.BufferedReader, - writer: *std.io.BufferedWriter, - options: Options, -) !void { - var c = try Compressor.init(container, writer, options); - try c.compress(reader); - try c.finish(); -} - -/// Compressor type. -pub fn Compressor(comptime container: Container) type { - return Deflate(container, BlockWriter); -} - -/// Default compression algorithm. Has two steps: tokenization and token -/// encoding. -/// -/// Tokenization takes uncompressed input stream and produces list of tokens. -/// Each token can be literal (byte of data) or match (backrefernce to previous -/// data with length and distance). Tokenization accumulators 32K tokens, when -/// full or `flush` is called tokens are passed to the `block_writer`. Level -/// defines how hard (how slow) it tries to find match. -/// -/// Block writer will decide which type of deflate block to write (stored, fixed, -/// dynamic) and encode tokens to the output byte stream. Client has to call -/// `finish` to write block with the final bit set. -/// -/// Container defines type of header and footer which can be gzip, zlib or raw. -/// They all share same deflate body. 
Raw has no header or footer just deflate -/// body. -/// -/// Compression algorithm explained in rfc-1951 (slightly edited for this case): -/// -/// The compressor uses a chained hash table `lookup` to find duplicated -/// strings, using a hash function that operates on 4-byte sequences. At any -/// given point during compression, let XYZW be the next 4 input bytes -/// (lookahead) to be examined (not necessarily all different, of course). -/// First, the compressor examines the hash chain for XYZW. If the chain is -/// empty, the compressor simply writes out X as a literal byte and advances -/// one byte in the input. If the hash chain is not empty, indicating that the -/// sequence XYZW (or, if we are unlucky, some other 4 bytes with the same -/// hash function value) has occurred recently, the compressor compares all -/// strings on the XYZW hash chain with the actual input data sequence -/// starting at the current point, and selects the longest match. -/// -/// To improve overall compression, the compressor defers the selection of -/// matches ("lazy matching"): after a match of length N has been found, the -/// compressor searches for a longer match starting at the next input byte. If -/// it finds a longer match, it truncates the previous match to a length of -/// one (thus producing a single literal byte) and then emits the longer -/// match. Otherwise, it emits the original match, and, as described above, -/// advances N bytes before continuing. -/// -/// -/// Allocates statically ~400K (192K lookup, 128K tokens, 64K window). -/// -/// Deflate function accepts BlockWriterType so we can change that in test to test -/// just tokenization part. 
-/// -fn Deflate(comptime container: Container, comptime WriterType: type, comptime BlockWriterType: type) type { - return struct { - lookup: Lookup = .{}, - win: SlidingWindow = .{}, - tokens: Tokens = .{}, - wrt: WriterType, - block_writer: BlockWriterType, - level: LevelArgs, - hasher: container.Hasher() = .{}, - - // Match and literal at the previous position. - // Used for lazy match finding in processWindow. - prev_match: ?Token = null, - prev_literal: ?u8 = null, - - const Self = @This(); - - pub fn init(wrt: WriterType, options: Options) !Self { - const self = Self{ - .wrt = wrt, - .block_writer = BlockWriterType.init(wrt), - .level = LevelArgs.get(options.level), - }; - try container.writeHeader(self.wrt); - return self; - } - - const FlushOption = enum { none, flush, final }; - - // Process data in window and create tokens. If token buffer is full - // flush tokens to the token writer. In the case of `flush` or `final` - // option it will process all data from the window. In the `none` case - // it will preserve some data for the next match. - fn tokenize(self: *Self, flush_opt: FlushOption) !void { - // flush - process all data from window - const should_flush = (flush_opt != .none); - - // While there is data in active lookahead buffer. - while (self.win.activeLookahead(should_flush)) |lh| { - var step: u16 = 1; // 1 in the case of literal, match length otherwise - const pos: u16 = self.win.pos(); - const literal = lh[0]; // literal at current position - const min_len: u16 = if (self.prev_match) |m| m.length() else 0; - - // Try to find match at least min_len long. - if (self.findMatch(pos, lh, min_len)) |match| { - // Found better match than previous. - try self.addPrevLiteral(); - - // Is found match length good enough? - if (match.length() >= self.level.lazy) { - // Don't try to lazy find better match, use this. - step = try self.addMatch(match); - } else { - // Store this match. 
- self.prev_literal = literal; - self.prev_match = match; - } - } else { - // There is no better match at current pos then it was previous. - // Write previous match or literal. - if (self.prev_match) |m| { - // Write match from previous position. - step = try self.addMatch(m) - 1; // we already advanced 1 from previous position - } else { - // No match at previous position. - // Write previous literal if any, and remember this literal. - try self.addPrevLiteral(); - self.prev_literal = literal; - } - } - // Advance window and add hashes. - self.windowAdvance(step, lh, pos); - } - - if (should_flush) { - // In the case of flushing, last few lookahead buffers were smaller then min match len. - // So only last literal can be unwritten. - assert(self.prev_match == null); - try self.addPrevLiteral(); - self.prev_literal = null; - - try self.flushTokens(flush_opt); - } - } - - fn windowAdvance(self: *Self, step: u16, lh: []const u8, pos: u16) void { - // current position is already added in findMatch - self.lookup.bulkAdd(lh[1..], step - 1, pos + 1); - self.win.advance(step); - } - - // Add previous literal (if any) to the tokens list. - fn addPrevLiteral(self: *Self) !void { - if (self.prev_literal) |l| try self.addToken(Token.initLiteral(l)); - } - - // Add match to the tokens list, reset prev pointers. - // Returns length of the added match. - fn addMatch(self: *Self, m: Token) !u16 { - try self.addToken(m); - self.prev_literal = null; - self.prev_match = null; - return m.length(); - } - - fn addToken(self: *Self, token: Token) !void { - self.tokens.add(token); - if (self.tokens.full()) try self.flushTokens(.none); - } - - // Finds largest match in the history window with the data at current pos. - fn findMatch(self: *Self, pos: u16, lh: []const u8, min_len: u16) ?Token { - var len: u16 = min_len; - // Previous location with the same hash (same 4 bytes). - var prev_pos = self.lookup.add(lh, pos); - // Last found match. 
- var match: ?Token = null; - - // How much back-references to try, performance knob. - var chain: usize = self.level.chain; - if (len >= self.level.good) { - // If we've got a match that's good enough, only look in 1/4 the chain. - chain >>= 2; - } - - // Hot path loop! - while (prev_pos > 0 and chain > 0) : (chain -= 1) { - const distance = pos - prev_pos; - if (distance > consts.match.max_distance) - break; - - const new_len = self.win.match(prev_pos, pos, len); - if (new_len > len) { - match = Token.initMatch(@intCast(distance), new_len); - if (new_len >= self.level.nice) { - // The match is good enough that we don't try to find a better one. - return match; - } - len = new_len; - } - prev_pos = self.lookup.prev(prev_pos); - } - - return match; - } - - fn flushTokens(self: *Self, flush_opt: FlushOption) !void { - // Pass tokens to the token writer - try self.block_writer.write(self.tokens.tokens(), flush_opt == .final, self.win.tokensBuffer()); - // Stored block ensures byte alignment. - // It has 3 bits (final, block_type) and then padding until byte boundary. - // After that everything is aligned to the boundary in the stored block. - // Empty stored block is Ob000 + (0-7) bits of padding + 0x00 0x00 0xFF 0xFF. - // Last 4 bytes are byte aligned. - if (flush_opt == .flush) { - try self.block_writer.storedBlock("", false); - } - if (flush_opt != .none) { - // Safe to call only when byte aligned or it is OK to add - // padding bits (on last byte of the final block). - try self.block_writer.flush(); - } - // Reset internal tokens store. - self.tokens.reset(); - // Notify win that tokens are flushed. - self.win.flush(); - } - - // Slide win and if needed lookup tables. - fn slide(self: *Self) void { - const n = self.win.slide(); - self.lookup.slide(n); - } - - /// Compresses as much data as possible, stops when the reader becomes - /// empty. 
It will introduce some output latency (reading input without - /// producing all output) because some data are still in internal - /// buffers. - /// - /// It is up to the caller to call flush (if needed) or finish (required) - /// when is need to output any pending data or complete stream. - /// - pub fn compress(self: *Self, reader: anytype) !void { - while (true) { - // Fill window from reader - const buf = self.win.writable(); - if (buf.len == 0) { - try self.tokenize(.none); - self.slide(); - continue; - } - const n = try reader.readAll(buf); - self.hasher.update(buf[0..n]); - self.win.written(n); - // Process window - try self.tokenize(.none); - // Exit when no more data in reader - if (n < buf.len) break; - } - } - - /// Flushes internal buffers to the output writer. Outputs empty stored - /// block to sync bit stream to the byte boundary, so that the - /// decompressor can get all input data available so far. - /// - /// It is useful mainly in compressed network protocols, to ensure that - /// deflate bit stream can be used as byte stream. May degrade - /// compression so it should be used only when necessary. - /// - /// Completes the current deflate block and follows it with an empty - /// stored block that is three zero bits plus filler bits to the next - /// byte, followed by four bytes (00 00 ff ff). - /// - pub fn flush(self: *Self) !void { - try self.tokenize(.flush); - } - - /// Completes deflate bit stream by writing any pending data as deflate - /// final deflate block. HAS to be called once all data are written to - /// the compressor as a signal that next block has to have final bit - /// set. - /// - pub fn finish(self: *Self) !void { - try self.tokenize(.final); - try container.writeFooter(&self.hasher, self.wrt); - } - - /// Use another writer while preserving history. Most probably flush - /// should be called on old writer before setting new. 
- pub fn setWriter(self: *Self, new_writer: WriterType) void { - self.block_writer.setWriter(new_writer); - self.wrt = new_writer; - } - - // Writer interface - - pub const Writer = io.Writer(*Self, Error, write); - pub const Error = BlockWriterType.Error; - - /// Write `input` of uncompressed data. - /// See compress. - pub fn write(self: *Self, input: []const u8) !usize { - var fbs = io.fixedBufferStream(input); - try self.compress(fbs.reader()); - return input.len; - } - - pub fn writer(self: *Self) Writer { - return .{ .context = self }; - } - }; -} - -// Tokens store -const Tokens = struct { - list: [consts.deflate.tokens]Token = undefined, - pos: usize = 0, - - fn add(self: *Tokens, t: Token) void { - self.list[self.pos] = t; - self.pos += 1; - } - - fn full(self: *Tokens) bool { - return self.pos == self.list.len; - } - - fn reset(self: *Tokens) void { - self.pos = 0; - } - - fn tokens(self: *Tokens) []const Token { - return self.list[0..self.pos]; - } -}; - -/// Creates huffman only deflate blocks. Disables Lempel-Ziv match searching and -/// only performs Huffman entropy encoding. Results in faster compression, much -/// less memory requirements during compression but bigger compressed sizes. -pub const huffman = struct { - pub fn compress(comptime container: Container, reader: anytype, writer: anytype) !void { - var c = try huffman.compressor(container, writer); - try c.compress(reader); - try c.finish(); - } - - pub fn Compressor(comptime container: Container, comptime WriterType: type) type { - return SimpleCompressor(.huffman, container, WriterType); - } - - pub fn compressor(comptime container: Container, writer: anytype) !huffman.Compressor(container, @TypeOf(writer)) { - return try huffman.Compressor(container, @TypeOf(writer)).init(writer); - } -}; - -/// Creates store blocks only. Data are not compressed only packed into deflate -/// store blocks. That adds 9 bytes of header for each block. Max stored block -/// size is 64K. 
Block is emitted when flush is called on on finish. -pub const store = struct { - pub fn compress(comptime container: Container, reader: anytype, writer: anytype) !void { - var c = try store.compressor(container, writer); - try c.compress(reader); - try c.finish(); - } - - pub fn Compressor(comptime container: Container, comptime WriterType: type) type { - return SimpleCompressor(.store, container, WriterType); - } - - pub fn compressor(comptime container: Container, writer: anytype) !store.Compressor(container, @TypeOf(writer)) { - return try store.Compressor(container, @TypeOf(writer)).init(writer); - } -}; - -const SimpleCompressorKind = enum { - huffman, - store, -}; - -fn simpleCompressor( - comptime kind: SimpleCompressorKind, - comptime container: Container, - writer: anytype, -) !SimpleCompressor(kind, container, @TypeOf(writer)) { - return try SimpleCompressor(kind, container, @TypeOf(writer)).init(writer); -} - -fn SimpleCompressor( - comptime kind: SimpleCompressorKind, - comptime container: Container, - comptime WriterType: type, -) type { - const BlockWriterType = BlockWriter(WriterType); - return struct { - buffer: [65535]u8 = undefined, // because store blocks are limited to 65535 bytes - wp: usize = 0, - - wrt: WriterType, - block_writer: BlockWriterType, - hasher: container.Hasher() = .{}, - - const Self = @This(); - - pub fn init(wrt: WriterType) !Self { - const self = Self{ - .wrt = wrt, - .block_writer = BlockWriterType.init(wrt), - }; - try container.writeHeader(self.wrt); - return self; - } - - pub fn flush(self: *Self) !void { - try self.flushBuffer(false); - try self.block_writer.storedBlock("", false); - try self.block_writer.flush(); - } - - pub fn finish(self: *Self) !void { - try self.flushBuffer(true); - try self.block_writer.flush(); - try container.writeFooter(&self.hasher, self.wrt); - } - - fn flushBuffer(self: *Self, final: bool) !void { - const buf = self.buffer[0..self.wp]; - switch (kind) { - .huffman => try 
self.block_writer.huffmanBlock(buf, final), - .store => try self.block_writer.storedBlock(buf, final), - } - self.wp = 0; - } - - // Writes all data from the input reader of uncompressed data. - // It is up to the caller to call flush or finish if there is need to - // output compressed blocks. - pub fn compress(self: *Self, reader: anytype) !void { - while (true) { - // read from rdr into buffer - const buf = self.buffer[self.wp..]; - if (buf.len == 0) { - try self.flushBuffer(false); - continue; - } - const n = try reader.readAll(buf); - self.hasher.update(buf[0..n]); - self.wp += n; - if (n < buf.len) break; // no more data in reader - } - } - - // Writer interface - - pub const Writer = io.Writer(*Self, Error, write); - pub const Error = BlockWriterType.Error; - - // Write `input` of uncompressed data. - pub fn write(self: *Self, input: []const u8) !usize { - var fbs = io.fixedBufferStream(input); - try self.compress(fbs.reader()); - return input.len; - } - - pub fn writer(self: *Self) Writer { - return .{ .context = self }; - } - }; -} - -const builtin = @import("builtin"); - -test "tokenization" { - const L = Token.initLiteral; - const M = Token.initMatch; - - const cases = [_]struct { - data: []const u8, - tokens: []const Token, - }{ - .{ - .data = "Blah blah blah blah blah!", - .tokens = &[_]Token{ L('B'), L('l'), L('a'), L('h'), L(' '), L('b'), M(5, 18), L('!') }, - }, - .{ - .data = "ABCDEABCD ABCDEABCD", - .tokens = &[_]Token{ - L('A'), L('B'), L('C'), L('D'), L('E'), L('A'), L('B'), L('C'), L('D'), L(' '), - L('A'), M(10, 8), - }, - }, - }; - - for (cases) |c| { - inline for (Container.list) |container| { // for each wrapping - - var cw = io.countingWriter(io.null_writer); - const cww = cw.writer(); - var df = try Deflate(container, @TypeOf(cww), TestTokenWriter).init(cww, .{}); - - _ = try df.write(c.data); - try df.flush(); - - // df.token_writer.show(); - try expect(df.block_writer.pos == c.tokens.len); // number of tokens written - try 
testing.expectEqualSlices(Token, df.block_writer.get(), c.tokens); // tokens match - - try testing.expectEqual(container.headerSize(), cw.bytes_written); - try df.finish(); - try testing.expectEqual(container.size(), cw.bytes_written); - } - } -} - -// Tests that tokens written are equal to expected token list. -const TestTokenWriter = struct { - const Self = @This(); - - pos: usize = 0, - actual: [128]Token = undefined, - - pub fn init(_: anytype) Self { - return .{}; - } - pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void { - for (tokens) |t| { - self.actual[self.pos] = t; - self.pos += 1; - } - } - - pub fn storedBlock(_: *Self, _: []const u8, _: bool) !void {} - - pub fn get(self: *Self) []Token { - return self.actual[0..self.pos]; - } - - pub fn show(self: *Self) void { - print("\n", .{}); - for (self.get()) |t| { - t.show(); - } - } - - pub fn flush(_: *Self) !void {} -}; - -test "file tokenization" { - const levels = [_]Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 }; - const cases = [_]struct { - data: []const u8, // uncompressed content - // expected number of tokens producet in deflate tokenization - tokens_count: [levels.len]usize = .{0} ** levels.len, - }{ - .{ - .data = @embedFile("testdata/rfc1951.txt"), - .tokens_count = .{ 7675, 7672, 7599, 7594, 7598, 7599 }, - }, - - .{ - .data = @embedFile("testdata/block_writer/huffman-null-max.input"), - .tokens_count = .{ 257, 257, 257, 257, 257, 257 }, - }, - .{ - .data = @embedFile("testdata/block_writer/huffman-pi.input"), - .tokens_count = .{ 2570, 2564, 2564, 2564, 2564, 2564 }, - }, - .{ - .data = @embedFile("testdata/block_writer/huffman-text.input"), - .tokens_count = .{ 235, 234, 234, 234, 234, 234 }, - }, - .{ - .data = @embedFile("testdata/fuzz/roundtrip1.input"), - .tokens_count = .{ 333, 331, 331, 331, 331, 331 }, - }, - .{ - .data = @embedFile("testdata/fuzz/roundtrip2.input"), - .tokens_count = .{ 334, 334, 334, 334, 334, 334 }, - }, - }; - - for 
(cases) |case| { // for each case - const data = case.data; - - for (levels, 0..) |level, i| { // for each compression level - var original = io.fixedBufferStream(data); - - // buffer for decompressed data - var al = std.ArrayList(u8).init(testing.allocator); - defer al.deinit(); - const writer = al.writer(); - - // create compressor - const WriterType = @TypeOf(writer); - const TokenWriter = TokenDecoder(@TypeOf(writer)); - var cmp = try Deflate(.raw, WriterType, TokenWriter).init(writer, .{ .level = level }); - - // Stream uncompressed `original` data to the compressor. It will - // produce tokens list and pass that list to the TokenDecoder. This - // TokenDecoder uses CircularBuffer from inflate to convert list of - // tokens back to the uncompressed stream. - try cmp.compress(original.reader()); - try cmp.flush(); - const expected_count = case.tokens_count[i]; - const actual = cmp.block_writer.tokens_count; - if (expected_count == 0) { - print("actual token count {d}\n", .{actual}); - } else { - try testing.expectEqual(expected_count, actual); - } - - try testing.expectEqual(data.len, al.items.len); - try testing.expectEqualSlices(u8, data, al.items); - } - } -} - -fn TokenDecoder(comptime WriterType: type) type { - return struct { - const CircularBuffer = @import("CircularBuffer.zig"); - hist: CircularBuffer = .{}, - wrt: WriterType, - tokens_count: usize = 0, - - const Self = @This(); - - pub fn init(wrt: WriterType) Self { - return .{ .wrt = wrt }; - } - - pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void { - self.tokens_count += tokens.len; - for (tokens) |t| { - switch (t.kind) { - .literal => self.hist.write(t.literal()), - .match => try self.hist.writeMatch(t.length(), t.distance()), - } - if (self.hist.free() < 285) try self.flushWin(); - } - try self.flushWin(); - } - - pub fn storedBlock(_: *Self, _: []const u8, _: bool) !void {} - - fn flushWin(self: *Self) !void { - while (true) { - const buf = self.hist.read(); - if 
(buf.len == 0) break; - try self.wrt.writeAll(buf); - } - } - - pub fn flush(_: *Self) !void {} - }; -} - -test "store simple compressor" { - const data = "Hello world!"; - const expected = [_]u8{ - 0x1, // block type 0, final bit set - 0xc, 0x0, // len = 12 - 0xf3, 0xff, // ~len - 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', // - //0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21, - }; - - var fbs = std.io.fixedBufferStream(data); - var al = std.ArrayList(u8).init(testing.allocator); - defer al.deinit(); - - var cmp = try store.compressor(.raw, al.writer()); - try cmp.compress(fbs.reader()); - try cmp.finish(); - try testing.expectEqualSlices(u8, &expected, al.items); - - fbs.reset(); - try al.resize(0); - - // huffman only compresoor will also emit store block for this small sample - var hc = try huffman.compressor(.raw, al.writer()); - try hc.compress(fbs.reader()); - try hc.finish(); - try testing.expectEqualSlices(u8, &expected, al.items); -} diff --git a/lib/std/compress/flate/huffman_decoder.zig b/lib/std/compress/flate/huffman_decoder.zig deleted file mode 100644 index abff915f76..0000000000 --- a/lib/std/compress/flate/huffman_decoder.zig +++ /dev/null @@ -1,302 +0,0 @@ -const std = @import("std"); -const testing = std.testing; - -pub const Symbol = packed struct { - pub const Kind = enum(u2) { - literal, - end_of_block, - match, - }; - - symbol: u8 = 0, // symbol from alphabet - code_bits: u4 = 0, // number of bits in code 0-15 - kind: Kind = .literal, - - code: u16 = 0, // huffman code of the symbol - next: u16 = 0, // pointer to the next symbol in linked list - // it is safe to use 0 as null pointer, when sorted 0 has shortest code and fits into lookup - - // Sorting less than function. 
- pub fn asc(_: void, a: Symbol, b: Symbol) bool { - if (a.code_bits == b.code_bits) { - if (a.kind == b.kind) { - return a.symbol < b.symbol; - } - return @intFromEnum(a.kind) < @intFromEnum(b.kind); - } - return a.code_bits < b.code_bits; - } -}; - -pub const LiteralDecoder = HuffmanDecoder(286, 15, 9); -pub const DistanceDecoder = HuffmanDecoder(30, 15, 9); -pub const CodegenDecoder = HuffmanDecoder(19, 7, 7); - -pub const Error = error{ - InvalidCode, - OversubscribedHuffmanTree, - IncompleteHuffmanTree, - MissingEndOfBlockCode, -}; - -/// Creates huffman tree codes from list of code lengths (in `build`). -/// -/// `find` then finds symbol for code bits. Code can be any length between 1 and -/// 15 bits. When calling `find` we don't know how many bits will be used to -/// find symbol. When symbol is returned it has code_bits field which defines -/// how much we should advance in bit stream. -/// -/// Lookup table is used to map 15 bit int to symbol. Same symbol is written -/// many times in this table; 32K places for 286 (at most) symbols. -/// Small lookup table is optimization for faster search. -/// It is variation of the algorithm explained in [zlib](https://github.com/madler/zlib/blob/643e17b7498d12ab8d15565662880579692f769d/doc/algorithm.txt#L92) -/// with difference that we here use statically allocated arrays. -/// -fn HuffmanDecoder( - comptime alphabet_size: u16, - comptime max_code_bits: u4, - comptime lookup_bits: u4, -) type { - const lookup_shift = max_code_bits - lookup_bits; - - return struct { - // all symbols in alaphabet, sorted by code_len, symbol - symbols: [alphabet_size]Symbol = undefined, - // lookup table code -> symbol - lookup: [1 << lookup_bits]Symbol = undefined, - - const Self = @This(); - - /// Generates symbols and lookup tables from list of code lens for each symbol. - pub fn generate(self: *Self, lens: []const u4) !void { - try checkCompleteness(lens); - - // init alphabet with code_bits - for (self.symbols, 0..) 
|_, i| { - const cb: u4 = if (i < lens.len) lens[i] else 0; - self.symbols[i] = if (i < 256) - .{ .kind = .literal, .symbol = @intCast(i), .code_bits = cb } - else if (i == 256) - .{ .kind = .end_of_block, .symbol = 0xff, .code_bits = cb } - else - .{ .kind = .match, .symbol = @intCast(i - 257), .code_bits = cb }; - } - std.sort.heap(Symbol, &self.symbols, {}, Symbol.asc); - - // reset lookup table - for (0..self.lookup.len) |i| { - self.lookup[i] = .{}; - } - - // assign code to symbols - // reference: https://youtu.be/9_YEGLe33NA?list=PLU4IQLU9e_OrY8oASHx0u3IXAL9TOdidm&t=2639 - var code: u16 = 0; - var idx: u16 = 0; - for (&self.symbols, 0..) |*sym, pos| { - if (sym.code_bits == 0) continue; // skip unused - sym.code = code; - - const next_code = code + (@as(u16, 1) << (max_code_bits - sym.code_bits)); - const next_idx = next_code >> lookup_shift; - - if (next_idx > self.lookup.len or idx >= self.lookup.len) break; - if (sym.code_bits <= lookup_bits) { - // fill small lookup table - for (idx..next_idx) |j| - self.lookup[j] = sym.*; - } else { - // insert into linked table starting at root - const root = &self.lookup[idx]; - const root_next = root.next; - root.next = @intCast(pos); - sym.next = root_next; - } - - idx = next_idx; - code = next_code; - } - } - - /// Given the list of code lengths check that it represents a canonical - /// Huffman code for n symbols. 
- /// - /// Reference: https://github.com/madler/zlib/blob/5c42a230b7b468dff011f444161c0145b5efae59/contrib/puff/puff.c#L340 - fn checkCompleteness(lens: []const u4) !void { - if (alphabet_size == 286) - if (lens[256] == 0) return error.MissingEndOfBlockCode; - - var count = [_]u16{0} ** (@as(usize, max_code_bits) + 1); - var max: usize = 0; - for (lens) |n| { - if (n == 0) continue; - if (n > max) max = n; - count[n] += 1; - } - if (max == 0) // empty tree - return; - - // check for an over-subscribed or incomplete set of lengths - var left: usize = 1; // one possible code of zero length - for (1..count.len) |len| { - left <<= 1; // one more bit, double codes left - if (count[len] > left) - return error.OversubscribedHuffmanTree; - left -= count[len]; // deduct count from possible codes - } - if (left > 0) { // left > 0 means incomplete - // incomplete code ok only for single length 1 code - if (max_code_bits > 7 and max == count[0] + count[1]) return; - return error.IncompleteHuffmanTree; - } - } - - /// Finds symbol for lookup table code. 
- pub fn find(self: *Self, code: u16) !Symbol { - // try to find in lookup table - const idx = code >> lookup_shift; - const sym = self.lookup[idx]; - if (sym.code_bits != 0) return sym; - // if not use linked list of symbols with same prefix - return self.findLinked(code, sym.next); - } - - inline fn findLinked(self: *Self, code: u16, start: u16) !Symbol { - var pos = start; - while (pos > 0) { - const sym = self.symbols[pos]; - const shift = max_code_bits - sym.code_bits; - // compare code_bits number of upper bits - if ((code ^ sym.code) >> shift == 0) return sym; - pos = sym.next; - } - return error.InvalidCode; - } - }; -} - -test "init/find" { - // example data from: https://youtu.be/SJPvNi4HrWQ?t=8423 - const code_lens = [_]u4{ 4, 3, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 2 }; - var h: CodegenDecoder = .{}; - try h.generate(&code_lens); - - const expected = [_]struct { - sym: Symbol, - code: u16, - }{ - .{ - .code = 0b00_00000, - .sym = .{ .symbol = 3, .code_bits = 2 }, - }, - .{ - .code = 0b01_00000, - .sym = .{ .symbol = 18, .code_bits = 2 }, - }, - .{ - .code = 0b100_0000, - .sym = .{ .symbol = 1, .code_bits = 3 }, - }, - .{ - .code = 0b101_0000, - .sym = .{ .symbol = 4, .code_bits = 3 }, - }, - .{ - .code = 0b110_0000, - .sym = .{ .symbol = 17, .code_bits = 3 }, - }, - .{ - .code = 0b1110_000, - .sym = .{ .symbol = 0, .code_bits = 4 }, - }, - .{ - .code = 0b1111_000, - .sym = .{ .symbol = 16, .code_bits = 4 }, - }, - }; - - // unused symbols - for (0..12) |i| { - try testing.expectEqual(0, h.symbols[i].code_bits); - } - // used, from index 12 - for (expected, 12..) |e, i| { - try testing.expectEqual(e.sym.symbol, h.symbols[i].symbol); - try testing.expectEqual(e.sym.code_bits, h.symbols[i].code_bits); - const sym_from_code = try h.find(e.code); - try testing.expectEqual(e.sym.symbol, sym_from_code.symbol); - } - - // All possible codes for each symbol. - // Lookup table has 126 elements, to cover all possible 7 bit codes. 
- for (0b0000_000..0b0100_000) |c| // 0..32 (32) - try testing.expectEqual(3, (try h.find(@intCast(c))).symbol); - - for (0b0100_000..0b1000_000) |c| // 32..64 (32) - try testing.expectEqual(18, (try h.find(@intCast(c))).symbol); - - for (0b1000_000..0b1010_000) |c| // 64..80 (16) - try testing.expectEqual(1, (try h.find(@intCast(c))).symbol); - - for (0b1010_000..0b1100_000) |c| // 80..96 (16) - try testing.expectEqual(4, (try h.find(@intCast(c))).symbol); - - for (0b1100_000..0b1110_000) |c| // 96..112 (16) - try testing.expectEqual(17, (try h.find(@intCast(c))).symbol); - - for (0b1110_000..0b1111_000) |c| // 112..120 (8) - try testing.expectEqual(0, (try h.find(@intCast(c))).symbol); - - for (0b1111_000..0b1_0000_000) |c| // 120...128 (8) - try testing.expectEqual(16, (try h.find(@intCast(c))).symbol); -} - -test "encode/decode literals" { - const LiteralEncoder = @import("huffman_encoder.zig").LiteralEncoder; - - for (1..286) |j| { // for all different number of codes - var enc: LiteralEncoder = .{}; - // create frequencies - var freq = [_]u16{0} ** 286; - freq[256] = 1; // ensure we have end of block code - for (&freq, 1..) |*f, i| { - if (i % j == 0) - f.* = @intCast(i); - } - - // encoder from frequencies - enc.generate(&freq, 15); - - // get code_lens from encoder - var code_lens = [_]u4{0} ** 286; - for (code_lens, 0..) 
|_, i| { - code_lens[i] = @intCast(enc.codes[i].len); - } - // generate decoder from code lens - var dec: LiteralDecoder = .{}; - try dec.generate(&code_lens); - - // expect decoder code to match original encoder code - for (dec.symbols) |s| { - if (s.code_bits == 0) continue; - const c_code: u16 = @bitReverse(@as(u15, @intCast(s.code))); - const symbol: u16 = switch (s.kind) { - .literal => s.symbol, - .end_of_block => 256, - .match => @as(u16, s.symbol) + 257, - }; - - const c = enc.codes[symbol]; - try testing.expect(c.code == c_code); - } - - // find each symbol by code - for (enc.codes) |c| { - if (c.len == 0) continue; - - const s_code: u15 = @bitReverse(@as(u15, @intCast(c.code))); - const s = try dec.find(s_code); - try testing.expect(s.code == s_code); - try testing.expect(s.code_bits == c.len); - } - } -} diff --git a/lib/std/compress/flate/huffman_encoder.zig b/lib/std/compress/flate/huffman_encoder.zig deleted file mode 100644 index 3e92e55a63..0000000000 --- a/lib/std/compress/flate/huffman_encoder.zig +++ /dev/null @@ -1,536 +0,0 @@ -const std = @import("std"); -const assert = std.debug.assert; -const math = std.math; -const mem = std.mem; -const sort = std.sort; -const testing = std.testing; - -const consts = @import("consts.zig").huffman; - -const LiteralNode = struct { - literal: u16, - freq: u16, -}; - -// Describes the state of the constructed tree for a given depth. -const LevelInfo = struct { - // Our level. for better printing - level: u32, - - // The frequency of the last node at this level - last_freq: u32, - - // The frequency of the next character to add to this level - next_char_freq: u32, - - // The frequency of the next pair (from level below) to add to this level. - // Only valid if the "needed" value of the next lower level is 0. - next_pair_freq: u32, - - // The number of chains remaining to generate for this level before moving - // up to the next level - needed: u32, -}; - -// hcode is a huffman code with a bit code and bit length. 
-pub const HuffCode = struct { - code: u16 = 0, - len: u16 = 0, - - // set sets the code and length of an hcode. - fn set(self: *HuffCode, code: u16, length: u16) void { - self.len = length; - self.code = code; - } -}; - -pub fn HuffmanEncoder(comptime size: usize) type { - return struct { - codes: [size]HuffCode = undefined, - // Reusable buffer with the longest possible frequency table. - freq_cache: [consts.max_num_frequencies + 1]LiteralNode = undefined, - bit_count: [17]u32 = undefined, - lns: []LiteralNode = undefined, // sorted by literal, stored to avoid repeated allocation in generate - lfs: []LiteralNode = undefined, // sorted by frequency, stored to avoid repeated allocation in generate - - const Self = @This(); - - // Update this Huffman Code object to be the minimum code for the specified frequency count. - // - // freq An array of frequencies, in which frequency[i] gives the frequency of literal i. - // max_bits The maximum number of bits to use for any literal. - pub fn generate(self: *Self, freq: []u16, max_bits: u32) void { - var list = self.freq_cache[0 .. freq.len + 1]; - // Number of non-zero literals - var count: u32 = 0; - // Set list to be the set of all non-zero literals and their frequencies - for (freq, 0..) |f, i| { - if (f != 0) { - list[count] = LiteralNode{ .literal = @as(u16, @intCast(i)), .freq = f }; - count += 1; - } else { - list[count] = LiteralNode{ .literal = 0x00, .freq = 0 }; - self.codes[i].len = 0; - } - } - list[freq.len] = LiteralNode{ .literal = 0x00, .freq = 0 }; - - list = list[0..count]; - if (count <= 2) { - // Handle the small cases here, because they are awkward for the general case code. With - // two or fewer literals, everything has bit length 1. - for (list, 0..) |node, i| { - // "list" is in order of increasing literal value. 
- self.codes[node.literal].set(@as(u16, @intCast(i)), 1); - } - return; - } - self.lfs = list; - mem.sort(LiteralNode, self.lfs, {}, byFreq); - - // Get the number of literals for each bit count - const bit_count = self.bitCounts(list, max_bits); - // And do the assignment - self.assignEncodingAndSize(bit_count, list); - } - - pub fn bitLength(self: *Self, freq: []u16) u32 { - var total: u32 = 0; - for (freq, 0..) |f, i| { - if (f != 0) { - total += @as(u32, @intCast(f)) * @as(u32, @intCast(self.codes[i].len)); - } - } - return total; - } - - // Return the number of literals assigned to each bit size in the Huffman encoding - // - // This method is only called when list.len >= 3 - // The cases of 0, 1, and 2 literals are handled by special case code. - // - // list: An array of the literals with non-zero frequencies - // and their associated frequencies. The array is in order of increasing - // frequency, and has as its last element a special element with frequency - // std.math.maxInt(i32) - // - // max_bits: The maximum number of bits that should be used to encode any literal. - // Must be less than 16. - // - // Returns an integer array in which array[i] indicates the number of literals - // that should be encoded in i bits. - fn bitCounts(self: *Self, list: []LiteralNode, max_bits_to_use: usize) []u32 { - var max_bits = max_bits_to_use; - const n = list.len; - const max_bits_limit = 16; - - assert(max_bits < max_bits_limit); - - // The tree can't have greater depth than n - 1, no matter what. This - // saves a little bit of work in some small cases - max_bits = @min(max_bits, n - 1); - - // Create information about each of the levels. - // A bogus "Level 0" whose sole purpose is so that - // level1.prev.needed == 0. This makes level1.next_pair_freq - // be a legitimate value that never gets chosen. 
- var levels: [max_bits_limit]LevelInfo = mem.zeroes([max_bits_limit]LevelInfo); - // leaf_counts[i] counts the number of literals at the left - // of ancestors of the rightmost node at level i. - // leaf_counts[i][j] is the number of literals at the left - // of the level j ancestor. - var leaf_counts: [max_bits_limit][max_bits_limit]u32 = mem.zeroes([max_bits_limit][max_bits_limit]u32); - - { - var level = @as(u32, 1); - while (level <= max_bits) : (level += 1) { - // For every level, the first two items are the first two characters. - // We initialize the levels as if we had already figured this out. - levels[level] = LevelInfo{ - .level = level, - .last_freq = list[1].freq, - .next_char_freq = list[2].freq, - .next_pair_freq = list[0].freq + list[1].freq, - .needed = 0, - }; - leaf_counts[level][level] = 2; - if (level == 1) { - levels[level].next_pair_freq = math.maxInt(i32); - } - } - } - - // We need a total of 2*n - 2 items at top level and have already generated 2. - levels[max_bits].needed = 2 * @as(u32, @intCast(n)) - 4; - - { - var level = max_bits; - while (true) { - var l = &levels[level]; - if (l.next_pair_freq == math.maxInt(i32) and l.next_char_freq == math.maxInt(i32)) { - // We've run out of both leaves and pairs. - // End all calculations for this level. - // To make sure we never come back to this level or any lower level, - // set next_pair_freq impossibly large. - l.needed = 0; - levels[level + 1].next_pair_freq = math.maxInt(i32); - level += 1; - continue; - } - - const prev_freq = l.last_freq; - if (l.next_char_freq < l.next_pair_freq) { - // The next item on this row is a leaf node. - const next = leaf_counts[level][level] + 1; - l.last_freq = l.next_char_freq; - // Lower leaf_counts are the same of the previous node. 
- leaf_counts[level][level] = next; - if (next >= list.len) { - l.next_char_freq = maxNode().freq; - } else { - l.next_char_freq = list[next].freq; - } - } else { - // The next item on this row is a pair from the previous row. - // next_pair_freq isn't valid until we generate two - // more values in the level below - l.last_freq = l.next_pair_freq; - // Take leaf counts from the lower level, except counts[level] remains the same. - @memcpy(leaf_counts[level][0..level], leaf_counts[level - 1][0..level]); - levels[l.level - 1].needed = 2; - } - - l.needed -= 1; - if (l.needed == 0) { - // We've done everything we need to do for this level. - // Continue calculating one level up. Fill in next_pair_freq - // of that level with the sum of the two nodes we've just calculated on - // this level. - if (l.level == max_bits) { - // All done! - break; - } - levels[l.level + 1].next_pair_freq = prev_freq + l.last_freq; - level += 1; - } else { - // If we stole from below, move down temporarily to replenish it. - while (levels[level - 1].needed > 0) { - level -= 1; - if (level == 0) { - break; - } - } - } - } - } - - // Somethings is wrong if at the end, the top level is null or hasn't used - // all of the leaves. - assert(leaf_counts[max_bits][max_bits] == n); - - var bit_count = self.bit_count[0 .. max_bits + 1]; - var bits: u32 = 1; - const counts = &leaf_counts[max_bits]; - { - var level = max_bits; - while (level > 0) : (level -= 1) { - // counts[level] gives the number of literals requiring at least "bits" - // bits to encode. - bit_count[bits] = counts[level] - counts[level - 1]; - bits += 1; - if (level == 0) { - break; - } - } - } - return bit_count; - } - - // Look at the leaves and assign them a bit count and an encoding as specified - // in RFC 1951 3.2.2 - fn assignEncodingAndSize(self: *Self, bit_count: []u32, list_arg: []LiteralNode) void { - var code = @as(u16, 0); - var list = list_arg; - - for (bit_count, 0..) 
|bits, n| { - code <<= 1; - if (n == 0 or bits == 0) { - continue; - } - // The literals list[list.len-bits] .. list[list.len-bits] - // are encoded using "bits" bits, and get the values - // code, code + 1, .... The code values are - // assigned in literal order (not frequency order). - const chunk = list[list.len - @as(u32, @intCast(bits)) ..]; - - self.lns = chunk; - mem.sort(LiteralNode, self.lns, {}, byLiteral); - - for (chunk) |node| { - self.codes[node.literal] = HuffCode{ - .code = bitReverse(u16, code, @as(u5, @intCast(n))), - .len = @as(u16, @intCast(n)), - }; - code += 1; - } - list = list[0 .. list.len - @as(u32, @intCast(bits))]; - } - } - }; -} - -fn maxNode() LiteralNode { - return LiteralNode{ - .literal = math.maxInt(u16), - .freq = math.maxInt(u16), - }; -} - -pub fn huffmanEncoder(comptime size: u32) HuffmanEncoder(size) { - return .{}; -} - -pub const LiteralEncoder = HuffmanEncoder(consts.max_num_frequencies); -pub const DistanceEncoder = HuffmanEncoder(consts.distance_code_count); -pub const CodegenEncoder = HuffmanEncoder(19); - -// Generates a HuffmanCode corresponding to the fixed literal table -pub fn fixedLiteralEncoder() LiteralEncoder { - var h: LiteralEncoder = undefined; - var ch: u16 = 0; - - while (ch < consts.max_num_frequencies) : (ch += 1) { - var bits: u16 = undefined; - var size: u16 = undefined; - switch (ch) { - 0...143 => { - // size 8, 000110000 .. 10111111 - bits = ch + 48; - size = 8; - }, - 144...255 => { - // size 9, 110010000 .. 111111111 - bits = ch + 400 - 144; - size = 9; - }, - 256...279 => { - // size 7, 0000000 .. 0010111 - bits = ch - 256; - size = 7; - }, - else => { - // size 8, 11000000 .. 11000111 - bits = ch + 192 - 280; - size = 8; - }, - } - h.codes[ch] = HuffCode{ .code = bitReverse(u16, bits, @as(u5, @intCast(size))), .len = size }; - } - return h; -} - -pub fn fixedDistanceEncoder() DistanceEncoder { - var h: DistanceEncoder = undefined; - for (h.codes, 0..) 
|_, ch| { - h.codes[ch] = HuffCode{ .code = bitReverse(u16, @as(u16, @intCast(ch)), 5), .len = 5 }; - } - return h; -} - -pub fn huffmanDistanceEncoder() DistanceEncoder { - var distance_freq = [1]u16{0} ** consts.distance_code_count; - distance_freq[0] = 1; - // huff_distance is a static distance encoder used for huffman only encoding. - // It can be reused since we will not be encoding distance values. - var h: DistanceEncoder = .{}; - h.generate(distance_freq[0..], 15); - return h; -} - -fn byLiteral(context: void, a: LiteralNode, b: LiteralNode) bool { - _ = context; - return a.literal < b.literal; -} - -fn byFreq(context: void, a: LiteralNode, b: LiteralNode) bool { - _ = context; - if (a.freq == b.freq) { - return a.literal < b.literal; - } - return a.freq < b.freq; -} - -test "generate a Huffman code from an array of frequencies" { - var freqs: [19]u16 = [_]u16{ - 8, // 0 - 1, // 1 - 1, // 2 - 2, // 3 - 5, // 4 - 10, // 5 - 9, // 6 - 1, // 7 - 0, // 8 - 0, // 9 - 0, // 10 - 0, // 11 - 0, // 12 - 0, // 13 - 0, // 14 - 0, // 15 - 1, // 16 - 3, // 17 - 5, // 18 - }; - - var enc = huffmanEncoder(19); - enc.generate(freqs[0..], 7); - - try testing.expectEqual(@as(u32, 141), enc.bitLength(freqs[0..])); - - try testing.expectEqual(@as(usize, 3), enc.codes[0].len); - try testing.expectEqual(@as(usize, 6), enc.codes[1].len); - try testing.expectEqual(@as(usize, 6), enc.codes[2].len); - try testing.expectEqual(@as(usize, 5), enc.codes[3].len); - try testing.expectEqual(@as(usize, 3), enc.codes[4].len); - try testing.expectEqual(@as(usize, 2), enc.codes[5].len); - try testing.expectEqual(@as(usize, 2), enc.codes[6].len); - try testing.expectEqual(@as(usize, 6), enc.codes[7].len); - try testing.expectEqual(@as(usize, 0), enc.codes[8].len); - try testing.expectEqual(@as(usize, 0), enc.codes[9].len); - try testing.expectEqual(@as(usize, 0), enc.codes[10].len); - try testing.expectEqual(@as(usize, 0), enc.codes[11].len); - try testing.expectEqual(@as(usize, 0), 
enc.codes[12].len); - try testing.expectEqual(@as(usize, 0), enc.codes[13].len); - try testing.expectEqual(@as(usize, 0), enc.codes[14].len); - try testing.expectEqual(@as(usize, 0), enc.codes[15].len); - try testing.expectEqual(@as(usize, 6), enc.codes[16].len); - try testing.expectEqual(@as(usize, 5), enc.codes[17].len); - try testing.expectEqual(@as(usize, 3), enc.codes[18].len); - - try testing.expectEqual(@as(u16, 0x0), enc.codes[5].code); - try testing.expectEqual(@as(u16, 0x2), enc.codes[6].code); - try testing.expectEqual(@as(u16, 0x1), enc.codes[0].code); - try testing.expectEqual(@as(u16, 0x5), enc.codes[4].code); - try testing.expectEqual(@as(u16, 0x3), enc.codes[18].code); - try testing.expectEqual(@as(u16, 0x7), enc.codes[3].code); - try testing.expectEqual(@as(u16, 0x17), enc.codes[17].code); - try testing.expectEqual(@as(u16, 0x0f), enc.codes[1].code); - try testing.expectEqual(@as(u16, 0x2f), enc.codes[2].code); - try testing.expectEqual(@as(u16, 0x1f), enc.codes[7].code); - try testing.expectEqual(@as(u16, 0x3f), enc.codes[16].code); -} - -test "generate a Huffman code for the fixed literal table specific to Deflate" { - const enc = fixedLiteralEncoder(); - for (enc.codes) |c| { - switch (c.len) { - 7 => { - const v = @bitReverse(@as(u7, @intCast(c.code))); - try testing.expect(v <= 0b0010111); - }, - 8 => { - const v = @bitReverse(@as(u8, @intCast(c.code))); - try testing.expect((v >= 0b000110000 and v <= 0b10111111) or - (v >= 0b11000000 and v <= 11000111)); - }, - 9 => { - const v = @bitReverse(@as(u9, @intCast(c.code))); - try testing.expect(v >= 0b110010000 and v <= 0b111111111); - }, - else => unreachable, - } - } -} - -test "generate a Huffman code for the 30 possible relative distances (LZ77 distances) of Deflate" { - const enc = fixedDistanceEncoder(); - for (enc.codes) |c| { - const v = @bitReverse(@as(u5, @intCast(c.code))); - try testing.expect(v <= 29); - try testing.expect(c.len == 5); - } -} - -// Reverse bit-by-bit a N-bit code. 
-fn bitReverse(comptime T: type, value: T, n: usize) T { - const r = @bitReverse(value); - return r >> @as(math.Log2Int(T), @intCast(@typeInfo(T).int.bits - n)); -} - -test bitReverse { - const ReverseBitsTest = struct { - in: u16, - bit_count: u5, - out: u16, - }; - - const reverse_bits_tests = [_]ReverseBitsTest{ - .{ .in = 1, .bit_count = 1, .out = 1 }, - .{ .in = 1, .bit_count = 2, .out = 2 }, - .{ .in = 1, .bit_count = 3, .out = 4 }, - .{ .in = 1, .bit_count = 4, .out = 8 }, - .{ .in = 1, .bit_count = 5, .out = 16 }, - .{ .in = 17, .bit_count = 5, .out = 17 }, - .{ .in = 257, .bit_count = 9, .out = 257 }, - .{ .in = 29, .bit_count = 5, .out = 23 }, - }; - - for (reverse_bits_tests) |h| { - const v = bitReverse(u16, h.in, h.bit_count); - try std.testing.expectEqual(h.out, v); - } -} - -test "fixedLiteralEncoder codes" { - var al = std.ArrayList(u8).init(testing.allocator); - defer al.deinit(); - var bw = std.io.bitWriter(.little, al.writer()); - - const f = fixedLiteralEncoder(); - for (f.codes) |c| { - try bw.writeBits(c.code, c.len); - } - try testing.expectEqualSlices(u8, &fixed_codes, al.items); -} - -pub const fixed_codes = [_]u8{ - 0b00001100, 0b10001100, 0b01001100, 0b11001100, 0b00101100, 0b10101100, 0b01101100, 0b11101100, - 0b00011100, 0b10011100, 0b01011100, 0b11011100, 0b00111100, 0b10111100, 0b01111100, 0b11111100, - 0b00000010, 0b10000010, 0b01000010, 0b11000010, 0b00100010, 0b10100010, 0b01100010, 0b11100010, - 0b00010010, 0b10010010, 0b01010010, 0b11010010, 0b00110010, 0b10110010, 0b01110010, 0b11110010, - 0b00001010, 0b10001010, 0b01001010, 0b11001010, 0b00101010, 0b10101010, 0b01101010, 0b11101010, - 0b00011010, 0b10011010, 0b01011010, 0b11011010, 0b00111010, 0b10111010, 0b01111010, 0b11111010, - 0b00000110, 0b10000110, 0b01000110, 0b11000110, 0b00100110, 0b10100110, 0b01100110, 0b11100110, - 0b00010110, 0b10010110, 0b01010110, 0b11010110, 0b00110110, 0b10110110, 0b01110110, 0b11110110, - 0b00001110, 0b10001110, 0b01001110, 0b11001110, 
0b00101110, 0b10101110, 0b01101110, 0b11101110, - 0b00011110, 0b10011110, 0b01011110, 0b11011110, 0b00111110, 0b10111110, 0b01111110, 0b11111110, - 0b00000001, 0b10000001, 0b01000001, 0b11000001, 0b00100001, 0b10100001, 0b01100001, 0b11100001, - 0b00010001, 0b10010001, 0b01010001, 0b11010001, 0b00110001, 0b10110001, 0b01110001, 0b11110001, - 0b00001001, 0b10001001, 0b01001001, 0b11001001, 0b00101001, 0b10101001, 0b01101001, 0b11101001, - 0b00011001, 0b10011001, 0b01011001, 0b11011001, 0b00111001, 0b10111001, 0b01111001, 0b11111001, - 0b00000101, 0b10000101, 0b01000101, 0b11000101, 0b00100101, 0b10100101, 0b01100101, 0b11100101, - 0b00010101, 0b10010101, 0b01010101, 0b11010101, 0b00110101, 0b10110101, 0b01110101, 0b11110101, - 0b00001101, 0b10001101, 0b01001101, 0b11001101, 0b00101101, 0b10101101, 0b01101101, 0b11101101, - 0b00011101, 0b10011101, 0b01011101, 0b11011101, 0b00111101, 0b10111101, 0b01111101, 0b11111101, - 0b00010011, 0b00100110, 0b01001110, 0b10011010, 0b00111100, 0b01100101, 0b11101010, 0b10110100, - 0b11101001, 0b00110011, 0b01100110, 0b11001110, 0b10011010, 0b00111101, 0b01100111, 0b11101110, - 0b10111100, 0b11111001, 0b00001011, 0b00010110, 0b00101110, 0b01011010, 0b10111100, 0b01100100, - 0b11101001, 0b10110010, 0b11100101, 0b00101011, 0b01010110, 0b10101110, 0b01011010, 0b10111101, - 0b01100110, 0b11101101, 0b10111010, 0b11110101, 0b00011011, 0b00110110, 0b01101110, 0b11011010, - 0b10111100, 0b01100101, 0b11101011, 0b10110110, 0b11101101, 0b00111011, 0b01110110, 0b11101110, - 0b11011010, 0b10111101, 0b01100111, 0b11101111, 0b10111110, 0b11111101, 0b00000111, 0b00001110, - 0b00011110, 0b00111010, 0b01111100, 0b11100100, 0b11101000, 0b10110001, 0b11100011, 0b00100111, - 0b01001110, 0b10011110, 0b00111010, 0b01111101, 0b11100110, 0b11101100, 0b10111001, 0b11110011, - 0b00010111, 0b00101110, 0b01011110, 0b10111010, 0b01111100, 0b11100101, 0b11101010, 0b10110101, - 0b11101011, 0b00110111, 0b01101110, 0b11011110, 0b10111010, 0b01111101, 0b11100111, 
0b11101110, - 0b10111101, 0b11111011, 0b00001111, 0b00011110, 0b00111110, 0b01111010, 0b11111100, 0b11100100, - 0b11101001, 0b10110011, 0b11100111, 0b00101111, 0b01011110, 0b10111110, 0b01111010, 0b11111101, - 0b11100110, 0b11101101, 0b10111011, 0b11110111, 0b00011111, 0b00111110, 0b01111110, 0b11111010, - 0b11111100, 0b11100101, 0b11101011, 0b10110111, 0b11101111, 0b00111111, 0b01111110, 0b11111110, - 0b11111010, 0b11111101, 0b11100111, 0b11101111, 0b10111111, 0b11111111, 0b00000000, 0b00100000, - 0b00001000, 0b00001100, 0b10000001, 0b11000010, 0b11100000, 0b00001000, 0b00100100, 0b00001010, - 0b10001101, 0b11000001, 0b11100010, 0b11110000, 0b00000100, 0b00100010, 0b10001001, 0b01001100, - 0b10100001, 0b11010010, 0b11101000, 0b00000011, 0b10000011, 0b01000011, 0b11000011, 0b00100011, - 0b10100011, -}; diff --git a/lib/std/compress/flate/inflate.zig b/lib/std/compress/flate/inflate.zig deleted file mode 100644 index 166ea94cd4..0000000000 --- a/lib/std/compress/flate/inflate.zig +++ /dev/null @@ -1,1045 +0,0 @@ -const std = @import("std"); -const assert = std.debug.assert; -const testing = std.testing; - -const hfd = @import("huffman_decoder.zig"); -const CircularBuffer = @import("CircularBuffer.zig"); -const Container = @import("container.zig").Container; -const Token = @import("Token.zig"); -const codegen_order = @import("consts.zig").huffman.codegen_order; - -/// Decompresses deflate bit stream `reader` and writes uncompressed data to the -/// `writer` stream. -pub fn decompress(comptime container: Container, reader: *std.io.BufferedReader, writer: *std.io.BufferedWriter) !void { - var d = decompressor(container, reader); - try d.decompress(writer); -} - -/// Inflate decompressor for the reader type. 
-pub fn decompressor(comptime container: Container, reader: *std.io.BufferedReader) Decompressor(container) { - return Decompressor(container).init(reader); -} - -pub fn Decompressor(comptime container: Container) type { - // zlib has 4 bytes footer, lookahead of 4 bytes ensures that we will not overshoot. - // gzip has 8 bytes footer so we will not overshoot even with 8 bytes of lookahead. - // For raw deflate there is always possibility of overshot so we use 8 bytes lookahead. - const lookahead: type = if (container == .zlib) u32 else u64; - return Inflate(container, lookahead); -} - -/// Inflate decompresses deflate bit stream. Reads compressed data from reader -/// provided in init. Decompressed data are stored in internal hist buffer and -/// can be accesses iterable `next` or reader interface. -/// -/// Container defines header/footer wrapper around deflate bit stream. Can be -/// gzip or zlib. -/// -/// Deflate bit stream consists of multiple blocks. Block can be one of three types: -/// * stored, non compressed, max 64k in size -/// * fixed, huffman codes are predefined -/// * dynamic, huffman code tables are encoded at the block start -/// -/// `step` function runs decoder until internal `hist` buffer is full. Client -/// than needs to read that data in order to proceed with decoding. -/// -/// Allocates 74.5K of internal buffers, most important are: -/// * 64K for history (CircularBuffer) -/// * ~10K huffman decoders (Literal and DistanceDecoder) -/// -pub fn Inflate(comptime container: Container, comptime Lookahead: type) type { - assert(Lookahead == u32 or Lookahead == u64); - const LookaheadBitReader = BitReader(Lookahead); - - return struct { - bits: LookaheadBitReader, - hist: CircularBuffer = .{}, - // Hashes, produces checksum, of uncompressed data for gzip/zlib footer. 
- hasher: container.Hasher() = .{}, - - // dynamic block huffman code decoders - lit_dec: hfd.LiteralDecoder = .{}, // literals - dst_dec: hfd.DistanceDecoder = .{}, // distances - - // current read state - bfinal: u1 = 0, - block_type: u2 = 0b11, - state: ReadState = .protocol_header, - - read_err: Error!void = {}, - - const ReadState = enum { - protocol_header, - block_header, - block, - protocol_footer, - end, - }; - - const Self = @This(); - - pub const Error = Container.Error || hfd.Error || error{ - InvalidCode, - InvalidMatch, - InvalidBlockType, - WrongStoredBlockNlen, - InvalidDynamicBlockHeader, - EndOfStream, - ReadFailed, - }; - - pub fn init(bw: *std.io.BufferedReader) Self { - return .{ .bits = LookaheadBitReader.init(bw) }; - } - - fn blockHeader(self: *Self) Error!void { - self.bfinal = try self.bits.read(u1); - self.block_type = try self.bits.read(u2); - } - - fn storedBlock(self: *Self) !bool { - self.bits.alignToByte(); // skip padding until byte boundary - // everything after this is byte aligned in stored block - var len = try self.bits.read(u16); - const nlen = try self.bits.read(u16); - if (len != ~nlen) return error.WrongStoredBlockNlen; - - while (len > 0) { - const buf = self.hist.getWritable(len); - try self.bits.readAll(buf); - len -= @intCast(buf.len); - } - return true; - } - - fn fixedBlock(self: *Self) !bool { - while (!self.hist.full()) { - const code = try self.bits.readFixedCode(); - switch (code) { - 0...255 => self.hist.write(@intCast(code)), - 256 => return true, // end of block - 257...285 => try self.fixedDistanceCode(@intCast(code - 257)), - else => return error.InvalidCode, - } - } - return false; - } - - // Handles fixed block non literal (length) code. - // Length code is followed by 5 bits of distance code. 
- fn fixedDistanceCode(self: *Self, code: u8) !void { - try self.bits.fill(5 + 5 + 13); - const length = try self.decodeLength(code); - const distance = try self.decodeDistance(try self.bits.readF(u5, .{ - .buffered = true, - .reverse = true, - })); - try self.hist.writeMatch(length, distance); - } - - inline fn decodeLength(self: *Self, code: u8) !u16 { - if (code > 28) return error.InvalidCode; - const ml = Token.matchLength(code); - return if (ml.extra_bits == 0) // 0 - 5 extra bits - ml.base - else - ml.base + try self.bits.readN(ml.extra_bits, .{ .buffered = true }); - } - - fn decodeDistance(self: *Self, code: u8) !u16 { - if (code > 29) return error.InvalidCode; - const md = Token.matchDistance(code); - return if (md.extra_bits == 0) // 0 - 13 extra bits - md.base - else - md.base + try self.bits.readN(md.extra_bits, .{ .buffered = true }); - } - - fn dynamicBlockHeader(self: *Self) !void { - const hlit: u16 = @as(u16, try self.bits.read(u5)) + 257; // number of ll code entries present - 257 - const hdist: u16 = @as(u16, try self.bits.read(u5)) + 1; // number of distance code entries - 1 - const hclen: u8 = @as(u8, try self.bits.read(u4)) + 4; // hclen + 4 code lengths are encoded - - if (hlit > 286 or hdist > 30) - return error.InvalidDynamicBlockHeader; - - // lengths for code lengths - var cl_lens = [_]u4{0} ** 19; - for (0..hclen) |i| { - cl_lens[codegen_order[i]] = try self.bits.read(u3); - } - var cl_dec: hfd.CodegenDecoder = .{}; - try cl_dec.generate(&cl_lens); - - // decoded code lengths - var dec_lens = [_]u4{0} ** (286 + 30); - var pos: usize = 0; - while (pos < hlit + hdist) { - const sym = try cl_dec.find(try self.bits.peekF(u7, .{ .reverse = true })); - try self.bits.shift(sym.code_bits); - pos += try self.dynamicCodeLength(sym.symbol, &dec_lens, pos); - } - if (pos > hlit + hdist) { - return error.InvalidDynamicBlockHeader; - } - - // literal code lengths to literal decoder - try self.lit_dec.generate(dec_lens[0..hlit]); - - // distance code 
lengths to distance decoder - try self.dst_dec.generate(dec_lens[hlit .. hlit + hdist]); - } - - // Decode code length symbol to code length. Writes decoded length into - // lens slice starting at position pos. Returns number of positions - // advanced. - fn dynamicCodeLength(self: *Self, code: u16, lens: []u4, pos: usize) !usize { - if (pos >= lens.len) - return error.InvalidDynamicBlockHeader; - - switch (code) { - 0...15 => { - // Represent code lengths of 0 - 15 - lens[pos] = @intCast(code); - return 1; - }, - 16 => { - // Copy the previous code length 3 - 6 times. - // The next 2 bits indicate repeat length - const n: u8 = @as(u8, try self.bits.read(u2)) + 3; - if (pos == 0 or pos + n > lens.len) - return error.InvalidDynamicBlockHeader; - for (0..n) |i| { - lens[pos + i] = lens[pos + i - 1]; - } - return n; - }, - // Repeat a code length of 0 for 3 - 10 times. (3 bits of length) - 17 => return @as(u8, try self.bits.read(u3)) + 3, - // Repeat a code length of 0 for 11 - 138 times (7 bits of length) - 18 => return @as(u8, try self.bits.read(u7)) + 11, - else => return error.InvalidDynamicBlockHeader, - } - } - - // In larger archives most blocks are usually dynamic, so decompression - // performance depends on this function. - fn dynamicBlock(self: *Self) !bool { - // Hot path loop! 
- while (!self.hist.full()) { - try self.bits.fill(15); // optimization so other bit reads can be buffered (avoiding one `if` in hot path) - const sym = try self.decodeSymbol(&self.lit_dec); - - switch (sym.kind) { - .literal => self.hist.write(sym.symbol), - .match => { // Decode match backreference - // fill so we can use buffered reads - if (Lookahead == u32) - try self.bits.fill(5 + 15) - else - try self.bits.fill(5 + 15 + 13); - const length = try self.decodeLength(sym.symbol); - const dsm = try self.decodeSymbol(&self.dst_dec); - if (Lookahead == u32) try self.bits.fill(13); - const distance = try self.decodeDistance(dsm.symbol); - try self.hist.writeMatch(length, distance); - }, - .end_of_block => return true, - } - } - return false; - } - - // Peek 15 bits from bits reader (maximum code len is 15 bits). Use - // decoder to find symbol for that code. We then know how many bits is - // used. Shift bit reader for that much bits, those bits are used. And - // return symbol. - fn decodeSymbol(self: *Self, decoder: anytype) !hfd.Symbol { - const sym = try decoder.find(try self.bits.peekF(u15, .{ .buffered = true, .reverse = true })); - try self.bits.shift(sym.code_bits); - return sym; - } - - fn step(self: *Self) !void { - switch (self.state) { - .protocol_header => { - try container.parseHeader(&self.bits); - self.state = .block_header; - }, - .block_header => { - try self.blockHeader(); - self.state = .block; - if (self.block_type == 2) try self.dynamicBlockHeader(); - }, - .block => { - const done = switch (self.block_type) { - 0 => try self.storedBlock(), - 1 => try self.fixedBlock(), - 2 => try self.dynamicBlock(), - else => return error.InvalidBlockType, - }; - if (done) { - self.state = if (self.bfinal == 1) .protocol_footer else .block_header; - } - }, - .protocol_footer => { - self.bits.alignToByte(); - try container.parseFooter(&self.hasher, &self.bits); - self.state = .end; - }, - .end => {}, - } - } - - /// Replaces the inner reader with new reader. 
- pub fn setReader(self: *Self, new_reader: *std.io.BufferedReader) void { - self.bits.forward_reader = new_reader; - if (self.state == .end or self.state == .protocol_footer) { - self.state = .protocol_header; - } - } - - // Reads all compressed data from the internal reader and outputs plain - // (uncompressed) data to the provided writer. - pub fn decompress(self: *Self, writer: *std.io.BufferedWriter) !void { - while (try self.next()) |buf| { - try writer.writeAll(buf); - } - } - - /// Returns the number of bytes that have been read from the internal - /// reader but not yet consumed by the decompressor. - pub fn unreadBytes(self: Self) usize { - // There can be no error here: the denominator is not zero, and - // overflow is not possible since the type is unsigned. - return std.math.divCeil(usize, self.bits.nbits, 8) catch unreachable; - } - - // Iterator interface - - /// Can be used in iterator like loop without memcpy to another buffer: - /// while (try inflate.next()) |buf| { ... } - pub fn next(self: *Self) Error!?[]const u8 { - const out = try self.get(0); - if (out.len == 0) return null; - return out; - } - - /// Returns decompressed data from internal sliding window buffer. - /// Returned buffer can be any length between 0 and `limit` bytes. 0 - /// returned bytes means end of stream reached. With limit=0 returns as - /// much data it can. It newer will be more than 65536 bytes, which is - /// size of internal buffer. 
- /// TODO merge this logic into readerRead and readerReadVec - pub fn get(self: *Self, limit: usize) Error![]const u8 { - while (true) { - const out = self.hist.readAtMost(limit); - if (out.len > 0) { - self.hasher.update(out); - return out; - } - if (self.state == .end) return out; - try self.step(); - } - } - - fn readerRead( - context: ?*anyopaque, - bw: *std.io.BufferedWriter, - limit: std.io.Reader.Limit, - ) std.io.Reader.RwError!usize { - const self: *Self = @alignCast(@ptrCast(context)); - const out = try bw.writableSliceGreedy(1); - const in = self.get(limit.minInt(out.len)) catch |err| switch (err) { - error.EndOfStream => return error.EndOfStream, - error.ReadFailed => return error.ReadFailed, - else => |e| { - self.read_err = e; - return error.ReadFailed; - }, - }; - if (in.len == 0) return error.EndOfStream; - @memcpy(out[0..in.len], in); - bw.advance(in.len); - return in.len; - } - - fn readerReadVec(context: ?*anyopaque, data: []const []u8) std.io.Reader.Error!usize { - const self: *Self = @alignCast(@ptrCast(context)); - return readVec(self, data) catch |err| switch (err) { - error.EndOfStream => return error.EndOfStream, - error.ReadFailed => return error.ReadFailed, - else => |e| { - self.read_err = e; - return error.ReadFailed; - }, - }; - } - - fn readerDiscard(context: ?*anyopaque, limit: std.io.Reader.Limit) std.io.Reader.Error!usize { - _ = context; - _ = limit; - @panic("TODO"); - } - - pub fn readVec(self: *Self, data: []const []u8) Error!usize { - for (data) |out| { - if (out.len == 0) continue; - const in = try self.get(out.len); - @memcpy(out[0..in.len], in); - if (in.len == 0) return error.EndOfStream; - return in.len; - } - return 0; - } - - pub fn reader(self: *Self) std.io.Reader { - return .{ - .context = self, - .vtable = &.{ - .read = readerRead, - .readVec = readerReadVec, - .discard = readerDiscard, - }, - }; - } - - pub fn readable(self: *Self, buffer: []u8) std.io.BufferedReader { - return reader(self).buffered(buffer); - } - 
}; -} - -test "decompress" { - const cases = [_]struct { - in: []const u8, - out: []const u8, - }{ - // non compressed block (type 0) - .{ - .in = &[_]u8{ - 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen - 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data - }, - .out = "Hello world\n", - }, - // fixed code block (type 1) - .{ - .in = &[_]u8{ - 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1 - 0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00, - }, - .out = "Hello world\n", - }, - // dynamic block (type 2) - .{ - .in = &[_]u8{ - 0x3d, 0xc6, 0x39, 0x11, 0x00, 0x00, 0x0c, 0x02, // deflate data block type 2 - 0x30, 0x2b, 0xb5, 0x52, 0x1e, 0xff, 0x96, 0x38, - 0x16, 0x96, 0x5c, 0x1e, 0x94, 0xcb, 0x6d, 0x01, - }, - .out = "ABCDEABCD ABCDEABCD", - }, - }; - for (cases) |c| { - var fb = std.io.fixedBufferStream(c.in); - var al = std.ArrayList(u8).init(testing.allocator); - defer al.deinit(); - - try decompress(.raw, fb.reader(), al.writer()); - try testing.expectEqualStrings(c.out, al.items); - } -} - -test "gzip decompress" { - const cases = [_]struct { - in: []const u8, - out: []const u8, - }{ - // non compressed block (type 0) - .{ - .in = &[_]u8{ - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, // gzip header (10 bytes) - 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen - 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data - 0xd5, 0xe0, 0x39, 0xb7, // gzip footer: checksum - 0x0c, 0x00, 0x00, 0x00, // gzip footer: size - }, - .out = "Hello world\n", - }, - // fixed code block (type 1) - .{ - .in = &[_]u8{ - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x03, // gzip header (10 bytes) - 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1 - 0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00, - 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00, // gzip footer (chksum, len) - }, - 
.out = "Hello world\n", - }, - // dynamic block (type 2) - .{ - .in = &[_]u8{ - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, // gzip header (10 bytes) - 0x3d, 0xc6, 0x39, 0x11, 0x00, 0x00, 0x0c, 0x02, // deflate data block type 2 - 0x30, 0x2b, 0xb5, 0x52, 0x1e, 0xff, 0x96, 0x38, - 0x16, 0x96, 0x5c, 0x1e, 0x94, 0xcb, 0x6d, 0x01, - 0x17, 0x1c, 0x39, 0xb4, 0x13, 0x00, 0x00, 0x00, // gzip footer (chksum, len) - }, - .out = "ABCDEABCD ABCDEABCD", - }, - // gzip header with name - .{ - .in = &[_]u8{ - 0x1f, 0x8b, 0x08, 0x08, 0xe5, 0x70, 0xb1, 0x65, 0x00, 0x03, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x2e, - 0x74, 0x78, 0x74, 0x00, 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, 0x2f, 0xca, 0x49, 0xe1, - 0x02, 0x00, 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00, - }, - .out = "Hello world\n", - }, - }; - for (cases) |c| { - var fb = std.io.fixedBufferStream(c.in); - var al = std.ArrayList(u8).init(testing.allocator); - defer al.deinit(); - - try decompress(.gzip, fb.reader(), al.writer()); - try testing.expectEqualStrings(c.out, al.items); - } -} - -test "zlib decompress" { - const cases = [_]struct { - in: []const u8, - out: []const u8, - }{ - // non compressed block (type 0) - .{ - .in = &[_]u8{ - 0x78, 0b10_0_11100, // zlib header (2 bytes) - 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen - 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data - 0x1c, 0xf2, 0x04, 0x47, // zlib footer: checksum - }, - .out = "Hello world\n", - }, - }; - for (cases) |c| { - var fb = std.io.fixedBufferStream(c.in); - var al = std.ArrayList(u8).init(testing.allocator); - defer al.deinit(); - - try decompress(.zlib, fb.reader(), al.writer()); - try testing.expectEqualStrings(c.out, al.items); - } -} - -test "fuzzing tests" { - const cases = [_]struct { - input: []const u8, - out: []const u8 = "", - err: ?anyerror = null, - }{ - .{ .input = "deflate-stream", .out = 
@embedFile("testdata/fuzz/deflate-stream.expect") }, // 0 - .{ .input = "empty-distance-alphabet01" }, - .{ .input = "empty-distance-alphabet02" }, - .{ .input = "end-of-stream", .err = error.EndOfStream }, - .{ .input = "invalid-distance", .err = error.InvalidMatch }, - .{ .input = "invalid-tree01", .err = error.IncompleteHuffmanTree }, // 5 - .{ .input = "invalid-tree02", .err = error.IncompleteHuffmanTree }, - .{ .input = "invalid-tree03", .err = error.IncompleteHuffmanTree }, - .{ .input = "lengths-overflow", .err = error.InvalidDynamicBlockHeader }, - .{ .input = "out-of-codes", .err = error.InvalidCode }, - .{ .input = "puff01", .err = error.WrongStoredBlockNlen }, // 10 - .{ .input = "puff02", .err = error.EndOfStream }, - .{ .input = "puff03", .out = &[_]u8{0xa} }, - .{ .input = "puff04", .err = error.InvalidCode }, - .{ .input = "puff05", .err = error.EndOfStream }, - .{ .input = "puff06", .err = error.EndOfStream }, - .{ .input = "puff08", .err = error.InvalidCode }, - .{ .input = "puff09", .out = "P" }, - .{ .input = "puff10", .err = error.InvalidCode }, - .{ .input = "puff11", .err = error.InvalidMatch }, - .{ .input = "puff12", .err = error.InvalidDynamicBlockHeader }, // 20 - .{ .input = "puff13", .err = error.IncompleteHuffmanTree }, - .{ .input = "puff14", .err = error.EndOfStream }, - .{ .input = "puff15", .err = error.IncompleteHuffmanTree }, - .{ .input = "puff16", .err = error.InvalidDynamicBlockHeader }, - .{ .input = "puff17", .err = error.MissingEndOfBlockCode }, // 25 - .{ .input = "fuzz1", .err = error.InvalidDynamicBlockHeader }, - .{ .input = "fuzz2", .err = error.InvalidDynamicBlockHeader }, - .{ .input = "fuzz3", .err = error.InvalidMatch }, - .{ .input = "fuzz4", .err = error.OversubscribedHuffmanTree }, - .{ .input = "puff18", .err = error.OversubscribedHuffmanTree }, // 30 - .{ .input = "puff19", .err = error.OversubscribedHuffmanTree }, - .{ .input = "puff20", .err = error.OversubscribedHuffmanTree }, - .{ .input = "puff21", .err = 
error.OversubscribedHuffmanTree }, - .{ .input = "puff22", .err = error.OversubscribedHuffmanTree }, - .{ .input = "puff23", .err = error.OversubscribedHuffmanTree }, // 35 - .{ .input = "puff24", .err = error.IncompleteHuffmanTree }, - .{ .input = "puff25", .err = error.OversubscribedHuffmanTree }, - .{ .input = "puff26", .err = error.InvalidDynamicBlockHeader }, - .{ .input = "puff27", .err = error.InvalidDynamicBlockHeader }, - }; - - inline for (cases, 0..) |c, case_no| { - var in = std.io.fixedBufferStream(@embedFile("testdata/fuzz/" ++ c.input ++ ".input")); - var out = std.ArrayList(u8).init(testing.allocator); - defer out.deinit(); - errdefer std.debug.print("test case failed {}\n", .{case_no}); - - if (c.err) |expected_err| { - try testing.expectError(expected_err, decompress(.raw, in.reader(), out.writer())); - } else { - try decompress(.raw, in.reader(), out.writer()); - try testing.expectEqualStrings(c.out, out.items); - } - } -} - -test "bug 18966" { - const input = @embedFile("testdata/fuzz/bug_18966.input"); - const expect = @embedFile("testdata/fuzz/bug_18966.expect"); - - var in = std.io.fixedBufferStream(input); - var out = std.ArrayList(u8).init(testing.allocator); - defer out.deinit(); - - try decompress(.gzip, in.reader(), out.writer()); - try testing.expectEqualStrings(expect, out.items); -} - -test "bug 19895" { - const input = &[_]u8{ - 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen - 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data - }; - var in = std.io.fixedBufferStream(input); - var decomp = decompressor(.raw, in.reader()); - var buf: [0]u8 = undefined; - try testing.expectEqual(0, try decomp.read(&buf)); -} - -/// Bit reader used during inflate (decompression). Has internal buffer of 64 -/// bits which shifts right after bits are consumed. Uses forward_reader to fill -/// that internal buffer when needed. -/// -/// readF is the core function. 
Supports few different ways of getting bits -/// controlled by flags. In hot path we try to avoid checking whether we need to -/// fill buffer from forward_reader by calling fill in advance and readF with -/// buffered flag set. -/// -pub fn BitReader(comptime T: type) type { - assert(T == u32 or T == u64); - const t_bytes: usize = @sizeOf(T); - const Tshift = if (T == u64) u6 else u5; - - return struct { - // Underlying reader used for filling internal bits buffer - forward_reader: *std.io.BufferedReader, - // Internal buffer of 64 bits - bits: T = 0, - // Number of bits in the buffer - nbits: u32 = 0, - - const Self = @This(); - - pub const Flags = packed struct(u3) { - /// dont advance internal buffer, just get bits, leave them in buffer - peek: bool = false, - /// assume that there is no need to fill, fill should be called before - buffered: bool = false, - /// bit reverse read bits - reverse: bool = false, - - /// work around https://github.com/ziglang/zig/issues/18882 - pub inline fn toInt(f: Flags) u3 { - return @bitCast(f); - } - }; - - pub fn init(forward_reader: *std.io.BufferedReader) Self { - var self = Self{ .forward_reader = forward_reader }; - self.fill(1) catch {}; - return self; - } - - /// Try to have `nice` bits are available in buffer. Reads from - /// forward reader if there is no `nice` bits in buffer. Returns error - /// if end of forward stream is reached and internal buffer is empty. - /// It will not error if less than `nice` bits are in buffer, only when - /// all bits are exhausted. During inflate we usually know what is the - /// maximum bits for the next step but usually that step will need less - /// bits to decode. So `nice` is not hard limit, it will just try to have - /// that number of bits available. If end of forward stream is reached - /// it may be some extra zero bits in buffer. 
- pub fn fill(self: *Self, nice: u6) !void { - if (self.nbits >= nice and nice != 0) { - return; // We have enough bits - } - // Read more bits from forward reader - - // Number of empty bytes in bits, round nbits to whole bytes. - const empty_bytes = - @as(u8, if (self.nbits & 0x7 == 0) t_bytes else t_bytes - 1) - // 8 for 8, 16, 24..., 7 otherwise - (self.nbits >> 3); // 0 for 0-7, 1 for 8-16, ... same as / 8 - - var buf: [t_bytes]u8 = [_]u8{0} ** t_bytes; - const bytes_read = self.forward_reader.readSliceShort(buf[0..empty_bytes]) catch 0; - if (bytes_read > 0) { - const u: T = std.mem.readInt(T, buf[0..t_bytes], .little); - self.bits |= u << @as(Tshift, @intCast(self.nbits)); - self.nbits += 8 * @as(u8, @intCast(bytes_read)); - return; - } - - if (self.nbits == 0) - return error.EndOfStream; - } - - /// Read exactly buf.len bytes into buf. - pub fn readAll(self: *Self, buf: []u8) std.io.Reader.Error!void { - assert(self.alignBits() == 0); // internal bits must be at byte boundary - - // First read from internal bits buffer. - var n: usize = 0; - while (self.nbits > 0 and n < buf.len) { - buf[n] = try self.readF(u8, .{ .buffered = true }); - n += 1; - } - // Then use forward reader for all other bytes. - try self.forward_reader.readSlice(buf[n..]); - } - - /// Alias for readF(U, 0). - pub fn read(self: *Self, comptime U: type) !U { - return self.readF(U, .{}); - } - - /// Alias for readF with flag.peak set. - pub inline fn peekF(self: *Self, comptime U: type, comptime how: Flags) !U { - return self.readF(U, .{ - .peek = true, - .buffered = how.buffered, - .reverse = how.reverse, - }); - } - - /// Read with flags provided. 
- pub fn readF(self: *Self, comptime U: type, comptime how: Flags) !U { - if (U == T) { - assert(how.toInt() == 0); - assert(self.alignBits() == 0); - try self.fill(@bitSizeOf(T)); - if (self.nbits != @bitSizeOf(T)) return error.EndOfStream; - const v = self.bits; - self.nbits = 0; - self.bits = 0; - return v; - } - const n: Tshift = @bitSizeOf(U); - // work around https://github.com/ziglang/zig/issues/18882 - switch (how.toInt()) { - @as(Flags, .{}).toInt() => { // `normal` read - try self.fill(n); // ensure that there are n bits in the buffer - const u: U = @truncate(self.bits); // get n bits - try self.shift(n); // advance buffer for n - return u; - }, - @as(Flags, .{ .peek = true }).toInt() => { // no shift, leave bits in the buffer - try self.fill(n); - return @truncate(self.bits); - }, - @as(Flags, .{ .buffered = true }).toInt() => { // no fill, assume that buffer has enough bits - const u: U = @truncate(self.bits); - try self.shift(n); - return u; - }, - @as(Flags, .{ .reverse = true }).toInt() => { // same as 0 with bit reverse - try self.fill(n); - const u: U = @truncate(self.bits); - try self.shift(n); - return @bitReverse(u); - }, - @as(Flags, .{ .peek = true, .reverse = true }).toInt() => { - try self.fill(n); - return @bitReverse(@as(U, @truncate(self.bits))); - }, - @as(Flags, .{ .buffered = true, .reverse = true }).toInt() => { - const u: U = @truncate(self.bits); - try self.shift(n); - return @bitReverse(u); - }, - @as(Flags, .{ .peek = true, .buffered = true }).toInt() => { - return @truncate(self.bits); - }, - @as(Flags, .{ .peek = true, .buffered = true, .reverse = true }).toInt() => { - return @bitReverse(@as(U, @truncate(self.bits))); - }, - } - } - - /// Read n number of bits. - /// Only buffered flag can be used in how. 
- pub fn readN(self: *Self, n: u4, comptime how: Flags) !u16 { - // work around https://github.com/ziglang/zig/issues/18882 - switch (how.toInt()) { - @as(Flags, .{}).toInt() => { - try self.fill(n); - }, - @as(Flags, .{ .buffered = true }).toInt() => {}, - else => unreachable, - } - const mask: u16 = (@as(u16, 1) << n) - 1; - const u: u16 = @as(u16, @truncate(self.bits)) & mask; - try self.shift(n); - return u; - } - - /// Advance buffer for n bits. - pub fn shift(self: *Self, n: Tshift) !void { - if (n > self.nbits) return error.EndOfStream; - self.bits >>= n; - self.nbits -= n; - } - - /// Skip n bytes. - pub fn skipBytes(self: *Self, n: u16) !void { - for (0..n) |_| { - try self.fill(8); - try self.shift(8); - } - } - - // Number of bits to align stream to the byte boundary. - fn alignBits(self: *Self) u3 { - return @intCast(self.nbits & 0x7); - } - - /// Align stream to the byte boundary. - pub fn alignToByte(self: *Self) void { - const ab = self.alignBits(); - if (ab > 0) self.shift(ab) catch unreachable; - } - - /// Skip zero terminated string. - pub fn skipStringZ(self: *Self) !void { - while (true) { - if (try self.readF(u8, .{}) == 0) break; - } - } - - /// Read deflate fixed fixed code. - /// Reads first 7 bits, and then maybe 1 or 2 more to get full 7,8 or 9 bit code. 
- /// ref: https://datatracker.ietf.org/doc/html/rfc1951#page-12 - /// Lit Value Bits Codes - /// --------- ---- ----- - /// 0 - 143 8 00110000 through - /// 10111111 - /// 144 - 255 9 110010000 through - /// 111111111 - /// 256 - 279 7 0000000 through - /// 0010111 - /// 280 - 287 8 11000000 through - /// 11000111 - pub fn readFixedCode(self: *Self) !u16 { - try self.fill(7 + 2); - const code7 = try self.readF(u7, .{ .buffered = true, .reverse = true }); - if (code7 <= 0b0010_111) { // 7 bits, 256-279, codes 0000_000 - 0010_111 - return @as(u16, code7) + 256; - } else if (code7 <= 0b1011_111) { // 8 bits, 0-143, codes 0011_0000 through 1011_1111 - return (@as(u16, code7) << 1) + @as(u16, try self.readF(u1, .{ .buffered = true })) - 0b0011_0000; - } else if (code7 <= 0b1100_011) { // 8 bit, 280-287, codes 1100_0000 - 1100_0111 - return (@as(u16, code7 - 0b1100000) << 1) + try self.readF(u1, .{ .buffered = true }) + 280; - } else { // 9 bit, 144-255, codes 1_1001_0000 - 1_1111_1111 - return (@as(u16, code7 - 0b1100_100) << 2) + @as(u16, try self.readF(u2, .{ .buffered = true, .reverse = true })) + 144; - } - } - }; -} - -test "readF" { - var input: std.io.BufferedReader = undefined; - input.initFixed(&[_]u8{ 0xf3, 0x48, 0xcd, 0xc9, 0x00, 0x00 }); - var br: BitReader(u64) = .init(&input); - - try testing.expectEqual(@as(u8, 48), br.nbits); - try testing.expectEqual(@as(u64, 0xc9cd48f3), br.bits); - - try testing.expect(try br.readF(u1, 0) == 0b0000_0001); - try testing.expect(try br.readF(u2, 0) == 0b0000_0001); - try testing.expectEqual(@as(u8, 48 - 3), br.nbits); - try testing.expectEqual(@as(u3, 5), br.alignBits()); - - try testing.expect(try br.readF(u8, .{ .peek = true }) == 0b0001_1110); - try testing.expect(try br.readF(u9, .{ .peek = true }) == 0b1_0001_1110); - try br.shift(9); - try testing.expectEqual(@as(u8, 36), br.nbits); - try testing.expectEqual(@as(u3, 4), br.alignBits()); - - try testing.expect(try br.readF(u4, 0) == 0b0100); - try 
testing.expectEqual(@as(u8, 32), br.nbits); - try testing.expectEqual(@as(u3, 0), br.alignBits()); - - try br.shift(1); - try testing.expectEqual(@as(u3, 7), br.alignBits()); - try br.shift(1); - try testing.expectEqual(@as(u3, 6), br.alignBits()); - br.alignToByte(); - try testing.expectEqual(@as(u3, 0), br.alignBits()); - - try testing.expectEqual(@as(u64, 0xc9), br.bits); - try testing.expectEqual(@as(u16, 0x9), try br.readN(4, 0)); - try testing.expectEqual(@as(u16, 0xc), try br.readN(4, 0)); -} - -test "read block type 1 data" { - inline for ([_]type{ u64, u32 }) |T| { - const data = [_]u8{ - 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1 - 0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00, - 0x0c, 0x01, 0x02, 0x03, // - 0xaa, 0xbb, 0xcc, 0xdd, - }; - var fbs: std.io.BufferedReader = undefined; - fbs.initFixed(&data); - var br: BitReader(T) = .init(&fbs); - - try testing.expectEqual(@as(u1, 1), try br.readF(u1, 0)); // bfinal - try testing.expectEqual(@as(u2, 1), try br.readF(u2, 0)); // block_type - - for ("Hello world\n") |c| { - try testing.expectEqual(@as(u8, c), try br.readF(u8, .{ .reverse = true }) - 0x30); - } - try testing.expectEqual(@as(u7, 0), try br.readF(u7, 0)); // end of block - br.alignToByte(); - try testing.expectEqual(@as(u32, 0x0302010c), try br.readF(u32, 0)); - try testing.expectEqual(@as(u16, 0xbbaa), try br.readF(u16, 0)); - try testing.expectEqual(@as(u16, 0xddcc), try br.readF(u16, 0)); - } -} - -test "shift/fill" { - const data = [_]u8{ - 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, - 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, - }; - var fbs: std.io.BufferedReader = undefined; - fbs.initFixed(&data); - var br: BitReader(u64) = .init(&fbs); - - try testing.expectEqual(@as(u64, 0x08_07_06_05_04_03_02_01), br.bits); - try br.shift(8); - try testing.expectEqual(@as(u64, 0x00_08_07_06_05_04_03_02), br.bits); - try br.fill(60); // fill with 1 byte - try testing.expectEqual(@as(u64, 0x01_08_07_06_05_04_03_02), 
br.bits); - try br.shift(8 * 4 + 4); - try testing.expectEqual(@as(u64, 0x00_00_00_00_00_10_80_70), br.bits); - - try br.fill(60); // fill with 4 bytes (shift by 4) - try testing.expectEqual(@as(u64, 0x00_50_40_30_20_10_80_70), br.bits); - try testing.expectEqual(@as(u8, 8 * 7 + 4), br.nbits); - - try br.shift(@intCast(br.nbits)); // clear buffer - try br.fill(8); // refill with the rest of the bytes - try testing.expectEqual(@as(u64, 0x00_00_00_00_00_08_07_06), br.bits); -} - -test "readAll" { - inline for ([_]type{ u64, u32 }) |T| { - const data = [_]u8{ - 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, - 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, - }; - var fbs: std.io.BufferedReader = undefined; - fbs.initFixed(&data); - var br: BitReader(T) = .init(&fbs); - - switch (T) { - u64 => try testing.expectEqual(@as(u64, 0x08_07_06_05_04_03_02_01), br.bits), - u32 => try testing.expectEqual(@as(u32, 0x04_03_02_01), br.bits), - else => unreachable, - } - - var out: [16]u8 = undefined; - try br.readAll(out[0..]); - try testing.expect(br.nbits == 0); - try testing.expect(br.bits == 0); - - try testing.expectEqualSlices(u8, data[0..16], &out); - } -} - -test "readFixedCode" { - inline for ([_]type{ u64, u32 }) |T| { - const fixed_codes = @import("huffman_encoder.zig").fixed_codes; - - var fbs: std.io.BufferedReader = undefined; - fbs.initFixed(&fixed_codes); - var rdr: BitReader(T) = .init(&fbs); - - for (0..286) |c| { - try testing.expectEqual(c, try rdr.readFixedCode()); - } - try testing.expect(rdr.nbits == 0); - } -} - -test "u32 leaves no bits on u32 reads" { - const data = [_]u8{ - 0xff, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - }; - var fbs: std.io.BufferedReader = undefined; - fbs.initFixed(&data); - var br: BitReader(u32) = .init(&fbs); - - _ = try br.read(u3); - try testing.expectEqual(29, br.nbits); - br.alignToByte(); - try testing.expectEqual(24, br.nbits); - try testing.expectEqual(0x04_03_02_01, try 
br.read(u32)); - try testing.expectEqual(0, br.nbits); - try testing.expectEqual(0x08_07_06_05, try br.read(u32)); - try testing.expectEqual(0, br.nbits); - - _ = try br.read(u9); - try testing.expectEqual(23, br.nbits); - br.alignToByte(); - try testing.expectEqual(16, br.nbits); - try testing.expectEqual(0x0e_0d_0c_0b, try br.read(u32)); - try testing.expectEqual(0, br.nbits); -} - -test "u64 need fill after alignToByte" { - const data = [_]u8{ - 0xff, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - }; - - // without fill - var fbs: std.io.BufferedReader = undefined; - fbs.initFixed(&data); - var br: BitReader(u64) = .init(&fbs); - _ = try br.read(u23); - try testing.expectEqual(41, br.nbits); - br.alignToByte(); - try testing.expectEqual(40, br.nbits); - try testing.expectEqual(0x06_05_04_03, try br.read(u32)); - try testing.expectEqual(8, br.nbits); - try testing.expectEqual(0x0a_09_08_07, try br.read(u32)); - try testing.expectEqual(32, br.nbits); - - // fill after align ensures all bits filled - fbs.reset(); - br = .init(&fbs); - _ = try br.read(u23); - try testing.expectEqual(41, br.nbits); - br.alignToByte(); - try br.fill(0); - try testing.expectEqual(64, br.nbits); - try testing.expectEqual(0x06_05_04_03, try br.read(u32)); - try testing.expectEqual(32, br.nbits); - try testing.expectEqual(0x0a_09_08_07, try br.read(u32)); - try testing.expectEqual(0, br.nbits); -} diff --git a/lib/std/compress/gzip.zig b/lib/std/compress/gzip.zig deleted file mode 100644 index fd5c706023..0000000000 --- a/lib/std/compress/gzip.zig +++ /dev/null @@ -1,39 +0,0 @@ -const std = @import("../std.zig"); -const deflate = @import("flate/deflate.zig"); -const inflate = @import("flate/inflate.zig"); - -/// Decompress compressed data from reader and write plain data to the writer. 
-pub fn decompress(reader: *std.io.BufferedReader, writer: *std.io.BufferedWriter) !void { - try inflate.decompress(.gzip, reader, writer); -} - -pub const Decompressor = inflate.Decompressor(.gzip); - -/// Compression level, trades between speed and compression size. -pub const Options = deflate.Options; - -/// Compress plain data from reader and write compressed data to the writer. -pub fn compress(reader: *std.io.BufferedReader, writer: *std.io.BufferedWriter, options: Options) !void { - try deflate.compress(.gzip, reader, writer, options); -} - -pub const Compressor = deflate.Compressor(.gzip); - -/// Huffman only compression. Without Lempel-Ziv match searching. Faster -/// compression, less memory requirements but bigger compressed sizes. -pub const huffman = struct { - pub fn compress(reader: *std.io.BufferedReader, writer: *std.io.BufferedWriter) !void { - try deflate.huffman.compress(.gzip, reader, writer); - } - - pub const Compressor = deflate.huffman.Compressor(.gzip); -}; - -// No compression store only. Compressed size is slightly bigger than plain. -pub const store = struct { - pub fn compress(reader: *std.io.BufferedReader, writer: *std.io.BufferedWriter) !void { - try deflate.store.compress(.gzip, reader, writer); - } - - pub const Compressor = deflate.store.Compressor(.gzip); -}; diff --git a/lib/std/compress/zlib.zig b/lib/std/compress/zlib.zig deleted file mode 100644 index 35fede8eeb..0000000000 --- a/lib/std/compress/zlib.zig +++ /dev/null @@ -1,77 +0,0 @@ -const std = @import("../std.zig"); -const deflate = @import("flate/deflate.zig"); -const inflate = @import("flate/inflate.zig"); - -/// When decompressing, the output buffer is used as the history window, so -/// less than this may result in failure to decompress streams that were -/// compressed with a larger window. -pub const max_window_len = std.compress.flate.max_window_len; - -/// Decompress compressed data from reader and write plain data to the writer. 
-pub fn decompress(reader: *std.io.BufferedReader, writer: *std.io.BufferedWriter) !void { - try inflate.decompress(.zlib, reader, writer); -} - -pub const Decompressor = inflate.Decompressor(.zlib); - -/// Compression level, trades between speed and compression size. -pub const Options = deflate.Options; - -/// Compress plain data from reader and write compressed data to the writer. -pub fn compress(reader: *std.io.BufferedReader, writer: *std.io.BufferedWriter, options: Options) !void { - try deflate.compress(.zlib, reader, writer, options); -} - -pub const Compressor = deflate.Compressor(.zlib); - -/// Huffman only compression. Without Lempel-Ziv match searching. Faster -/// compression, less memory requirements but bigger compressed sizes. -pub const huffman = struct { - pub fn compress(reader: *std.io.BufferedReader, writer: *std.io.BufferedWriter) !void { - try deflate.huffman.compress(.zlib, reader, writer); - } - - pub const Compressor = deflate.huffman.Compressor(.zlib); -}; - -// No compression store only. Compressed size is slightly bigger than plain. -pub const store = struct { - pub fn compress(reader: *std.io.BufferedReader, writer: *std.io.BufferedWriter) !void { - try deflate.store.compress(.zlib, reader, writer); - } - - pub const Compressor = deflate.store.Compressor(.zlib); -}; - -test "should not overshoot" { - // Compressed zlib data with extra 4 bytes at the end. 
- const data = [_]u8{ - 0x78, 0x9c, 0x73, 0xce, 0x2f, 0xa8, 0x2c, 0xca, 0x4c, 0xcf, 0x28, 0x51, 0x08, 0xcf, 0xcc, 0xc9, - 0x49, 0xcd, 0x55, 0x28, 0x4b, 0xcc, 0x53, 0x08, 0x4e, 0xce, 0x48, 0xcc, 0xcc, 0xd6, 0x51, 0x08, - 0xce, 0xcc, 0x4b, 0x4f, 0x2c, 0xc8, 0x2f, 0x4a, 0x55, 0x30, 0xb4, 0xb4, 0x34, 0xd5, 0xb5, 0x34, - 0x03, 0x00, 0x8b, 0x61, 0x0f, 0xa4, 0x52, 0x5a, 0x94, 0x12, - }; - - var stream = std.io.fixedBufferStream(data[0..]); - const reader = stream.reader(); - - var dcp = Decompressor.init(reader); - var out: [128]u8 = undefined; - - // Decompress - var n = try dcp.reader().readAll(out[0..]); - - // Expected decompressed data - try std.testing.expectEqual(46, n); - try std.testing.expectEqualStrings("Copyright Willem van Schaik, Singapore 1995-96", out[0..n]); - - // Decompressor don't overshoot underlying reader. - // It is leaving it at the end of compressed data chunk. - try std.testing.expectEqual(data.len - 4, stream.getPos()); - try std.testing.expectEqual(0, dcp.unreadBytes()); - - // 4 bytes after compressed chunk are available in reader. - n = try reader.readAll(out[0..]); - try std.testing.expectEqual(n, 4); - try std.testing.expectEqualSlices(u8, data[data.len - 4 .. data.len], out[0..n]); -}