mirror of
https://github.com/ziglang/zig.git
synced 2025-12-06 06:13:07 +00:00
Zig deflate compression/decompression implementation. It supports compression and decompression of gzip, zlib and raw deflate format. Fixes #18062. This PR replaces current compress/gzip and compress/zlib packages. Deflate package is renamed to flate. Flate is common name for deflate/inflate where deflate is compression and inflate decompression. There are breaking changes. Method signatures are changed because of removal of the allocator, and I also unified API for all three namespaces (flate, gzip, zlib). Currently I put old packages under v1 namespace they are still available as compress/v1/gzip, compress/v1/zlib, compress/v1/deflate. Idea is to give users of the current API little time to postpone analyzing what they had to change. Although that raises the question of when it is safe to remove that v1 namespace. Here is current API in the compress package: ```Zig // deflate fn compressor(allocator, writer, options) !Compressor(@TypeOf(writer)) fn Compressor(comptime WriterType) type fn decompressor(allocator, reader, null) !Decompressor(@TypeOf(reader)) fn Decompressor(comptime ReaderType: type) type // gzip fn compress(allocator, writer, options) !Compress(@TypeOf(writer)) fn Compress(comptime WriterType: type) type fn decompress(allocator, reader) !Decompress(@TypeOf(reader)) fn Decompress(comptime ReaderType: type) type // zlib fn compressStream(allocator, writer, options) !CompressStream(@TypeOf(writer)) fn CompressStream(comptime WriterType: type) type fn decompressStream(allocator, reader) !DecompressStream(@TypeOf(reader)) fn DecompressStream(comptime ReaderType: type) type // xz fn decompress(allocator: Allocator, reader: anytype) !Decompress(@TypeOf(reader)) fn Decompress(comptime ReaderType: type) type // lzma fn decompress(allocator, reader) !Decompress(@TypeOf(reader)) fn Decompress(comptime ReaderType: type) type // lzma2 fn decompress(allocator, reader, writer) !void // zstandard: fn DecompressStream(ReaderType, options) type fn decompressStream(allocator, 
reader) DecompressStream(@TypeOf(reader), .{}) struct decompress ``` The proposed naming convention: - Compressor/Decompressor for functions which return type, like Reader/Writer/GeneralPurposeAllocator - compressor/decompressor for functions which are initializers for that type, like reader/writer/allocator - compress/decompress for one shot operations, accepts reader/writer pair, like read/write/alloc ```Zig /// Compress from reader and write compressed data to the writer. fn compress(reader: anytype, writer: anytype, options: Options) !void /// Create Compressor which outputs to the writer. fn compressor(writer: anytype, options: Options) !Compressor(@TypeOf(writer)) /// Compressor type fn Compressor(comptime WriterType: type) type /// Decompress from reader and write plain data to the writer. fn decompress(reader: anytype, writer: anytype) !void /// Create Decompressor which reads from reader. fn decompressor(reader: anytype) Decompressor(@TypeOf(reader)) /// Decompressor type fn Decompressor(comptime ReaderType: type) type ``` Comparing this implementation with the one we currently have in Zig's standard library (std). Std is roughly 1.2-1.4 times slower in decompression, and 1.1-1.2 times slower in compression. Compressed sizes are pretty much same in both cases. More results in [this](https://github.com/ianic/flate) repo. This library uses static allocations for all structures, doesn't require allocator. That makes sense especially for deflate where all structures, internal buffers are allocated to the full size. Little less for inflate where the std version uses less memory by not preallocating to theoretical max size array which are usually not fully used. For deflate this library allocates 395K while std 779K. For inflate this library allocates 74.5K while std around 36K. Inflate difference is because we here use 64K history instead of 32K in std. If merged existing usage of compress gzip/zlib/deflate need some changes. 
Here is example with necessary changes in comments: ```Zig const std = @import("std"); // To get this file: // wget -nc -O war_and_peace.txt https://www.gutenberg.org/ebooks/2600.txt.utf-8 const data = @embedFile("war_and_peace.txt"); pub fn main() !void { var gpa = std.heap.GeneralPurposeAllocator(.{}){}; defer std.debug.assert(gpa.deinit() == .ok); const allocator = gpa.allocator(); try oldDeflate(allocator); try new(std.compress.flate, allocator); try oldZlib(allocator); try new(std.compress.zlib, allocator); try oldGzip(allocator); try new(std.compress.gzip, allocator); } pub fn new(comptime pkg: type, allocator: std.mem.Allocator) !void { var buf = std.ArrayList(u8).init(allocator); defer buf.deinit(); // Compressor var cmp = try pkg.compressor(buf.writer(), .{}); _ = try cmp.write(data); try cmp.finish(); var fbs = std.io.fixedBufferStream(buf.items); // Decompressor var dcp = pkg.decompressor(fbs.reader()); const plain = try dcp.reader().readAllAlloc(allocator, std.math.maxInt(usize)); defer allocator.free(plain); try std.testing.expectEqualSlices(u8, data, plain); } pub fn oldDeflate(allocator: std.mem.Allocator) !void { const deflate = std.compress.v1.deflate; // Compressor var buf = std.ArrayList(u8).init(allocator); defer buf.deinit(); // Remove allocator // Rename deflate -> flate var cmp = try deflate.compressor(allocator, buf.writer(), .{}); _ = try cmp.write(data); try cmp.close(); // Rename to finish cmp.deinit(); // Remove // Decompressor var fbs = std.io.fixedBufferStream(buf.items); // Remove allocator and last param // Rename deflate -> flate // Remove try var dcp = try deflate.decompressor(allocator, fbs.reader(), null); defer dcp.deinit(); // Remove const plain = try dcp.reader().readAllAlloc(allocator, std.math.maxInt(usize)); defer allocator.free(plain); try std.testing.expectEqualSlices(u8, data, plain); } pub fn oldZlib(allocator: std.mem.Allocator) !void { const zlib = std.compress.v1.zlib; var buf = std.ArrayList(u8).init(allocator); 
defer buf.deinit(); // Compressor // Rename compressStream => compressor // Remove allocator var cmp = try zlib.compressStream(allocator, buf.writer(), .{}); _ = try cmp.write(data); try cmp.finish(); cmp.deinit(); // Remove var fbs = std.io.fixedBufferStream(buf.items); // Decompressor // decompressStream => decompressor // Remove allocator // Remove try var dcp = try zlib.decompressStream(allocator, fbs.reader()); defer dcp.deinit(); // Remove const plain = try dcp.reader().readAllAlloc(allocator, std.math.maxInt(usize)); defer allocator.free(plain); try std.testing.expectEqualSlices(u8, data, plain); } pub fn oldGzip(allocator: std.mem.Allocator) !void { const gzip = std.compress.v1.gzip; var buf = std.ArrayList(u8).init(allocator); defer buf.deinit(); // Compressor // Rename compress => compressor // Remove allocator var cmp = try gzip.compress(allocator, buf.writer(), .{}); _ = try cmp.write(data); try cmp.close(); // Rename to finish cmp.deinit(); // Remove var fbs = std.io.fixedBufferStream(buf.items); // Decompressor // Rename decompress => decompressor // Remove allocator // Remove try var dcp = try gzip.decompress(allocator, fbs.reader()); defer dcp.deinit(); // Remove const plain = try dcp.reader().readAllAlloc(allocator, std.math.maxInt(usize)); defer allocator.free(plain); try std.testing.expectEqualSlices(u8, data, plain); } ```
334 lines
13 KiB
Zig
334 lines
13 KiB
Zig
const std = @import("std");
|
|
const assert = std.debug.assert;
|
|
const testing = std.testing;
|
|
|
|
/// Creates a `BitReader` that pulls its bytes from `reader`. The concrete
/// bit reader type is derived from the type of the `reader` argument.
pub fn bitReader(reader: anytype) BitReader(@TypeOf(reader)) {
    const Reader = BitReader(@TypeOf(reader));
    return Reader.init(reader);
}
|
|
|
|
/// Bit reader used during inflate (decompression). Has an internal buffer of
/// 64 bits which shifts right after bits are consumed. Uses `forward_reader`
/// to fill that internal buffer when needed.
///
/// `readF` is the core function. It supports a few different ways of getting
/// bits, controlled by flags. In the hot path we try to avoid checking whether
/// we need to fill the buffer from `forward_reader` by calling `fill` in
/// advance and then `readF` with the buffered flag set.
pub fn BitReader(comptime ReaderType: type) type {
    return struct {
        /// Underlying reader used for filling the internal bits buffer.
        forward_reader: ReaderType = undefined,
        /// Internal buffer of 64 bits. The next bit to be consumed is the
        /// least significant one.
        bits: u64 = 0,
        /// Number of valid bits in `bits`.
        nbits: u32 = 0,

        const Self = @This();

        pub const Error = ReaderType.Error || error{EndOfStream};

        /// Creates a bit reader on top of `rdr` and eagerly pre-fills the
        /// internal buffer. A failure of that first fill is ignored here; an
        /// empty stream surfaces as `error.EndOfStream` on the first read.
        pub fn init(rdr: ReaderType) Self {
            var self = Self{ .forward_reader = rdr };
            self.fill(1) catch {};
            return self;
        }

        /// Tries to have at least `nice` bits available in the buffer. Reads
        /// from the forward reader if there are fewer than `nice` bits
        /// buffered. Returns `error.EndOfStream` only when the end of the
        /// forward stream is reached and the internal buffer is empty; it does
        /// not error if fewer than `nice` bits end up in the buffer.
        ///
        /// During inflate we usually know the maximum number of bits for the
        /// next step, but usually that step will need fewer bits to decode. So
        /// `nice` is not a hard limit, the function just tries to have that
        /// number of bits available. If the end of the forward stream is
        /// reached there may be some extra zero bits in the buffer.
        pub inline fn fill(self: *Self, nice: u6) !void {
            if (self.nbits >= nice) {
                return; // We have enough bits.
            }
            // Read more bits from the forward reader.

            // Number of empty whole bytes in `bits`, rounding nbits up to
            // whole bytes.
            const empty_bytes =
                @as(u8, if (self.nbits & 0x7 == 0) 8 else 7) - // 8 for 0, 8, 16, 24..., 7 otherwise
                (self.nbits >> 3); // 0 for 0-7, 1 for 8-15, ... same as / 8

            var buf: [8]u8 = [_]u8{0} ** 8;
            // `readAll` keeps reading until the slice is full or the stream
            // ends. A plain `read` may return a short count before the end of
            // the stream, which would leave fewer than `nice` bits buffered
            // and make the following `shift` fail with a spurious EndOfStream.
            // NOTE: errors of the forward reader are treated like end of
            // stream here (nothing is added to the buffer).
            const bytes_read = self.forward_reader.readAll(buf[0..empty_bytes]) catch 0;
            if (bytes_read > 0) {
                // Unread tail of `buf` is zero, so it contributes no bits.
                const u: u64 = std.mem.readInt(u64, buf[0..8], .little);
                self.bits |= u << @as(u6, @intCast(self.nbits));
                self.nbits += 8 * @as(u8, @intCast(bytes_read));
                return;
            }

            if (self.nbits == 0)
                return error.EndOfStream;
        }

        /// Reads exactly `buf.len` bytes into `buf`. The internal bit buffer
        /// must be at a byte boundary (asserted). Bytes still held in the
        /// internal buffer are consumed first, the remainder is read directly
        /// from the forward reader with `readNoEof`.
        pub fn readAll(self: *Self, buf: []u8) !void {
            assert(self.alignBits() == 0); // internal bits must be at byte boundary

            // First read from the internal bits buffer.
            var n: usize = 0;
            while (self.nbits > 0 and n < buf.len) {
                buf[n] = try self.readF(u8, flag.buffered);
                n += 1;
            }
            // Then use the forward reader for all other bytes.
            try self.forward_reader.readNoEof(buf[n..]);
        }

        /// Flags accepted by `readF`/`peekF`; they can be combined with `|`.
        pub const flag = struct {
            pub const peek: u3 = 0b001; // don't advance internal buffer, just get bits, leave them in buffer
            pub const buffered: u3 = 0b010; // assume that there is no need to fill, fill should be called before
            pub const reverse: u3 = 0b100; // bit reverse the bits that were read
        };

        /// Alias for `readF(U, 0)`.
        pub fn read(self: *Self, comptime U: type) !U {
            return self.readF(U, 0);
        }

        /// Alias for `readF` with `flag.peek` set.
        pub inline fn peekF(self: *Self, comptime U: type, comptime how: u3) !U {
            return self.readF(U, how | flag.peek);
        }

        /// Reads `@bitSizeOf(U)` bits as a `U`, with behavior selected at
        /// comptime by `how` (any combination of the `flag` constants):
        ///  - without `flag.buffered` the buffer is filled first,
        ///  - without `flag.peek` the bits are consumed from the buffer,
        ///  - with `flag.reverse` the returned value is bit reversed.
        pub fn readF(self: *Self, comptime U: type, comptime how: u3) !U {
            const n: u6 = @bitSizeOf(U);
            // All conditions below are comptime known, so every instantiation
            // contains only the steps selected by its flags.
            if ((how & flag.buffered) == 0) {
                try self.fill(n); // ensure that there are n bits in the buffer
            }
            const u: U = @truncate(self.bits); // get n bits
            if ((how & flag.peek) == 0) {
                try self.shift(n); // advance buffer for n
            }
            return if ((how & flag.reverse) != 0) @bitReverse(u) else u;
        }

        /// Reads `n` bits and returns them in the low bits of the result.
        /// Only the buffered flag (or 0) can be used in `how`.
        pub fn readN(self: *Self, n: u4, comptime how: u3) !u16 {
            switch (how) {
                0 => {
                    try self.fill(n);
                },
                flag.buffered => {},
                else => unreachable,
            }
            const mask: u16 = (@as(u16, 1) << n) - 1;
            const u: u16 = @as(u16, @truncate(self.bits)) & mask;
            try self.shift(n);
            return u;
        }

        /// Advances the buffer by `n` bits. Returns `error.EndOfStream` if
        /// fewer than `n` bits are buffered.
        pub fn shift(self: *Self, n: u6) !void {
            if (n > self.nbits) return error.EndOfStream;
            self.bits >>= n;
            self.nbits -= n;
        }

        /// Skips `n` bytes.
        pub fn skipBytes(self: *Self, n: u16) !void {
            for (0..n) |_| {
                try self.fill(8);
                try self.shift(8);
            }
        }

        /// Number of bits to skip to align the stream to a byte boundary.
        fn alignBits(self: *Self) u3 {
            return @intCast(self.nbits & 0x7);
        }

        /// Aligns the stream to a byte boundary by dropping the remaining bits
        /// of the current byte.
        pub fn alignToByte(self: *Self) void {
            const ab = self.alignBits();
            // Cannot fail: `ab` is `nbits & 0x7`, so it never exceeds `nbits`.
            if (ab > 0) self.shift(ab) catch unreachable;
        }

        /// Skips a zero terminated string, including its terminator.
        pub fn skipStringZ(self: *Self) !void {
            while (try self.readF(u8, 0) != 0) {}
        }

        /// Reads a deflate fixed code.
        /// Reads the first 7 bits, and then maybe 1 or 2 more to get the full
        /// 7, 8 or 9 bit code.
        /// ref: https://datatracker.ietf.org/doc/html/rfc1951#page-12
        ///
        ///     Lit Value    Bits        Codes
        ///     ---------    ----        -----
        ///       0 - 143     8          00110000 through
        ///                              10111111
        ///     144 - 255     9          110010000 through
        ///                              111111111
        ///     256 - 279     7          0000000 through
        ///                              0010111
        ///     280 - 287     8          11000000 through
        ///                              11000111
        pub fn readFixedCode(self: *Self) !u16 {
            try self.fill(7 + 2);
            const code7 = try self.readF(u7, flag.buffered | flag.reverse);
            if (code7 <= 0b0010_111) { // 7 bits, 256-279, codes 0000_000 - 0010_111
                return @as(u16, code7) + 256;
            } else if (code7 <= 0b1011_111) { // 8 bits, 0-143, codes 0011_0000 through 1011_1111
                return (@as(u16, code7) << 1) + @as(u16, try self.readF(u1, flag.buffered)) - 0b0011_0000;
            } else if (code7 <= 0b1100_011) { // 8 bit, 280-287, codes 1100_0000 - 1100_0111
                return (@as(u16, code7 - 0b1100000) << 1) + try self.readF(u1, flag.buffered) + 280;
            } else { // 9 bit, 144-255, codes 1_1001_0000 - 1_1111_1111
                return (@as(u16, code7 - 0b1100_100) << 2) + @as(u16, try self.readF(u2, flag.buffered | flag.reverse)) + 144;
            }
        }
    };
}
|
|
|
|
// Walks through reads, peeks, shifts and alignment on a small fixed input.
test "flate.BitReader" {
    const input = [_]u8{ 0xf3, 0x48, 0xcd, 0xc9, 0x00, 0x00 };
    var stream = std.io.fixedBufferStream(&input);
    var reader = bitReader(stream.reader());
    const Flags = BitReader(@TypeOf(stream.reader())).flag;

    // init pre-fills the buffer with all six input bytes.
    try testing.expectEqual(@as(u8, 48), reader.nbits);
    try testing.expectEqual(@as(u64, 0xc9cd48f3), reader.bits);

    // Consume 1 + 2 bits.
    try testing.expectEqual(@as(u1, 0b1), try reader.readF(u1, 0));
    try testing.expectEqual(@as(u2, 0b01), try reader.readF(u2, 0));
    try testing.expectEqual(@as(u8, 48 - 3), reader.nbits);
    try testing.expectEqual(@as(u3, 5), reader.alignBits());

    // Peeking leaves the bits in place, shift consumes them.
    try testing.expectEqual(@as(u8, 0b0001_1110), try reader.readF(u8, Flags.peek));
    try testing.expectEqual(@as(u9, 0b1_0001_1110), try reader.readF(u9, Flags.peek));
    try reader.shift(9);
    try testing.expectEqual(@as(u8, 36), reader.nbits);
    try testing.expectEqual(@as(u3, 4), reader.alignBits());

    try testing.expectEqual(@as(u4, 0b0100), try reader.readF(u4, 0));
    try testing.expectEqual(@as(u8, 32), reader.nbits);
    try testing.expectEqual(@as(u3, 0), reader.alignBits());

    // Alignment after single bit shifts.
    try reader.shift(1);
    try testing.expectEqual(@as(u3, 7), reader.alignBits());
    try reader.shift(1);
    try testing.expectEqual(@as(u3, 6), reader.alignBits());
    reader.alignToByte();
    try testing.expectEqual(@as(u3, 0), reader.alignBits());

    // Remaining byte is read nibble by nibble, low nibble first.
    try testing.expectEqual(@as(u64, 0xc9), reader.bits);
    try testing.expectEqual(@as(u16, 0x9), try reader.readN(4, 0));
    try testing.expectEqual(@as(u16, 0xc), try reader.readN(4, 0));
}
|
|
|
|
// Decodes a fixed Huffman (type 1) block by hand and then reads the raw
// bytes that follow it.
test "flate.BitReader read block type 1 data" {
    const input = [_]u8{
        0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1
        0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00,
        0x0c, 0x01, 0x02, 0x03, //
        0xaa, 0xbb, 0xcc, 0xdd,
    };
    var stream = std.io.fixedBufferStream(&input);
    var reader = bitReader(stream.reader());
    const Flags = BitReader(@TypeOf(stream.reader())).flag;

    const bfinal = try reader.readF(u1, 0);
    try testing.expectEqual(@as(u1, 1), bfinal);
    const block_type = try reader.readF(u2, 0);
    try testing.expectEqual(@as(u2, 1), block_type);

    // Every literal is an 8 bit code that is offset by 0x30.
    for ("Hello world\n") |expected_char| {
        const code = try reader.readF(u8, Flags.reverse);
        try testing.expectEqual(@as(u8, expected_char), code - 0x30);
    }
    try testing.expectEqual(@as(u7, 0), try reader.readF(u7, 0)); // end of block

    reader.alignToByte();
    try testing.expectEqual(@as(u32, 0x0302010c), try reader.readF(u32, 0));
    try testing.expectEqual(@as(u16, 0xbbaa), try reader.readF(u16, 0));
    try testing.expectEqual(@as(u16, 0xddcc), try reader.readF(u16, 0));
}
|
|
|
|
// Checks the contents of the internal buffer after init and after refills
// at byte aligned and non byte aligned positions.
test "flate.BitReader init" {
    const input = [_]u8{
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
    };
    var stream = std.io.fixedBufferStream(&input);
    var reader = bitReader(stream.reader());

    // init loads the first eight bytes.
    try testing.expectEqual(@as(u64, 0x08_07_06_05_04_03_02_01), reader.bits);

    try reader.shift(8);
    try testing.expectEqual(@as(u64, 0x00_08_07_06_05_04_03_02), reader.bits);
    try reader.fill(60); // fill with 1 byte
    try testing.expectEqual(@as(u64, 0x01_08_07_06_05_04_03_02), reader.bits);

    try reader.shift(8 * 4 + 4);
    try testing.expectEqual(@as(u64, 0x00_00_00_00_00_10_80_70), reader.bits);

    try reader.fill(60); // fill with 4 bytes (shift by 4)
    try testing.expectEqual(@as(u64, 0x00_50_40_30_20_10_80_70), reader.bits);
    try testing.expectEqual(@as(u8, 8 * 7 + 4), reader.nbits);

    const buffered_bits: u6 = @intCast(reader.nbits);
    try reader.shift(buffered_bits); // clear buffer
    try reader.fill(8); // refill with the rest of the bytes
    try testing.expectEqual(@as(u64, 0x00_00_00_00_00_08_07_06), reader.bits);
}
|
|
|
|
// readAll must return the bytes held in the internal buffer followed by
// the bytes still pending in the forward reader.
test "flate.BitReader readAll" {
    const input = [_]u8{
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
    };
    var stream = std.io.fixedBufferStream(&input);
    var reader = bitReader(stream.reader());

    try testing.expectEqual(@as(u64, 0x08_07_06_05_04_03_02_01), reader.bits);

    var received: [16]u8 = undefined;
    try reader.readAll(&received);

    // The internal buffer is fully drained afterwards.
    try testing.expect(reader.nbits == 0);
    try testing.expect(reader.bits == 0);

    try testing.expectEqualSlices(u8, input[0..16], &received);
}
|
|
|
|
// Decodes the `fixed_codes` table from huffman_encoder.zig and expects the
// symbols 0..285 in order, with no bits left over.
test "flate.BitReader readFixedCode" {
    const fixed_codes = @import("huffman_encoder.zig").fixed_codes;

    var stream = std.io.fixedBufferStream(&fixed_codes);
    var reader = bitReader(stream.reader());

    var symbol: usize = 0;
    while (symbol < 286) : (symbol += 1) {
        try testing.expectEqual(symbol, try reader.readFixedCode());
    }
    try testing.expect(reader.nbits == 0);
}
|