Zig deflate compression/decompression implementation. It supports compression and decompression of gzip, zlib and raw deflate formats. Fixes #18062.

This PR replaces the current compress/gzip and compress/zlib packages. The deflate package is renamed to flate. Flate is the common name for deflate/inflate, where deflate is compression and inflate is decompression.

There are breaking changes. Method signatures changed because the allocator was removed, and the API is now unified across all three namespaces (flate, gzip, zlib). For now the old packages are kept under a v1 namespace, still available as compress/v1/gzip, compress/v1/zlib and compress/v1/deflate. The idea is to give users of the current API a little time before they have to analyze what they need to change, although that raises the question of when it is safe to remove the v1 namespace.

Here is the current API in the compress package:

```Zig
// deflate
fn compressor(allocator, writer, options) !Compressor(@TypeOf(writer))
fn Compressor(comptime WriterType) type
fn decompressor(allocator, reader, null) !Decompressor(@TypeOf(reader))
fn Decompressor(comptime ReaderType: type) type

// gzip
fn compress(allocator, writer, options) !Compress(@TypeOf(writer))
fn Compress(comptime WriterType: type) type
fn decompress(allocator, reader) !Decompress(@TypeOf(reader))
fn Decompress(comptime ReaderType: type) type

// zlib
fn compressStream(allocator, writer, options) !CompressStream(@TypeOf(writer))
fn CompressStream(comptime WriterType: type) type
fn decompressStream(allocator, reader) !DecompressStream(@TypeOf(reader))
fn DecompressStream(comptime ReaderType: type) type

// xz
fn decompress(allocator: Allocator, reader: anytype) !Decompress(@TypeOf(reader))
fn Decompress(comptime ReaderType: type) type

// lzma
fn decompress(allocator, reader) !Decompress(@TypeOf(reader))
fn Decompress(comptime ReaderType: type) type

// lzma2
fn decompress(allocator, reader, writer) !void

// zstandard:
fn DecompressStream(ReaderType, options) type
fn decompressStream(allocator, reader) DecompressStream(@TypeOf(reader), .{})
struct decompress
```

The proposed naming convention:

- Compressor/Decompressor for functions which return a type, like Reader/Writer/GeneralPurposeAllocator
- compressor/decompressor for functions which are initializers for that type, like reader/writer/allocator
- compress/decompress for one-shot operations which accept a reader/writer pair, like read/write/alloc

```Zig
/// Compress from reader and write compressed data to the writer.
fn compress(reader: anytype, writer: anytype, options: Options) !void

/// Create Compressor which outputs to the writer.
fn compressor(writer: anytype, options: Options) !Compressor(@TypeOf(writer))

/// Compressor type
fn Compressor(comptime WriterType: type) type

/// Decompress from reader and write plain data to the writer.
fn decompress(reader: anytype, writer: anytype) !void

/// Create Decompressor which reads from reader.
fn decompressor(reader: anytype) Decompressor(@TypeOf(reader))

/// Decompressor type
fn Decompressor(comptime ReaderType: type) type
```
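To make the one-shot part of this convention concrete, here is a minimal round-trip sketch against the proposed API (illustrative only; it assumes the `std.compress.flate` namespace, default `Options` via `.{}`, and in-memory fixed buffer streams):

```Zig
const std = @import("std");

test "flate one-shot round trip (sketch)" {
    const flate = std.compress.flate;
    const data = "a salad; a salad; a salad";

    // Compress: reads plain bytes from a reader, writes deflate data to a writer.
    var compressed = std.ArrayList(u8).init(std.testing.allocator);
    defer compressed.deinit();
    var plain_in = std.io.fixedBufferStream(data);
    try flate.compress(plain_in.reader(), compressed.writer(), .{});

    // Decompress: reads deflate data from a reader, writes plain bytes to a writer.
    var plain_out = std.ArrayList(u8).init(std.testing.allocator);
    defer plain_out.deinit();
    var compressed_in = std.io.fixedBufferStream(compressed.items);
    try flate.decompress(compressed_in.reader(), plain_out.writer());

    try std.testing.expectEqualStrings(data, plain_out.items);
}
```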
Comparing this implementation with the one we currently have in Zig's standard library (std): std is roughly 1.2-1.4 times slower in decompression and 1.1-1.2 times slower in compression. Compressed sizes are pretty much the same in both cases. More results are in [this](https://github.com/ianic/flate) repo.

This library uses static allocations for all structures and doesn't require an allocator. That makes sense especially for deflate, where all structures and internal buffers are allocated to their full size anyway. It matters a little less for inflate, where the std version uses less memory by not preallocating arrays to their theoretical maximum size, which is usually not fully used. For deflate this library allocates 395K while std allocates 779K. For inflate this library allocates 74.5K while std allocates around 36K. The inflate difference comes from using a 64K history here instead of the 32K used in std.

If merged, existing usage of compress gzip/zlib/deflate needs some changes. Here is an example with the necessary changes in the comments:

```Zig
const std = @import("std");

// To get this file:
// wget -nc -O war_and_peace.txt https://www.gutenberg.org/ebooks/2600.txt.utf-8
const data = @embedFile("war_and_peace.txt");

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer std.debug.assert(gpa.deinit() == .ok);
    const allocator = gpa.allocator();

    try oldDeflate(allocator);
    try new(std.compress.flate, allocator);

    try oldZlib(allocator);
    try new(std.compress.zlib, allocator);

    try oldGzip(allocator);
    try new(std.compress.gzip, allocator);
}

pub fn new(comptime pkg: type, allocator: std.mem.Allocator) !void {
    var buf = std.ArrayList(u8).init(allocator);
    defer buf.deinit();

    // Compressor
    var cmp = try pkg.compressor(buf.writer(), .{});
    _ = try cmp.write(data);
    try cmp.finish();

    var fbs = std.io.fixedBufferStream(buf.items);
    // Decompressor
    var dcp = pkg.decompressor(fbs.reader());

    const plain = try dcp.reader().readAllAlloc(allocator, std.math.maxInt(usize));
    defer allocator.free(plain);
    try std.testing.expectEqualSlices(u8, data, plain);
}

pub fn oldDeflate(allocator: std.mem.Allocator) !void {
    const deflate = std.compress.v1.deflate;

    // Compressor
    var buf = std.ArrayList(u8).init(allocator);
    defer buf.deinit();
    // Remove allocator
    // Rename deflate -> flate
    var cmp = try deflate.compressor(allocator, buf.writer(), .{});
    _ = try cmp.write(data);
    try cmp.close(); // Rename to finish
    cmp.deinit(); // Remove

    // Decompressor
    var fbs = std.io.fixedBufferStream(buf.items);
    // Remove allocator and last param
    // Rename deflate -> flate
    // Remove try
    var dcp = try deflate.decompressor(allocator, fbs.reader(), null);
    defer dcp.deinit(); // Remove

    const plain = try dcp.reader().readAllAlloc(allocator, std.math.maxInt(usize));
    defer allocator.free(plain);
    try std.testing.expectEqualSlices(u8, data, plain);
}

pub fn oldZlib(allocator: std.mem.Allocator) !void {
    const zlib = std.compress.v1.zlib;

    var buf = std.ArrayList(u8).init(allocator);
    defer buf.deinit();

    // Compressor
    // Rename compressStream => compressor
    // Remove allocator
    var cmp = try zlib.compressStream(allocator, buf.writer(), .{});
    _ = try cmp.write(data);
    try cmp.finish();
    cmp.deinit(); // Remove

    var fbs = std.io.fixedBufferStream(buf.items);
    // Decompressor
    // Rename decompressStream => decompressor
    // Remove allocator
    // Remove try
    var dcp = try zlib.decompressStream(allocator, fbs.reader());
    defer dcp.deinit(); // Remove

    const plain = try dcp.reader().readAllAlloc(allocator, std.math.maxInt(usize));
    defer allocator.free(plain);
    try std.testing.expectEqualSlices(u8, data, plain);
}

pub fn oldGzip(allocator: std.mem.Allocator) !void {
    const gzip = std.compress.v1.gzip;

    var buf = std.ArrayList(u8).init(allocator);
    defer buf.deinit();

    // Compressor
    // Rename compress => compressor
    // Remove allocator
    var cmp = try gzip.compress(allocator, buf.writer(), .{});
    _ = try cmp.write(data);
    try cmp.close(); // Rename to finish
    cmp.deinit(); // Remove

    var fbs = std.io.fixedBufferStream(buf.items);
    // Decompressor
    // Rename decompress => decompressor
    // Remove allocator
    // Remove try
    var dcp = try gzip.decompress(allocator, fbs.reader());
    defer dcp.deinit(); // Remove

    const plain = try dcp.reader().readAllAlloc(allocator, std.math.maxInt(usize));
    defer allocator.free(plain);
    try std.testing.expectEqualSlices(u8, data, plain);
}
```
/// 64K buffer of uncompressed data created during inflate (decompression). Has
/// enough history to support writing a match <length, distance>: copying `length`
/// bytes from the position `distance` backward from the current write position.
///
/// Reads can return less than the available bytes if the data is spread across
/// different circles (wrap-arounds of the buffer). So reads should be repeated
/// until the required number of bytes is collected or the returned slice has
/// zero length.
///
/// Note on deflate limits:
///  * a non-compressible block is limited to 65,535 bytes
///  * a backward pointer is limited to a distance of 32K bytes and a length of 258 bytes
///
/// A whole non-compressed block can therefore be written without overlap: we
/// always have a history of up to 64K, more than the 32K needed.
///
const std = @import("std");
const assert = std.debug.assert;
const testing = std.testing;

const consts = @import("consts.zig").match;

const mask = 0xffff; // 64K - 1
const buffer_len = mask + 1; // 64K buffer

const Self = @This();

buffer: [buffer_len]u8 = undefined,
wp: usize = 0, // write position
rp: usize = 0, // read position
fn writeAll(self: *Self, buf: []const u8) void {
    for (buf) |c| self.write(c);
}

// Write literal.
pub fn write(self: *Self, b: u8) void {
    assert(self.wp - self.rp < mask);
    self.buffer[self.wp & mask] = b;
    self.wp += 1;
}

// Write match (back-reference to the same data slice) starting at `distance`
// back from the current write position, copying `length` bytes.
pub fn writeMatch(self: *Self, length: u16, distance: u16) !void {
    if (self.wp < distance or
        length < consts.base_length or length > consts.max_length or
        distance < consts.min_distance or distance > consts.max_distance)
    {
        return error.InvalidMatch;
    }
    assert(self.wp - self.rp < mask);

    var from: usize = self.wp - distance;
    const from_end: usize = from + length;
    var to: usize = self.wp;
    const to_end: usize = to + length;

    self.wp += length;

    // Fast path using memcpy
    if (length <= distance and // no overlapping buffers
        (from >> 16 == from_end >> 16) and // start and end in the same circle
        (to >> 16 == to_end >> 16))
    {
        @memcpy(self.buffer[to & mask .. to_end & mask], self.buffer[from & mask .. from_end & mask]);
        return;
    }

    // Slow path, byte by byte
    while (to < to_end) {
        self.buffer[to & mask] = self.buffer[from & mask];
        to += 1;
        from += 1;
    }
}
// Returns a writable part of the internal buffer of size `n` at most. Advances
// the write pointer, assuming that the returned buffer will be filled with data.
pub fn getWritable(self: *Self, n: usize) []u8 {
    const wp = self.wp & mask;
    const len = @min(n, buffer_len - wp);
    self.wp += len;
    return self.buffer[wp .. wp + len];
}

// Read available data. Can return only part of the available data if it is
// spread across two circles. So read until this returns a zero-length slice.
pub fn read(self: *Self) []const u8 {
    return self.readAtMost(buffer_len);
}

// Read part of the available data. Can return less than `limit` even if more
// decoded data is available.
pub fn readAtMost(self: *Self, limit: usize) []const u8 {
    const rb = self.readBlock(if (limit == 0) buffer_len else limit);
    defer self.rp += rb.len;
    return self.buffer[rb.head..rb.tail];
}

const ReadBlock = struct {
    head: usize,
    tail: usize,
    len: usize,
};

// Returns the position of the continuous read block data.
fn readBlock(self: *Self, max: usize) ReadBlock {
    const r = self.rp & mask;
    const w = self.wp & mask;
    const n = @min(
        max,
        if (w >= r) w - r else buffer_len - r,
    );
    return .{
        .head = r,
        .tail = r + n,
        .len = n,
    };
}

// Number of free bytes available for writing.
pub fn free(self: *Self) usize {
    return buffer_len - (self.wp - self.rp);
}

// Full if the largest match can't fit. 258 is the largest match length; that
// many bytes can be produced in a single decode step.
pub fn full(self: *Self) bool {
    return self.free() < 258 + 1;
}
// example from: https://youtu.be/SJPvNi4HrWQ?t=3558
test "flate.CircularBuffer writeMatch" {
    var cb: Self = .{};

    cb.writeAll("a salad; ");
    try cb.writeMatch(5, 9);
    try cb.writeMatch(3, 3);

    try testing.expectEqualStrings("a salad; a salsal", cb.read());
}

test "flate.CircularBuffer writeMatch overlap" {
    var cb: Self = .{};

    cb.writeAll("a b c ");
    try cb.writeMatch(8, 4);
    cb.write('d');

    try testing.expectEqualStrings("a b c b c b c d", cb.read());
}

test "flate.CircularBuffer readAtMost" {
    var cb: Self = .{};

    cb.writeAll("0123456789");
    try cb.writeMatch(50, 10);

    try testing.expectEqualStrings("0123456789" ** 6, cb.buffer[cb.rp..cb.wp]);
    for (0..6) |i| {
        try testing.expectEqual(i * 10, cb.rp);
        try testing.expectEqualStrings("0123456789", cb.readAtMost(10));
    }
    try testing.expectEqualStrings("", cb.readAtMost(10));
    try testing.expectEqualStrings("", cb.read());
}

test "flate.CircularBuffer" {
    var cb: Self = .{};

    const data = "0123456789abcdef" ** (1024 / 16);
    cb.writeAll(data);
    try testing.expectEqual(@as(usize, 0), cb.rp);
    try testing.expectEqual(@as(usize, 1024), cb.wp);
    try testing.expectEqual(@as(usize, 1024 * 63), cb.free());

    for (0..62 * 4) |_|
        try cb.writeMatch(256, 1024); // write 62K

    try testing.expectEqual(@as(usize, 0), cb.rp);
    try testing.expectEqual(@as(usize, 63 * 1024), cb.wp);
    try testing.expectEqual(@as(usize, 1024), cb.free());

    cb.writeAll(data[0..200]);
    _ = cb.readAtMost(1024); // make some space
    cb.writeAll(data); // overflows write position
    try testing.expectEqual(@as(usize, 200 + 65536), cb.wp);
    try testing.expectEqual(@as(usize, 1024), cb.rp);
    try testing.expectEqual(@as(usize, 1024 - 200), cb.free());

    const rb = cb.readBlock(Self.buffer_len);
    try testing.expectEqual(@as(usize, 65536 - 1024), rb.len);
    try testing.expectEqual(@as(usize, 1024), rb.head);
    try testing.expectEqual(@as(usize, 65536), rb.tail);

    try testing.expectEqual(@as(usize, 65536 - 1024), cb.read().len); // read to the end of the buffer
    try testing.expectEqual(@as(usize, 200 + 65536), cb.wp);
    try testing.expectEqual(@as(usize, 65536), cb.rp);
    try testing.expectEqual(@as(usize, 65536 - 200), cb.free());

    try testing.expectEqual(@as(usize, 200), cb.read().len); // read the rest
}

test "flate.CircularBuffer write overlap" {
    var cb: Self = .{};
    cb.wp = cb.buffer.len - 15;
    cb.rp = cb.wp;

    cb.writeAll("0123456789");
    cb.writeAll("abcdefghij");

    try testing.expectEqual(cb.buffer.len + 5, cb.wp);
    try testing.expectEqual(cb.buffer.len - 15, cb.rp);

    try testing.expectEqualStrings("0123456789abcde", cb.read());
    try testing.expectEqualStrings("fghij", cb.read());

    try testing.expect(cb.wp == cb.rp);
}

test "flate.CircularBuffer writeMatch/read overlap" {
    var cb: Self = .{};
    cb.wp = cb.buffer.len - 15;
    cb.rp = cb.wp;

    cb.writeAll("0123456789");
    try cb.writeMatch(15, 5);

    try testing.expectEqualStrings("012345678956789", cb.read());
    try testing.expectEqualStrings("5678956789", cb.read());

    try cb.writeMatch(20, 25);
    try testing.expectEqualStrings("01234567895678956789", cb.read());
}
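
// Illustrative sketch (assumed usage, not part of the original file): read() can
// return only part of the available data when it wraps around the end of the
// buffer, so callers drain it in a loop until an empty slice is returned.
test "flate.CircularBuffer drain in a loop" {
    var cb: Self = .{};
    cb.wp = cb.buffer.len - 4; // start near the end so the written data wraps
    cb.rp = cb.wp;

    cb.writeAll("01234567");

    var drained: usize = 0;
    while (true) {
        const chunk = cb.read();
        if (chunk.len == 0) break;
        drained += chunk.len;
    }
    try testing.expectEqual(@as(usize, 8), drained);
}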