zig/lib/std/compress/flate/container.zig
Igor Anić d645114f7e add deflate implemented from first principles
Zig deflate compression/decompression implementation. It supports compression and decompression of gzip, zlib and raw deflate format.

Fixes #18062.

This PR replaces current compress/gzip and compress/zlib packages. Deflate package is renamed to flate. Flate is common name for deflate/inflate where deflate is compression and inflate decompression.

There are breaking changes. Method signatures have changed because the allocator was removed, and I also unified the API across all three namespaces (flate, gzip, zlib).

Currently I have put the old packages under a v1 namespace; they are still available as compress/v1/gzip, compress/v1/zlib, compress/v1/deflate. The idea is to give users of the current API a little time to postpone analyzing what they have to change, although that raises the question of when it is safe to remove the v1 namespace.

Here is current API in the compress package:

```Zig
// deflate
    fn compressor(allocator, writer, options) !Compressor(@TypeOf(writer))
    fn Compressor(comptime WriterType) type

    fn decompressor(allocator, reader, null) !Decompressor(@TypeOf(reader))
    fn Decompressor(comptime ReaderType: type) type

// gzip
    fn compress(allocator, writer, options) !Compress(@TypeOf(writer))
    fn Compress(comptime WriterType: type) type

    fn decompress(allocator, reader) !Decompress(@TypeOf(reader))
    fn Decompress(comptime ReaderType: type) type

// zlib
    fn compressStream(allocator, writer, options) !CompressStream(@TypeOf(writer))
    fn CompressStream(comptime WriterType: type) type

    fn decompressStream(allocator, reader) !DecompressStream(@TypeOf(reader))
    fn DecompressStream(comptime ReaderType: type) type

// xz
   fn decompress(allocator: Allocator, reader: anytype) !Decompress(@TypeOf(reader))
   fn Decompress(comptime ReaderType: type) type

// lzma
    fn decompress(allocator, reader) !Decompress(@TypeOf(reader))
    fn Decompress(comptime ReaderType: type) type

// lzma2
    fn decompress(allocator, reader, writer) !void

// zstandard:
    fn DecompressStream(ReaderType, options) type
    fn decompressStream(allocator, reader) DecompressStream(@TypeOf(reader), .{})
    struct decompress
```

The proposed naming convention:
 - Compressor/Decompressor for functions which return type, like Reader/Writer/GeneralPurposeAllocator
 - compressor/decompressor for functions which are initializers for that type, like reader/writer/allocator
 - compress/decompress for one shot operations, accepts reader/writer pair, like read/write/alloc

```Zig
/// Compress from reader and write compressed data to the writer.
fn compress(reader: anytype, writer: anytype, options: Options) !void

/// Create Compressor which outputs the writer.
fn compressor(writer: anytype, options: Options) !Compressor(@TypeOf(writer))

/// Compressor type
fn Compressor(comptime WriterType: type) type

/// Decompress from reader and write plain data to the writer.
fn decompress(reader: anytype, writer: anytype) !void

/// Create Decompressor which reads from reader.
fn decompressor(reader: anytype) Decompressor(@TypeOf(reader))

/// Decompressor type
fn Decompressor(comptime ReaderType: type) type

```

Comparing this implementation with the one we currently have in Zig's standard library (std).
Std is roughly 1.2-1.4 times slower in decompression, and 1.1-1.2 times slower in compression. Compressed sizes are pretty much same in both cases.
More results in [this](https://github.com/ianic/flate) repo.

This library uses static allocations for all structures and doesn't require an allocator. That makes sense especially for deflate, where all structures and internal buffers are allocated to their full size. It makes a little less sense for inflate, where the std version uses less memory by not preallocating arrays to their theoretical maximum size, since they are usually not fully used.

For deflate this library allocates 395K while std 779K.
For inflate this library allocates 74.5K while std around 36K.

Inflate difference is because we here use 64K history instead of 32K in std.

If merged, existing usages of compress gzip/zlib/deflate need some changes. Here is an example with the necessary changes in comments:

```Zig

const std = @import("std");

// To get this file:
// wget -nc -O war_and_peace.txt https://www.gutenberg.org/ebooks/2600.txt.utf-8
const data = @embedFile("war_and_peace.txt");

/// Entry point of the migration example: runs each old (v1) API round trip
/// followed by the same round trip through the new unified API, for the
/// flate, zlib and gzip namespaces in turn.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    // Asserts that deinit() of the allocator reports `.ok` when main returns.
    defer std.debug.assert(gpa.deinit() == .ok);
    const allocator = gpa.allocator();

    try oldDeflate(allocator);
    try new(std.compress.flate, allocator);

    try oldZlib(allocator);
    try new(std.compress.zlib, allocator);

    try oldGzip(allocator);
    try new(std.compress.gzip, allocator);
}

/// Round trip through the new API: compresses `data` with `pkg.compressor`
/// into an in-memory buffer, decompresses it again with `pkg.decompressor`
/// and checks that the result equals `data`.
///
/// `pkg` is a namespace exposing that compressor/decompressor pair (main
/// passes flate, zlib and gzip). `allocator` is used only for the output
/// buffer and for the decompressed copy; the compressor and decompressor
/// themselves are created without it.
pub fn new(comptime pkg: type, allocator: std.mem.Allocator) !void {
    var buf = std.ArrayList(u8).init(allocator);
    defer buf.deinit();

    // Compressor
    var cmp = try pkg.compressor(buf.writer(), .{});
    _ = try cmp.write(data);
    try cmp.finish();

    var fbs = std.io.fixedBufferStream(buf.items);
    // Decompressor
    var dcp = pkg.decompressor(fbs.reader());

    const plain = try dcp.reader().readAllAlloc(allocator, std.math.maxInt(usize));
    defer allocator.free(plain);
    try std.testing.expectEqualSlices(u8, data, plain);
}

/// Round trip through the old `std.compress.v1.deflate` API: compresses
/// `data` into an in-memory buffer, decompresses it and checks that the
/// result equals `data`.
///
/// The inline "Remove"/"Rename" comments mark what has to change on each
/// line to migrate this code to the new flate API.
pub fn oldDeflate(allocator: std.mem.Allocator) !void {
    const deflate = std.compress.v1.deflate;

    // Compressor
    var buf = std.ArrayList(u8).init(allocator);
    defer buf.deinit();
    // Remove allocator
    // Rename deflate -> flate
    var cmp = try deflate.compressor(allocator, buf.writer(), .{});
    _ = try cmp.write(data);
    try cmp.close(); // Rename to finish
    cmp.deinit(); // Remove

    // Decompressor
    var fbs = std.io.fixedBufferStream(buf.items);
    // Remove allocator and last param
    // Rename deflate -> flate
    // Remove try
    var dcp = try deflate.decompressor(allocator, fbs.reader(), null);
    defer dcp.deinit(); // Remove

    const plain = try dcp.reader().readAllAlloc(allocator, std.math.maxInt(usize));
    defer allocator.free(plain);
    try std.testing.expectEqualSlices(u8, data, plain);
}

/// Round trip through the old `std.compress.v1.zlib` API: compresses `data`
/// into an in-memory buffer, decompresses it and checks that the result
/// equals `data`.
///
/// The inline "Remove"/"Rename" comments mark what has to change on each
/// line to migrate this code to the new zlib API.
pub fn oldZlib(allocator: std.mem.Allocator) !void {
    const zlib = std.compress.v1.zlib;

    var buf = std.ArrayList(u8).init(allocator);
    defer buf.deinit();

    // Compressor
    // Rename compressStream => compressor
    // Remove allocator
    var cmp = try zlib.compressStream(allocator, buf.writer(), .{});
    _ = try cmp.write(data);
    try cmp.finish();
    cmp.deinit(); // Remove

    var fbs = std.io.fixedBufferStream(buf.items);
    // Decompressor
    // Rename decompressStream => decompressor
    // Remove allocator
    // Remove try
    var dcp = try zlib.decompressStream(allocator, fbs.reader());
    defer dcp.deinit(); // Remove

    const plain = try dcp.reader().readAllAlloc(allocator, std.math.maxInt(usize));
    defer allocator.free(plain);
    try std.testing.expectEqualSlices(u8, data, plain);
}

/// Round trip through the old `std.compress.v1.gzip` API: compresses `data`
/// into an in-memory buffer, decompresses it and checks that the result
/// equals `data`.
///
/// The inline "Remove"/"Rename" comments mark what has to change on each
/// line to migrate this code to the new gzip API.
pub fn oldGzip(allocator: std.mem.Allocator) !void {
    const gzip = std.compress.v1.gzip;

    var buf = std.ArrayList(u8).init(allocator);
    defer buf.deinit();

    // Compressor
    // Rename compress => compressor
    // Remove allocator
    var cmp = try gzip.compress(allocator, buf.writer(), .{});
    _ = try cmp.write(data);
    try cmp.close(); // Rename to finish
    cmp.deinit(); // Remove

    var fbs = std.io.fixedBufferStream(buf.items);
    // Decompressor
    // Rename decompress => decompressor
    // Remove allocator
    // Remove try
    var dcp = try gzip.decompress(allocator, fbs.reader());
    defer dcp.deinit(); // Remove

    const plain = try dcp.reader().readAllAlloc(allocator, std.math.maxInt(usize));
    defer allocator.free(plain);
    try std.testing.expectEqualSlices(u8, data, plain);
}

```
2024-02-14 18:28:20 +01:00

206 lines
7.2 KiB
Zig

const std = @import("std");
/// Container of the deflate bit stream body. Container adds a header before
/// the deflate bit stream and a footer after it. It can be gzip, zlib or raw
/// (no header, no footer, raw bit stream).
///
/// Zlib format is defined in rfc 1950. Header has 2 bytes and footer 4 bytes
/// adler 32 checksum.
///
/// Gzip format is defined in rfc 1952. Header has 10+ bytes and footer 4 bytes
/// crc32 checksum and 4 bytes of uncompressed data length.
///
///
/// rfc 1950: https://datatracker.ietf.org/doc/html/rfc1950#page-4
/// rfc 1952: https://datatracker.ietf.org/doc/html/rfc1952#page-5
///
pub const Container = enum {
    raw, // no header or footer
    gzip, // gzip header and footer
    zlib, // zlib header and footer

    /// Combined size in bytes of the header and footer, as reported by
    /// `headerSize` and `footerSize`.
    pub fn size(w: Container) usize {
        return headerSize(w) + footerSize(w);
    }

    /// Size in bytes of the header emitted by `writeHeader`. For gzip this is
    /// the fixed 10 byte part only; optional gzip fields that `parseHeader`
    /// skips while reading are not counted.
    pub fn headerSize(w: Container) usize {
        return switch (w) {
            .gzip => 10,
            .zlib => 2,
            .raw => 0,
        };
    }

    /// Size in bytes of the footer emitted by `writeFooter`.
    pub fn footerSize(w: Container) usize {
        return switch (w) {
            .gzip => 8,
            .zlib => 4,
            .raw => 0,
        };
    }

    /// All container kinds, convenient for iterating over every variant.
    pub const list = [_]Container{ .raw, .gzip, .zlib };

    /// Errors produced by header and footer validation in this namespace.
    /// Errors coming from the reader/writer are returned in addition to these.
    pub const Error = error{
        BadGzipHeader,
        BadZlibHeader,
        WrongGzipChecksum,
        WrongGzipSize,
        WrongZlibChecksum,
    };

    /// Writes the fixed container header for `wrap` to `writer`.
    /// Nothing is written for `.raw`. Writer errors are propagated.
    pub fn writeHeader(comptime wrap: Container, writer: anytype) !void {
        switch (wrap) {
            .gzip => {
                // GZIP 10 byte header (https://datatracker.ietf.org/doc/html/rfc1952#page-5):
                //  - ID1 (IDentification 1), always 0x1f
                //  - ID2 (IDentification 2), always 0x8b
                //  - CM (Compression Method), always 8 = deflate
                //  - FLG (Flags), all set to 0
                //  - 4 bytes, MTIME (Modification time), not used, all set to zero
                //  - XFL (eXtra FLags), all set to zero
                //  - OS (Operating System), 03 = Unix
                const gzipHeader = [_]u8{ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 };
                try writer.writeAll(&gzipHeader);
            },
            .zlib => {
                // ZLIB has a two-byte header (https://datatracker.ietf.org/doc/html/rfc1950#page-4):
                // 1st byte:
                //  - First four bits is the CINFO (compression info), which is 7 for the default deflate window size.
                //  - The next four bits is the CM (compression method), which is 8 for deflate.
                // 2nd byte:
                //  - Two bits is the FLEVEL (compression level). Values are: 0=fastest, 1=fast, 2=default, 3=best.
                //  - The next bit, FDICT, is set if a dictionary is given.
                //  - The final five FCHECK bits form a mod-31 checksum.
                //
                // CINFO = 7, CM = 8, FLEVEL = 0b10, FDICT = 0, FCHECK = 0b11100
                const zlibHeader = [_]u8{ 0x78, 0b10_0_11100 };
                try writer.writeAll(&zlibHeader);
            },
            .raw => {},
        }
    }

    /// Writes the container footer for `wrap` to `writer`, using the checksum
    /// (and, for gzip, the byte count) accumulated in `hasher`.
    /// Nothing is written for `.raw`. Writer errors are propagated.
    pub fn writeFooter(comptime wrap: Container, hasher: *Hasher(wrap), writer: anytype) !void {
        var bits: [4]u8 = undefined;
        switch (wrap) {
            .gzip => {
                // GZIP 8 bytes footer
                //  - 4 bytes, CRC32 (CRC-32)
                //  - 4 bytes, ISIZE (Input SIZE) - size of the original (uncompressed) input data modulo 2^32
                std.mem.writeInt(u32, &bits, hasher.chksum(), .little);
                try writer.writeAll(&bits);
                std.mem.writeInt(u32, &bits, hasher.bytesRead(), .little);
                try writer.writeAll(&bits);
            },
            .zlib => {
                // ZLIB (RFC 1950) is big-endian, unlike GZIP (RFC 1952).
                // 4 bytes of ADLER32 (Adler-32 checksum)
                // Checksum value of the uncompressed data (excluding any
                // dictionary data) computed according to Adler-32
                // algorithm.
                std.mem.writeInt(u32, &bits, hasher.chksum(), .big);
                try writer.writeAll(&bits);
            },
            .raw => {},
        }
    }

    /// Reads and validates the container header for `wrap` from `reader`.
    /// Returns `error.BadGzipHeader` / `error.BadZlibHeader` when validation
    /// fails; reader errors are propagated. Nothing is read for `.raw`.
    pub fn parseHeader(comptime wrap: Container, reader: anytype) !void {
        switch (wrap) {
            .gzip => try parseGzipHeader(reader),
            .zlib => try parseZlibHeader(reader),
            .raw => {},
        }
    }

    /// Consumes a gzip header: the fixed 10 bytes plus whichever optional
    /// fields the FLG byte announces. Only the magic bytes and the
    /// compression method are validated.
    fn parseGzipHeader(reader: anytype) !void {
        const magic1 = try reader.read(u8);
        const magic2 = try reader.read(u8);
        const method = try reader.read(u8);
        const flags = try reader.read(u8);
        try reader.skipBytes(6); // mtime(4), xflags, os
        if (magic1 != 0x1f or magic2 != 0x8b or method != 0x08)
            return error.BadGzipHeader;
        // Flags description: https://www.rfc-editor.org/rfc/rfc1952.html#page-5
        if (flags != 0) {
            if (flags & 0b0000_0100 != 0) { // FEXTRA
                const extra_len = try reader.read(u16);
                try reader.skipBytes(extra_len);
            }
            if (flags & 0b0000_1000 != 0) { // FNAME
                try reader.skipStringZ();
            }
            if (flags & 0b0001_0000 != 0) { // FCOMMENT
                try reader.skipStringZ();
            }
            if (flags & 0b0000_0010 != 0) { // FHCRC
                try reader.skipBytes(2);
            }
        }
    }

    /// Consumes the two byte zlib header (CMF, FLG).
    ///
    /// CMF holds CM (compression method) in its low nibble and CINFO
    /// (log2(window size) - 8) in its high nibble. RFC 1950 requires CM = 8
    /// for deflate and allows any CINFO up to 7, so streams produced with a
    /// window smaller than 32K (for example header bytes 0x58 0x85) are valid
    /// and must be accepted, not only the common 0x78.
    fn parseZlibHeader(reader: anytype) !void {
        const cmf: u8 = try reader.read(u8);
        // FLG (FLEVEL, FDICT, FCHECK) is consumed but not validated.
        _ = try reader.read(u8);
        const cm = cmf & 0x0f;
        const cinfo = cmf >> 4;
        if (cm != 8 or cinfo > 7) {
            return error.BadZlibHeader;
        }
    }

    /// Reads the container footer for `wrap` from `reader` and compares it
    /// with the values accumulated in `hasher`. Returns
    /// `error.WrongGzipChecksum`, `error.WrongGzipSize` or
    /// `error.WrongZlibChecksum` on mismatch; reader errors are propagated.
    /// Nothing is read for `.raw`.
    pub fn parseFooter(comptime wrap: Container, hasher: *Hasher(wrap), reader: anytype) !void {
        switch (wrap) {
            .gzip => {
                if (try reader.read(u32) != hasher.chksum()) return error.WrongGzipChecksum;
                if (try reader.read(u32) != hasher.bytesRead()) return error.WrongGzipSize;
            },
            .zlib => {
                // The zlib footer is big-endian (see writeFooter), so the
                // computed checksum is byte swapped before comparing it with
                // the u32 obtained from the reader.
                const chksum: u32 = @byteSwap(hasher.chksum());
                if (try reader.read(u32) != chksum) return error.WrongZlibChecksum;
            },
            .raw => {},
        }
    }

    /// Returns the checksum accumulator type for `wrap`: Crc32 for gzip,
    /// Adler32 for zlib and a no-op placeholder for raw. It also counts the
    /// bytes fed to it, which the gzip footer needs.
    pub fn Hasher(comptime wrap: Container) type {
        const HasherType = switch (wrap) {
            .gzip => std.hash.Crc32,
            .zlib => std.hash.Adler32,
            .raw => struct {
                pub fn init() @This() {
                    return .{};
                }
            },
        };

        return struct {
            hasher: HasherType = HasherType.init(),
            bytes: usize = 0,

            const Self = @This();

            /// Feeds `buf` into the checksum and adds its length to the byte
            /// count. Does nothing for the raw container.
            pub fn update(self: *Self, buf: []const u8) void {
                switch (wrap) {
                    .raw => {},
                    else => {
                        self.hasher.update(buf);
                        // Only the count modulo 2^32 is ever used (see
                        // bytesRead), so wrap instead of trapping on overflow;
                        // this matters where usize is 32 bits wide.
                        self.bytes +%= buf.len;
                    },
                }
            }

            /// Checksum of everything passed to `update`; always 0 for raw.
            pub fn chksum(self: *Self) u32 {
                switch (wrap) {
                    .raw => return 0,
                    else => return self.hasher.final(),
                }
            }

            /// Number of bytes passed to `update`, modulo 2^32.
            pub fn bytesRead(self: *Self) u32 {
                return @truncate(self.bytes);
            }
        };
    }
};