// zig/lib/std/compress/gzip.zig

//
// Compressor/Decompressor for GZIP data streams (RFC1952)

const std = @import("../std.zig");
const io = std.io;
const fs = std.fs;
const testing = std.testing;
const mem = std.mem;
const deflate = @import("deflate.zig");

// The two identification bytes (ID1, ID2) expected at the start of a gzip
// header; checked by the decompressor and emitted by the compressor.
const magic = &[2]u8{ 0x1f, 0x8b };

// Flags for the FLG field in the header
// FTEXT is declared for completeness; nothing in this file references it.
const FTEXT = 1 << 0;
// A CRC16 of the header follows the optional header fields.
const FHCRC = 1 << 1;
// A length-prefixed "extra" field is present.
const FEXTRA = 1 << 2;
// A zero-terminated file name is present.
const FNAME = 1 << 3;
// A zero-terminated comment is present.
const FCOMMENT = 1 << 4;

// Upper bound handed to `readUntilDelimiterAlloc` when the decompressor reads
// the zero-terminated file name and comment header fields.
const max_string_len = 1024;

/// Informational fields carried by a gzip member header (RFC 1952).
///
/// The decompressor fills this in from the stream (its optional slices are
/// allocated by, and freed with, the decompressor); the compressor reads it
/// from `CompressOptions` and writes it out.
pub const Header = struct {
    /// MTIME field: modification time as a Unix timestamp.
    /// Zero means no timestamp is available.
    modification_time: u32 = 0,
    /// OS field: the operating system on which compression took place.
    /// 255 is the "unknown" value in RFC 1952.
    operating_system: u8 = 255,
    /// Payload of the optional FEXTRA field, or null when absent.
    extra: ?[]const u8 = null,
    /// Optional original file name (without its zero terminator).
    filename: ?[]const u8 = null,
    /// Optional comment (without its zero terminator).
    comment: ?[]const u8 = null,
};

/// Returns a streaming decompressor type for gzip data (RFC 1952) that pulls
/// compressed bytes from a reader of type `ReaderType`.
///
/// `init` parses and validates the member header; `read` yields the
/// decompressed bytes and, once the DEFLATE stream is exhausted, verifies the
/// CRC32 and ISIZE trailer fields.
pub fn Decompress(comptime ReaderType: type) type {
    return struct {
        const Self = @This();

        pub const Error = ReaderType.Error ||
            deflate.Decompressor(ReaderType).Error ||
            error{ CorruptedData, WrongChecksum };
        pub const Reader = io.Reader(*Self, Error, read);

        /// Allocator used for the header fields stored in `info`; `deinit`
        /// frees them with it.
        allocator: mem.Allocator,
        /// DEFLATE decompressor reading from `in_reader`.
        inflater: deflate.Decompressor(ReaderType),
        /// Source of the compressed stream; also used to read the trailer.
        in_reader: ReaderType,
        /// CRC32 of all decompressed bytes returned by `read` so far.
        hasher: std.hash.Crc32,
        /// Number of decompressed bytes returned by `read`, modulo 2^32.
        read_amt: u32,

        /// Fields parsed from the gzip header. The optional slices are owned
        /// by this decompressor and released by `deinit`.
        info: Header,

        /// Set once the trailer has been read and verified, so that further
        /// calls to `read` keep reporting end of stream instead of consuming
        /// more bytes from `in_reader`.
        trailer_verified: bool = false,

        /// Parses the gzip header from `in_reader` and sets up the inflater.
        ///
        /// Fails with `error.BadHeader` on wrong magic bytes,
        /// `error.InvalidCompression` when CM is not 8 (DEFLATE), and
        /// `error.WrongChecksum` when the optional header CRC16 mismatches.
        /// Errors from the reader and the allocator are propagated.
        fn init(allocator: mem.Allocator, in_reader: ReaderType) !Self {
            // Everything read through `hashed_reader` contributes to the
            // header CRC that is checked when FHCRC is set.
            var hasher = std.compress.hashedReader(in_reader, std.hash.Crc32.init());
            const hashed_reader = hasher.reader();

            // gzip header format is specified in RFC1952
            const header = try hashed_reader.readBytesNoEof(10);

            // Check the ID1/ID2 fields
            if (!mem.eql(u8, header[0..2], magic))
                return error.BadHeader;

            const CM = header[2];
            // The CM field must be 8 to indicate the use of DEFLATE
            if (CM != 8) return error.InvalidCompression;
            // Flags
            const FLG = header[3];
            // Modification time, as a Unix timestamp.
            // If zero there's no timestamp available.
            const MTIME = mem.readInt(u32, header[4..8], .little);
            // header[8] holds the extra flags (XFL); they are not used here.
            // Operating system where the compression took place
            const OS = header[9];

            const extra = if (FLG & FEXTRA != 0) blk: {
                // The extra field is prefixed by its 16-bit length.
                const len = try hashed_reader.readInt(u16, .little);
                const tmp_buf = try allocator.alloc(u8, len);
                errdefer allocator.free(tmp_buf);

                try hashed_reader.readNoEof(tmp_buf);
                break :blk tmp_buf;
            } else null;
            errdefer if (extra) |p| allocator.free(p);

            const filename = if (FLG & FNAME != 0)
                try hashed_reader.readUntilDelimiterAlloc(allocator, 0, max_string_len)
            else
                null;
            errdefer if (filename) |p| allocator.free(p);

            const comment = if (FLG & FCOMMENT != 0)
                try hashed_reader.readUntilDelimiterAlloc(allocator, 0, max_string_len)
            else
                null;
            errdefer if (comment) |p| allocator.free(p);

            if (FLG & FHCRC != 0) {
                // Read the stored CRC16 straight from `in_reader` so that it
                // is not itself folded into the computed header checksum.
                const hash = try in_reader.readInt(u16, .little);
                if (hash != @as(u16, @truncate(hasher.hasher.final())))
                    return error.WrongChecksum;
            }

            return .{
                .allocator = allocator,
                .inflater = try deflate.decompressor(allocator, in_reader, null),
                .in_reader = in_reader,
                // Fresh CRC32: the trailer checksum covers the decompressed
                // data only, not the header.
                .hasher = std.hash.Crc32.init(),
                .info = .{
                    .filename = filename,
                    .comment = comment,
                    .extra = extra,
                    .modification_time = MTIME,
                    .operating_system = OS,
                },
                .read_amt = 0,
                .trailer_verified = false,
            };
        }

        /// Releases the inflater and any header fields allocated by `init`.
        pub fn deinit(self: *Self) void {
            self.inflater.deinit();
            if (self.info.extra) |extra|
                self.allocator.free(extra);
            if (self.info.filename) |filename|
                self.allocator.free(filename);
            if (self.info.comment) |comment|
                self.allocator.free(comment);
        }

        /// Implements the io.Reader interface
        ///
        /// Returns the number of decompressed bytes stored in `buffer`, or 0
        /// at end of stream. The first time the end is reached the trailer is
        /// read from `in_reader`: a CRC32 mismatch yields
        /// `error.WrongChecksum` and an ISIZE mismatch `error.CorruptedData`.
        /// After a successful verification every further call returns 0
        /// without touching `in_reader` again.
        pub fn read(self: *Self, buffer: []u8) Error!usize {
            if (buffer.len == 0)
                return 0;

            // The trailer must be consumed exactly once; reading it again
            // would either fail or swallow bytes that follow this member.
            if (self.trailer_verified)
                return 0;

            // Read from the compressed stream and update the computed checksum
            const r = try self.inflater.read(buffer);
            if (r != 0) {
                self.hasher.update(buffer[0..r]);
                self.read_amt +%= @truncate(r);
                return r;
            }

            try self.inflater.close();

            // We've reached the end of stream, check if the checksum matches
            const hash = try self.in_reader.readInt(u32, .little);
            if (hash != self.hasher.final())
                return error.WrongChecksum;

            // The ISIZE field is the size of the uncompressed input modulo 2^32
            const input_size = try self.in_reader.readInt(u32, .little);
            if (self.read_amt != input_size)
                return error.CorruptedData;

            self.trailer_verified = true;
            return 0;
        }

        /// Returns an `io.Reader` that reads decompressed data through `read`.
        pub fn reader(self: *Self) Reader {
            return .{ .context = self };
        }
    };
}

/// Creates a gzip decompressor that reads compressed data from `reader`.
///
/// The gzip header is parsed right away, so header errors are reported here.
/// Optional header fields are allocated with `allocator`; call `deinit` on the
/// returned value to release them.
pub fn decompress(allocator: mem.Allocator, reader: anytype) !Decompress(@TypeOf(reader)) {
    const Decompressor = Decompress(@TypeOf(reader));
    return Decompressor.init(allocator, reader);
}

/// Settings accepted by `compress`.
pub const CompressOptions = struct {
    /// Compression level forwarded to the DEFLATE compressor.
    level: deflate.Compression = .default_compression,
    /// When true, the FHCRC flag is set and a CRC16 of the header is written
    /// after the optional header fields.
    hash_header: bool = true,
    /// Header fields to emit at the start of the stream.
    header: Header = .{},
};

/// Returns a streaming compressor type that writes gzip data (RFC 1952) to a
/// writer of type `WriterType`.
///
/// `init` emits the member header, `write` compresses payload bytes, and
/// `close` finishes the DEFLATE stream and appends the CRC32/ISIZE trailer.
pub fn Compress(comptime WriterType: type) type {
    return struct {
        const Self = @This();

        pub const Error = WriterType.Error ||
            deflate.Compressor(WriterType).Error;
        pub const Writer = io.Writer(*Self, Error, write);

        allocator: mem.Allocator,
        /// DEFLATE compressor writing to `out_writer`.
        deflater: deflate.Compressor(WriterType),
        /// Destination of the compressed stream; also receives the trailer.
        out_writer: WriterType,
        /// CRC32 of all uncompressed bytes accepted by `write` so far.
        hasher: std.hash.Crc32,
        /// Number of uncompressed bytes accepted by `write`, modulo 2^32.
        write_amt: u32,

        /// Writes the gzip header described by `options` to `out_writer` and
        /// sets up the deflater.
        ///
        /// Fails with `error.ExtraFieldTooLong` when `options.header.extra`
        /// does not fit the 16-bit length field; in that case nothing has
        /// been written. Writer and allocator errors are propagated.
        ///
        /// `filename` and `comment` are written zero-terminated, so they must
        /// not contain a zero byte themselves.
        fn init(allocator: mem.Allocator, out_writer: WriterType, options: CompressOptions) !Self {
            // The extra field length is stored in 16 bits. Reject an
            // oversized field up front so the cast below can never be out of
            // range and no partial header is left behind.
            if (options.header.extra) |extra| {
                if (extra.len > std.math.maxInt(u16))
                    return error.ExtraFieldTooLong;
            }

            // Everything written through `hashed_writer` contributes to the
            // optional header CRC.
            var hasher = std.compress.hashedWriter(out_writer, std.hash.Crc32.init());
            const hashed_writer = hasher.writer();

            // ID1/ID2
            try hashed_writer.writeAll(magic);
            // CM
            try hashed_writer.writeByte(8);
            // Flags
            try hashed_writer.writeByte(
                @as(u8, if (options.hash_header) FHCRC else 0) |
                    @as(u8, if (options.header.extra) |_| FEXTRA else 0) |
                    @as(u8, if (options.header.filename) |_| FNAME else 0) |
                    @as(u8, if (options.header.comment) |_| FCOMMENT else 0),
            );
            // Modification time
            try hashed_writer.writeInt(u32, options.header.modification_time, .little);
            // Extra flags
            try hashed_writer.writeByte(0);
            // Operating system
            try hashed_writer.writeByte(options.header.operating_system);

            if (options.header.extra) |extra| {
                // In range: checked at the top of this function.
                try hashed_writer.writeInt(u16, @intCast(extra.len), .little);
                try hashed_writer.writeAll(extra);
            }

            if (options.header.filename) |filename| {
                try hashed_writer.writeAll(filename);
                try hashed_writer.writeByte(0);
            }

            if (options.header.comment) |comment| {
                try hashed_writer.writeAll(comment);
                try hashed_writer.writeByte(0);
            }

            if (options.hash_header) {
                // The CRC16 is the low 16 bits of the header CRC32. It is
                // written straight to `out_writer`, bypassing the hasher.
                try out_writer.writeInt(
                    u16,
                    @truncate(hasher.hasher.final()),
                    .little,
                );
            }

            return .{
                .allocator = allocator,
                .deflater = try deflate.compressor(allocator, out_writer, .{ .level = options.level }),
                .out_writer = out_writer,
                // Fresh CRC32: the trailer checksum covers the payload only.
                .hasher = std.hash.Crc32.init(),
                .write_amt = 0,
            };
        }

        /// Releases the deflater. Does not finish the stream; see `close`.
        pub fn deinit(self: *Self) void {
            self.deflater.deinit();
        }

        /// Implements the io.Writer interface
        ///
        /// Returns the number of bytes of `buffer` accepted by the deflater;
        /// only those bytes are added to the checksum and size counter.
        pub fn write(self: *Self, buffer: []const u8) Error!usize {
            if (buffer.len == 0)
                return 0;

            // Write to the compressed stream and update the computed checksum
            const r = try self.deflater.write(buffer);
            self.hasher.update(buffer[0..r]);
            self.write_amt +%= @truncate(r);
            return r;
        }

        /// Returns an `io.Writer` that compresses data through `write`.
        pub fn writer(self: *Self) Writer {
            return .{ .context = self };
        }

        /// Flushes the underlying deflater.
        pub fn flush(self: *Self) Error!void {
            try self.deflater.flush();
        }

        /// Closes the deflater, then writes the gzip trailer: the CRC32 of the
        /// uncompressed data followed by its size modulo 2^32.
        pub fn close(self: *Self) Error!void {
            try self.deflater.close();
            try self.out_writer.writeInt(u32, self.hasher.final(), .little);
            try self.out_writer.writeInt(u32, self.write_amt, .little);
        }
    };
}

/// Creates a gzip compressor that writes compressed data to `writer`.
///
/// The gzip header described by `options` is written right away. Call `close`
/// on the returned value to finish the stream and `deinit` to release it.
pub fn compress(allocator: mem.Allocator, writer: anytype, options: CompressOptions) !Compress(@TypeOf(writer)) {
    const Compressor = Compress(@TypeOf(writer));
    return Compressor.init(allocator, writer, options);
}

/// Test helper: decompresses the gzip stream in `data` and checks that the
/// result equals `expected`. Errors from decompression are propagated.
fn testReader(expected: []const u8, data: []const u8) !void {
    var source = io.fixedBufferStream(data);

    var decompressor = try decompress(testing.allocator, source.reader());
    defer decompressor.deinit();

    // Drain the whole decompressed stream into memory.
    const stream = decompressor.reader();
    const decompressed = try stream.readAllAlloc(testing.allocator, std.math.maxInt(usize));
    defer testing.allocator.free(decompressed);

    // Compare with the reference output.
    try testing.expectEqualSlices(u8, expected, decompressed);
}

/// Test helper: compresses `data` with `options` into an in-memory buffer and
/// checks that the produced gzip stream equals `expected` byte for byte.
fn testWriter(expected: []const u8, data: []const u8, options: CompressOptions) !void {
    var output = std.ArrayList(u8).init(testing.allocator);
    defer output.deinit();

    var compressor = try compress(testing.allocator, output.writer(), options);
    defer compressor.deinit();

    // Feed the whole input, then finish the stream so the trailer is written.
    const sink = compressor.writer();
    try sink.writeAll(data);
    try compressor.close();

    // Compare with the reference output.
    try testing.expectEqualSlices(u8, expected, output.items);
}

// All the test cases are obtained by compressing the RFC1952 text
//
// https://tools.ietf.org/rfc/rfc1952.txt length=25037 bytes
// SHA256=164ef0897b4cbec63abf1b57f069f3599bd0fb7c72c2a4dee21bd7e03ec9af67
test "compressed data" {
    const plain = @embedFile("testdata/rfc1952.txt");
    const compressed = @embedFile("testdata/rfc1952.txt.gz");

    // Header values the compressor must emit to reproduce the reference
    // file byte for byte.
    const options: CompressOptions = .{
        .header = .{
            .filename = "rfc1952.txt",
            .modification_time = 1706533053,
            .operating_system = 3,
        },
    };

    try testReader(plain, compressed);
    try testWriter(compressed, plain, options);
}

test "sanity checks" {
    // Each case is a malformed gzip stream and the error decoding it must give.
    const Case = struct {
        expected: anyerror,
        data: []const u8,
    };

    const cases = [_]Case{
        // Truncated header
        .{
            .expected = error.EndOfStream,
            .data = &[_]u8{ 0x1f, 0x8B },
        },
        // Wrong CM
        .{
            .expected = error.InvalidCompression,
            .data = &[_]u8{
                0x1f, 0x8b, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x03,
            },
        },
        // Wrong checksum
        .{
            .expected = error.WrongChecksum,
            .data = &[_]u8{
                0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01,
                0x00, 0x00, 0x00, 0x00,
            },
        },
        // Truncated checksum
        .{
            .expected = error.EndOfStream,
            .data = &[_]u8{
                0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00,
            },
        },
        // Wrong input size (ISIZE) field
        .{
            .expected = error.CorruptedData,
            .data = &[_]u8{
                0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x01,
            },
        },
        // Truncated input size (ISIZE) field
        .{
            .expected = error.EndOfStream,
            .data = &[_]u8{
                0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00,
            },
        },
    };

    for (cases) |case| {
        // The expected output is never compared because decoding must fail.
        try testing.expectError(case.expected, testReader(undefined, case.data));
    }
}

test "header checksum" {
    // A stream with FHCRC and FNAME set whose payload decompresses to nothing.
    const stream = [_]u8{
        // GZIP header
        0x1f, 0x8b, 0x08, 0x12, 0x00, 0x09, 0x6e, 0x88, 0x00, 0xff, 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x00,

        // header.FHCRC (should cover entire header)
        0x99, 0xd6,

        // GZIP data
        0x01, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    };

    try testReader("", &stream);
}