gzip: implement compression

This commit is contained in:
Jacob Young 2024-01-29 14:12:19 +01:00 committed by Andrew Kelley
parent 27d2d8e81f
commit 4dfca01de4
8 changed files with 221 additions and 56 deletions

View File

@ -21,7 +21,7 @@ pub fn HashedReader(
pub fn read(self: *@This(), buf: []u8) Error!usize { pub fn read(self: *@This(), buf: []u8) Error!usize {
const amt = try self.child_reader.read(buf); const amt = try self.child_reader.read(buf);
self.hasher.update(buf); self.hasher.update(buf[0..amt]);
return amt; return amt;
} }
@ -38,6 +38,36 @@ pub fn hashedReader(
return .{ .child_reader = reader, .hasher = hasher }; return .{ .child_reader = reader, .hasher = hasher };
} }
/// Returns a writer type that wraps `WriterType`, feeding every byte that is
/// successfully written through `HasherType.update` before reporting it to
/// the caller. Mirror of the hashed-reader wrapper above.
pub fn HashedWriter(
    comptime WriterType: type,
    comptime HasherType: type,
) type {
    return struct {
        child_writer: WriterType,
        hasher: HasherType,

        pub const Error = WriterType.Error;
        pub const Writer = std.io.Writer(*@This(), Error, write);

        /// Writes `buf` to the child writer and updates the hash with only
        /// the bytes the child actually accepted (`buf[0..amt]`), so short
        /// writes do not corrupt the running hash.
        pub fn write(self: *@This(), buf: []const u8) Error!usize {
            const amt = try self.child_writer.write(buf);
            self.hasher.update(buf[0..amt]);
            return amt;
        }

        pub fn writer(self: *@This()) Writer {
            return .{ .context = self };
        }
    };
}
/// Convenience constructor: wraps `writer` so that every byte successfully
/// written is also fed to `hasher`. See `HashedWriter`.
pub fn hashedWriter(
    writer: anytype,
    hasher: anytype,
) HashedWriter(@TypeOf(writer), @TypeOf(hasher)) {
    const Result = HashedWriter(@TypeOf(writer), @TypeOf(hasher));
    return Result{ .child_writer = writer, .hasher = hasher };
}
test { test {
_ = deflate; _ = deflate;
_ = gzip; _ = gzip;

View File

@ -733,7 +733,7 @@ pub fn Compressor(comptime WriterType: anytype) type {
} }
/// Writes the compressed form of `input` to the underlying writer. /// Writes the compressed form of `input` to the underlying writer.
pub fn write(self: *Self, input: []const u8) !usize { pub fn write(self: *Self, input: []const u8) Error!usize {
var buf = input; var buf = input;
// writes data to hm_bw, which will eventually write the // writes data to hm_bw, which will eventually write the
@ -756,7 +756,7 @@ pub fn Compressor(comptime WriterType: anytype) type {
/// If the underlying writer returns an error, `flush()` returns that error. /// If the underlying writer returns an error, `flush()` returns that error.
/// ///
/// In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH. /// In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH.
pub fn flush(self: *Self) !void { pub fn flush(self: *Self) Error!void {
self.sync = true; self.sync = true;
try self.step(); try self.step();
try self.hm_bw.writeStoredHeader(0, false); try self.hm_bw.writeStoredHeader(0, false);
@ -956,7 +956,7 @@ pub fn Compressor(comptime WriterType: anytype) type {
} }
/// Writes any pending data to the underlying writer. /// Writes any pending data to the underlying writer.
pub fn close(self: *Self) !void { pub fn close(self: *Self) Error!void {
self.sync = true; self.sync = true;
try self.step(); try self.step();
try self.hm_bw.writeStoredHeader(0, true); try self.hm_bw.writeStoredHeader(0, true);

View File

@ -86,7 +86,7 @@ fn testSync(level: deflate.Compression, input: []const u8) !void {
read = try decomp.reader().readAll(&final); read = try decomp.reader().readAll(&final);
try testing.expectEqual(@as(usize, 0), read); // expect ended stream to return 0 bytes try testing.expectEqual(@as(usize, 0), read); // expect ended stream to return 0 bytes
_ = decomp.close(); try decomp.close();
} }
} }
@ -102,7 +102,7 @@ fn testSync(level: deflate.Compression, input: []const u8) !void {
defer testing.allocator.free(decompressed); defer testing.allocator.free(decompressed);
_ = try decomp.reader().readAll(decompressed); _ = try decomp.reader().readAll(decompressed);
_ = decomp.close(); try decomp.close();
try testing.expectEqualSlices(u8, input, decompressed); try testing.expectEqualSlices(u8, input, decompressed);
} }
@ -477,7 +477,7 @@ test "inflate reset" {
.readAllAlloc(testing.allocator, math.maxInt(usize)); .readAllAlloc(testing.allocator, math.maxInt(usize));
defer testing.allocator.free(decompressed_1); defer testing.allocator.free(decompressed_1);
_ = decomp.close(); try decomp.close();
try testing.expectEqualSlices(u8, strings[0], decompressed_0); try testing.expectEqualSlices(u8, strings[0], decompressed_0);
try testing.expectEqualSlices(u8, strings[1], decompressed_1); try testing.expectEqualSlices(u8, strings[1], decompressed_1);
@ -524,7 +524,7 @@ test "inflate reset dictionary" {
.readAllAlloc(testing.allocator, math.maxInt(usize)); .readAllAlloc(testing.allocator, math.maxInt(usize));
defer testing.allocator.free(decompressed_1); defer testing.allocator.free(decompressed_1);
_ = decomp.close(); try decomp.close();
try testing.expectEqualSlices(u8, strings[0], decompressed_0); try testing.expectEqualSlices(u8, strings[0], decompressed_0);
try testing.expectEqualSlices(u8, strings[1], decompressed_1); try testing.expectEqualSlices(u8, strings[1], decompressed_1);

View File

@ -477,11 +477,10 @@ pub fn Decompressor(comptime ReaderType: type) type {
} }
} }
pub fn close(self: *Self) ?Error { pub fn close(self: *Self) Error!void {
if (self.err == @as(?Error, error.EndOfStreamWithNoError)) { if (self.err) |err| {
return null; if (err != error.EndOfStreamWithNoError) return err;
} }
return self.err;
} }
// RFC 1951 section 3.2.7. // RFC 1951 section 3.2.7.
@ -880,7 +879,7 @@ pub fn Decompressor(comptime ReaderType: type) type {
/// Replaces the inner reader and dictionary with new_reader and new_dict. /// Replaces the inner reader and dictionary with new_reader and new_dict.
/// new_reader must be of the same type as the reader being replaced. /// new_reader must be of the same type as the reader being replaced.
pub fn reset(s: *Self, new_reader: ReaderType, new_dict: ?[]const u8) !void { pub fn reset(s: *Self, new_reader: ReaderType, new_dict: ?[]const u8) Error!void {
s.inner_reader = new_reader; s.inner_reader = new_reader;
s.step = nextBlock; s.step = nextBlock;
s.err = null; s.err = null;
@ -920,9 +919,7 @@ test "confirm decompressor resets" {
const buf = try decomp.reader().readAllAlloc(std.testing.allocator, 1024 * 100); const buf = try decomp.reader().readAllAlloc(std.testing.allocator, 1024 * 100);
defer std.testing.allocator.free(buf); defer std.testing.allocator.free(buf);
if (decomp.close()) |err| { try decomp.close();
return err;
}
try decomp.reset(stream.reader(), null); try decomp.reset(stream.reader(), null);
} }

View File

@ -83,7 +83,7 @@ test "best speed" {
defer decomp.deinit(); defer decomp.deinit();
const read = try decomp.reader().readAll(decompressed); const read = try decomp.reader().readAll(decompressed);
_ = decomp.close(); try decomp.close();
try testing.expectEqual(want.items.len, read); try testing.expectEqual(want.items.len, read);
try testing.expectEqualSlices(u8, want.items, decompressed); try testing.expectEqualSlices(u8, want.items, decompressed);
@ -150,7 +150,7 @@ test "best speed max match offset" {
var decomp = try inflate.decompressor(testing.allocator, fib.reader(), null); var decomp = try inflate.decompressor(testing.allocator, fib.reader(), null);
defer decomp.deinit(); defer decomp.deinit();
const read = try decomp.reader().readAll(decompressed); const read = try decomp.reader().readAll(decompressed);
_ = decomp.close(); try decomp.close();
try testing.expectEqual(src.len, read); try testing.expectEqual(src.len, read);
try testing.expectEqualSlices(u8, src, decompressed); try testing.expectEqualSlices(u8, src, decompressed);

View File

@ -124,7 +124,8 @@ pub fn HuffmanBitWriter(comptime WriterType: type) type {
if (self.err) { if (self.err) {
return; return;
} }
self.bytes_written += try self.inner_writer.write(b); try self.inner_writer.writeAll(b);
self.bytes_written += b.len;
} }
fn writeBits(self: *Self, b: u32, nb: u32) Error!void { fn writeBits(self: *Self, b: u32, nb: u32) Error!void {

View File

@ -1,5 +1,5 @@
// //
// Decompressor for GZIP data streams (RFC1952) // Compressor/Decompressor for GZIP data streams (RFC1952)
const std = @import("../std.zig"); const std = @import("../std.zig");
const io = std.io; const io = std.io;
@ -8,6 +8,8 @@ const testing = std.testing;
const mem = std.mem; const mem = std.mem;
const deflate = std.compress.deflate; const deflate = std.compress.deflate;
const magic = &[2]u8{ 0x1f, 0x8b };
// Flags for the FLG field in the header // Flags for the FLG field in the header
const FTEXT = 1 << 0; const FTEXT = 1 << 0;
const FHCRC = 1 << 1; const FHCRC = 1 << 1;
@ -17,6 +19,14 @@ const FCOMMENT = 1 << 4;
const max_string_len = 1024; const max_string_len = 1024;
/// Optional gzip member-header fields (RFC 1952), supplied by the caller
/// when compressing and filled in when decompressing.
pub const Header = struct {
    /// Contents of the FEXTRA field, if present.
    extra: ?[]const u8 = null,
    /// Original file name (FNAME field), if present; RFC 1952 requires it
    /// to contain no NUL bytes — not validated here, caller beware.
    filename: ?[]const u8 = null,
    /// Free-form comment (FCOMMENT field), if present; same NUL caveat.
    comment: ?[]const u8 = null,
    /// MTIME field; 0 means no timestamp is available.
    modification_time: u32 = 0,
    /// OS field; 255 means "unknown" per RFC 1952.
    operating_system: u8 = 255,
};
pub fn Decompress(comptime ReaderType: type) type { pub fn Decompress(comptime ReaderType: type) type {
return struct { return struct {
const Self = @This(); const Self = @This();
@ -30,25 +40,19 @@ pub fn Decompress(comptime ReaderType: type) type {
inflater: deflate.Decompressor(ReaderType), inflater: deflate.Decompressor(ReaderType),
in_reader: ReaderType, in_reader: ReaderType,
hasher: std.hash.Crc32, hasher: std.hash.Crc32,
read_amt: usize, read_amt: u32,
info: struct { info: Header,
extra: ?[]const u8,
filename: ?[]const u8,
comment: ?[]const u8,
modification_time: u32,
operating_system: u8,
},
fn init(allocator: mem.Allocator, source: ReaderType) !Self { fn init(allocator: mem.Allocator, in_reader: ReaderType) !Self {
var hasher = std.compress.hashedReader(source, std.hash.Crc32.init()); var hasher = std.compress.hashedReader(in_reader, std.hash.Crc32.init());
const hashed_reader = hasher.reader(); const hashed_reader = hasher.reader();
// gzip header format is specified in RFC1952 // gzip header format is specified in RFC1952
const header = try hashed_reader.readBytesNoEof(10); const header = try hashed_reader.readBytesNoEof(10);
// Check the ID1/ID2 fields // Check the ID1/ID2 fields
if (header[0] != 0x1f or header[1] != 0x8b) if (!std.mem.eql(u8, header[0..2], magic))
return error.BadHeader; return error.BadHeader;
const CM = header[2]; const CM = header[2];
@ -88,15 +92,15 @@ pub fn Decompress(comptime ReaderType: type) type {
errdefer if (comment) |p| allocator.free(p); errdefer if (comment) |p| allocator.free(p);
if (FLG & FHCRC != 0) { if (FLG & FHCRC != 0) {
const hash = try source.readInt(u16, .little); const hash = try in_reader.readInt(u16, .little);
if (hash != @as(u16, @truncate(hasher.hasher.final()))) if (hash != @as(u16, @truncate(hasher.hasher.final())))
return error.WrongChecksum; return error.WrongChecksum;
} }
return Self{ return .{
.allocator = allocator, .allocator = allocator,
.inflater = try deflate.decompressor(allocator, source, null), .inflater = try deflate.decompressor(allocator, in_reader, null),
.in_reader = source, .in_reader = in_reader,
.hasher = std.hash.Crc32.init(), .hasher = std.hash.Crc32.init(),
.info = .{ .info = .{
.filename = filename, .filename = filename,
@ -119,7 +123,7 @@ pub fn Decompress(comptime ReaderType: type) type {
self.allocator.free(comment); self.allocator.free(comment);
} }
// Implements the io.Reader interface /// Implements the io.Reader interface
pub fn read(self: *Self, buffer: []u8) Error!usize { pub fn read(self: *Self, buffer: []u8) Error!usize {
if (buffer.len == 0) if (buffer.len == 0)
return 0; return 0;
@ -128,10 +132,12 @@ pub fn Decompress(comptime ReaderType: type) type {
const r = try self.inflater.read(buffer); const r = try self.inflater.read(buffer);
if (r != 0) { if (r != 0) {
self.hasher.update(buffer[0..r]); self.hasher.update(buffer[0..r]);
self.read_amt += r; self.read_amt +%= @truncate(r);
return r; return r;
} }
try self.inflater.close();
// We've reached the end of stream, check if the checksum matches // We've reached the end of stream, check if the checksum matches
const hash = try self.in_reader.readInt(u32, .little); const hash = try self.in_reader.readInt(u32, .little);
if (hash != self.hasher.final()) if (hash != self.hasher.final())
@ -139,7 +145,7 @@ pub fn Decompress(comptime ReaderType: type) type {
// The ISIZE field is the size of the uncompressed input modulo 2^32 // The ISIZE field is the size of the uncompressed input modulo 2^32
const input_size = try self.in_reader.readInt(u32, .little); const input_size = try self.in_reader.readInt(u32, .little);
if (self.read_amt & 0xffffffff != input_size) if (self.read_amt != input_size)
return error.CorruptedData; return error.CorruptedData;
return 0; return 0;
@ -155,7 +161,117 @@ pub fn decompress(allocator: mem.Allocator, reader: anytype) !Decompress(@TypeOf
return Decompress(@TypeOf(reader)).init(allocator, reader); return Decompress(@TypeOf(reader)).init(allocator, reader);
} }
fn testReader(data: []const u8, comptime expected: []const u8) !void { pub const CompressOptions = struct {
header: Header = .{},
hash_header: bool = true,
level: deflate.Compression = .default_compression,
};
/// Returns a gzip (RFC 1952) compression stream wrapping `WriterType`.
/// `init` writes the member header immediately; `close` finishes the deflate
/// stream and appends the CRC32/ISIZE trailer.
pub fn Compress(comptime WriterType: type) type {
    return struct {
        const Self = @This();

        pub const Error = WriterType.Error ||
            deflate.Compressor(WriterType).Error;
        pub const Writer = io.Writer(*Self, Error, write);

        allocator: mem.Allocator,
        deflater: deflate.Compressor(WriterType),
        out_writer: WriterType,
        /// Running CRC32 of the *uncompressed* payload, for the trailer.
        hasher: std.hash.Crc32,
        /// Uncompressed byte count modulo 2^32 (ISIZE field).
        write_amt: u32,

        /// Writes the full gzip member header to `out_writer`, then sets up
        /// the deflate compressor. Errors from the underlying writer and
        /// from compressor allocation are propagated.
        fn init(allocator: mem.Allocator, out_writer: WriterType, options: CompressOptions) !Self {
            // Hash the header bytes as they are written so the optional
            // FHCRC field can be computed without buffering.
            var hasher = std.compress.hashedWriter(out_writer, std.hash.Crc32.init());
            const hashed_writer = hasher.writer();

            // ID1/ID2
            try hashed_writer.writeAll(magic);
            // CM: 8 = deflate, the only method defined by RFC 1952.
            try hashed_writer.writeByte(8);
            // Flags
            try hashed_writer.writeByte(
                @as(u8, if (options.hash_header) FHCRC else 0) |
                    @as(u8, if (options.header.extra) |_| FEXTRA else 0) |
                    @as(u8, if (options.header.filename) |_| FNAME else 0) |
                    @as(u8, if (options.header.comment) |_| FCOMMENT else 0),
            );
            // Modification time
            try hashed_writer.writeInt(u32, options.header.modification_time, .little);
            // Extra flags
            try hashed_writer.writeByte(0);
            // Operating system
            try hashed_writer.writeByte(options.header.operating_system);
            if (options.header.extra) |extra| {
                // XLEN is a 16-bit field; report an error instead of
                // tripping the @intCast safety check on oversized input.
                if (extra.len > std.math.maxInt(u16)) return error.ExtraFieldTooLong;
                try hashed_writer.writeInt(u16, @intCast(extra.len), .little);
                try hashed_writer.writeAll(extra);
            }
            if (options.header.filename) |filename| {
                try hashed_writer.writeAll(filename);
                try hashed_writer.writeByte(0);
            }
            if (options.header.comment) |comment| {
                try hashed_writer.writeAll(comment);
                try hashed_writer.writeByte(0);
            }
            if (options.hash_header) {
                // FHCRC is the low 16 bits of the CRC32 of the header bytes
                // so far; it is written unhashed (it does not hash itself).
                try out_writer.writeInt(
                    u16,
                    @truncate(hasher.hasher.final()),
                    .little,
                );
            }

            return .{
                .allocator = allocator,
                .deflater = try deflate.compressor(allocator, out_writer, .{ .level = options.level }),
                .out_writer = out_writer,
                .hasher = std.hash.Crc32.init(),
                .write_amt = 0,
            };
        }

        /// Releases compressor state. Does not write the trailer; call
        /// `close` first if a complete gzip member is wanted.
        pub fn deinit(self: *Self) void {
            self.deflater.deinit();
        }

        /// Implements the io.Writer interface: compresses `buffer` and
        /// returns the number of input bytes consumed.
        pub fn write(self: *Self, buffer: []const u8) Error!usize {
            if (buffer.len == 0)
                return 0;
            // Write to the compressed stream and update the computed checksum
            const r = try self.deflater.write(buffer);
            self.hasher.update(buffer[0..r]);
            // ISIZE is defined modulo 2^32, hence the wrapping add.
            self.write_amt +%= @truncate(r);
            return r;
        }

        pub fn writer(self: *Self) Writer {
            return .{ .context = self };
        }

        /// Flushes any pending compressed data to the underlying writer.
        pub fn flush(self: *Self) Error!void {
            try self.deflater.flush();
        }

        /// Finishes the deflate stream and writes the gzip trailer:
        /// CRC32 of the uncompressed data, then ISIZE (length mod 2^32).
        pub fn close(self: *Self) Error!void {
            try self.deflater.close();
            try self.out_writer.writeInt(u32, self.hasher.final(), .little);
            try self.out_writer.writeInt(u32, self.write_amt, .little);
        }
    };
}
/// Creates a gzip compression stream around `writer`; the member header is
/// written immediately. Call `close` to emit the trailer when finished.
pub fn compress(allocator: mem.Allocator, writer: anytype, options: CompressOptions) !Compress(@TypeOf(writer)) {
    const Gzip = Compress(@TypeOf(writer));
    return Gzip.init(allocator, writer, options);
}
fn testReader(expected: []const u8, data: []const u8) !void {
var in_stream = io.fixedBufferStream(data); var in_stream = io.fixedBufferStream(data);
var gzip_stream = try decompress(testing.allocator, in_stream.reader()); var gzip_stream = try decompress(testing.allocator, in_stream.reader());
@ -169,70 +285,91 @@ fn testReader(data: []const u8, comptime expected: []const u8) !void {
try testing.expectEqualSlices(u8, expected, buf); try testing.expectEqualSlices(u8, expected, buf);
} }
/// Compresses `data` with `options` and verifies the resulting gzip stream
/// matches `expected` byte-for-byte.
fn testWriter(expected: []const u8, data: []const u8, options: CompressOptions) !void {
    var output = std.ArrayList(u8).init(testing.allocator);
    defer output.deinit();

    var comp = try compress(testing.allocator, output.writer(), options);
    defer comp.deinit();

    // Feed the entire payload through the compressor, then finalize the
    // stream so the trailer (CRC32 + ISIZE) is emitted.
    try comp.writer().writeAll(data);
    try comp.close();

    // The produced bytes must match the reference compressed stream exactly.
    try testing.expectEqualSlices(u8, expected, output.items);
}
// All the test cases are obtained by compressing the RFC1952 text // All the test cases are obtained by compressing the RFC1952 text
// //
// https://tools.ietf.org/rfc/rfc1952.txt length=25037 bytes // https://tools.ietf.org/rfc/rfc1952.txt length=25037 bytes
// SHA256=164ef0897b4cbec63abf1b57f069f3599bd0fb7c72c2a4dee21bd7e03ec9af67 // SHA256=164ef0897b4cbec63abf1b57f069f3599bd0fb7c72c2a4dee21bd7e03ec9af67
test "compressed data" { test "compressed data" {
try testReader( const plain = @embedFile("testdata/rfc1952.txt");
@embedFile("testdata/rfc1952.txt.gz"), const compressed = @embedFile("testdata/rfc1952.txt.gz");
@embedFile("testdata/rfc1952.txt"), try testReader(plain, compressed);
); try testWriter(compressed, plain, .{
.header = .{
.filename = "rfc1952.txt",
.modification_time = 1706533053,
.operating_system = 3,
},
});
} }
test "sanity checks" { test "sanity checks" {
// Truncated header // Truncated header
try testing.expectError( try testing.expectError(
error.EndOfStream, error.EndOfStream,
testReader(&[_]u8{ 0x1f, 0x8B }, ""), testReader(undefined, &[_]u8{ 0x1f, 0x8B }),
); );
// Wrong CM // Wrong CM
try testing.expectError( try testing.expectError(
error.InvalidCompression, error.InvalidCompression,
testReader(&[_]u8{ testReader(undefined, &[_]u8{
0x1f, 0x8b, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x8b, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x03, 0x00, 0x03,
}, ""), }),
); );
// Wrong checksum // Wrong checksum
try testing.expectError( try testing.expectError(
error.WrongChecksum, error.WrongChecksum,
testReader(&[_]u8{ testReader(undefined, &[_]u8{
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
}, ""), }),
); );
// Truncated checksum // Truncated checksum
try testing.expectError( try testing.expectError(
error.EndOfStream, error.EndOfStream,
testReader(&[_]u8{ testReader(undefined, &[_]u8{
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00,
}, ""), }),
); );
// Wrong initial size // Wrong initial size
try testing.expectError( try testing.expectError(
error.CorruptedData, error.CorruptedData,
testReader(&[_]u8{ testReader(undefined, &[_]u8{
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
}, ""), }),
); );
// Truncated initial size field // Truncated initial size field
try testing.expectError( try testing.expectError(
error.EndOfStream, error.EndOfStream,
testReader(&[_]u8{ testReader(undefined, &[_]u8{
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
}, ""), }),
); );
} }
test "header checksum" { test "header checksum" {
try testReader(&[_]u8{ try testReader("", &[_]u8{
// GZIP header // GZIP header
0x1f, 0x8b, 0x08, 0x12, 0x00, 0x09, 0x6e, 0x88, 0x00, 0xff, 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x00, 0x1f, 0x8b, 0x08, 0x12, 0x00, 0x09, 0x6e, 0x88, 0x00, 0xff, 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x00,
@ -241,5 +378,5 @@ test "header checksum" {
// GZIP data // GZIP data
0x01, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
}, ""); });
} }

Binary file not shown.