mirror of
https://github.com/ziglang/zig.git
synced 2025-12-06 14:23:09 +00:00
remove v1 deflate implementation
This commit is contained in:
parent
e20080be13
commit
2457b68b2f
@ -150,8 +150,7 @@ pub fn build(b: *std.Build) !void {
|
|||||||
"rfc1951.txt",
|
"rfc1951.txt",
|
||||||
"rfc1952.txt",
|
"rfc1952.txt",
|
||||||
"rfc8478.txt",
|
"rfc8478.txt",
|
||||||
// exclude files from lib/std/compress/deflate/testdata
|
// exclude files from lib/std/compress/flate/testdata
|
||||||
// and lib/std/compress/flate/testdata
|
|
||||||
".expect",
|
".expect",
|
||||||
".expect-noinput",
|
".expect-noinput",
|
||||||
".golden",
|
".golden",
|
||||||
|
|||||||
@ -9,13 +9,6 @@ pub const flate = @import("compress/flate/root.zig").flate;
|
|||||||
pub const gzip = @import("compress/flate/root.zig").gzip;
|
pub const gzip = @import("compress/flate/root.zig").gzip;
|
||||||
pub const zlib = @import("compress/flate/root.zig").zlib;
|
pub const zlib = @import("compress/flate/root.zig").zlib;
|
||||||
|
|
||||||
// Version 1 interface
|
|
||||||
pub const v1 = struct {
|
|
||||||
pub const deflate = @import("compress/deflate.zig");
|
|
||||||
pub const gzip = @import("compress/gzip.zig");
|
|
||||||
pub const zlib = @import("compress/zlib.zig");
|
|
||||||
};
|
|
||||||
|
|
||||||
pub fn HashedReader(
|
pub fn HashedReader(
|
||||||
comptime ReaderType: anytype,
|
comptime ReaderType: anytype,
|
||||||
comptime HasherType: anytype,
|
comptime HasherType: anytype,
|
||||||
@ -77,12 +70,9 @@ pub fn hashedWriter(
|
|||||||
}
|
}
|
||||||
|
|
||||||
test {
|
test {
|
||||||
_ = v1.deflate;
|
|
||||||
_ = v1.gzip;
|
|
||||||
_ = lzma;
|
_ = lzma;
|
||||||
_ = lzma2;
|
_ = lzma2;
|
||||||
_ = xz;
|
_ = xz;
|
||||||
_ = v1.zlib;
|
|
||||||
_ = zstd;
|
_ = zstd;
|
||||||
_ = flate;
|
_ = flate;
|
||||||
_ = gzip;
|
_ = gzip;
|
||||||
|
|||||||
@ -1,44 +0,0 @@
|
|||||||
//! The deflate package is a translation of the Go code of the compress/flate package from
|
|
||||||
//! https://go.googlesource.com/go/+/refs/tags/go1.17/src/compress/flate/
|
|
||||||
|
|
||||||
const deflate = @import("deflate/compressor.zig");
|
|
||||||
const inflate = @import("deflate/decompressor.zig");
|
|
||||||
|
|
||||||
pub const Compression = deflate.Compression;
|
|
||||||
pub const CompressorOptions = deflate.CompressorOptions;
|
|
||||||
pub const Compressor = deflate.Compressor;
|
|
||||||
pub const Decompressor = inflate.Decompressor;
|
|
||||||
|
|
||||||
pub const compressor = deflate.compressor;
|
|
||||||
pub const decompressor = inflate.decompressor;
|
|
||||||
|
|
||||||
/// Copies elements from a source `src` slice into a destination `dst` slice.
|
|
||||||
/// The copy never returns an error but might not be complete if the destination is too small.
|
|
||||||
/// Returns the number of elements copied, which will be the minimum of `src.len` and `dst.len`.
|
|
||||||
/// TODO: remove this smelly function
|
|
||||||
pub fn copy(dst: []u8, src: []const u8) usize {
|
|
||||||
if (dst.len <= src.len) {
|
|
||||||
@memcpy(dst, src[0..dst.len]);
|
|
||||||
return dst.len;
|
|
||||||
} else {
|
|
||||||
@memcpy(dst[0..src.len], src);
|
|
||||||
return src.len;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
test {
|
|
||||||
_ = @import("deflate/token.zig");
|
|
||||||
_ = @import("deflate/bits_utils.zig");
|
|
||||||
_ = @import("deflate/dict_decoder.zig");
|
|
||||||
|
|
||||||
_ = @import("deflate/huffman_code.zig");
|
|
||||||
_ = @import("deflate/huffman_bit_writer.zig");
|
|
||||||
|
|
||||||
_ = @import("deflate/compressor.zig");
|
|
||||||
_ = @import("deflate/compressor_test.zig");
|
|
||||||
|
|
||||||
_ = @import("deflate/deflate_fast.zig");
|
|
||||||
_ = @import("deflate/deflate_fast_test.zig");
|
|
||||||
|
|
||||||
_ = @import("deflate/decompressor.zig");
|
|
||||||
}
|
|
||||||
@ -1,33 +0,0 @@
|
|||||||
const math = @import("std").math;
|
|
||||||
|
|
||||||
// Reverse bit-by-bit a N-bit code.
|
|
||||||
pub fn bitReverse(comptime T: type, value: T, N: usize) T {
|
|
||||||
const r = @bitReverse(value);
|
|
||||||
return r >> @as(math.Log2Int(T), @intCast(@typeInfo(T).Int.bits - N));
|
|
||||||
}
|
|
||||||
|
|
||||||
test "bitReverse" {
|
|
||||||
const std = @import("std");
|
|
||||||
|
|
||||||
const ReverseBitsTest = struct {
|
|
||||||
in: u16,
|
|
||||||
bit_count: u5,
|
|
||||||
out: u16,
|
|
||||||
};
|
|
||||||
|
|
||||||
const reverse_bits_tests = [_]ReverseBitsTest{
|
|
||||||
.{ .in = 1, .bit_count = 1, .out = 1 },
|
|
||||||
.{ .in = 1, .bit_count = 2, .out = 2 },
|
|
||||||
.{ .in = 1, .bit_count = 3, .out = 4 },
|
|
||||||
.{ .in = 1, .bit_count = 4, .out = 8 },
|
|
||||||
.{ .in = 1, .bit_count = 5, .out = 16 },
|
|
||||||
.{ .in = 17, .bit_count = 5, .out = 17 },
|
|
||||||
.{ .in = 257, .bit_count = 9, .out = 257 },
|
|
||||||
.{ .in = 29, .bit_count = 5, .out = 23 },
|
|
||||||
};
|
|
||||||
|
|
||||||
for (reverse_bits_tests) |h| {
|
|
||||||
const v = bitReverse(u16, h.in, h.bit_count);
|
|
||||||
try std.testing.expectEqual(h.out, v);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
File diff suppressed because it is too large
Load Diff
@ -1,531 +0,0 @@
|
|||||||
const std = @import("std");
|
|
||||||
const expect = std.testing.expect;
|
|
||||||
const fifo = std.fifo;
|
|
||||||
const io = std.io;
|
|
||||||
const math = std.math;
|
|
||||||
const mem = std.mem;
|
|
||||||
const testing = std.testing;
|
|
||||||
|
|
||||||
const ArrayList = std.ArrayList;
|
|
||||||
|
|
||||||
const deflate = @import("compressor.zig");
|
|
||||||
const inflate = @import("decompressor.zig");
|
|
||||||
|
|
||||||
const compressor = deflate.compressor;
|
|
||||||
const decompressor = inflate.decompressor;
|
|
||||||
const huffman_only = deflate.huffman_only;
|
|
||||||
|
|
||||||
fn testSync(level: deflate.Compression, input: []const u8) !void {
|
|
||||||
if (input.len == 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
var divided_buf = fifo
|
|
||||||
.LinearFifo(u8, fifo.LinearFifoBufferType.Dynamic)
|
|
||||||
.init(testing.allocator);
|
|
||||||
defer divided_buf.deinit();
|
|
||||||
var whole_buf = std.ArrayList(u8).init(testing.allocator);
|
|
||||||
defer whole_buf.deinit();
|
|
||||||
|
|
||||||
const multi_writer = io.multiWriter(.{
|
|
||||||
divided_buf.writer(),
|
|
||||||
whole_buf.writer(),
|
|
||||||
}).writer();
|
|
||||||
|
|
||||||
var comp = try compressor(
|
|
||||||
testing.allocator,
|
|
||||||
multi_writer,
|
|
||||||
.{ .level = level },
|
|
||||||
);
|
|
||||||
defer comp.deinit();
|
|
||||||
|
|
||||||
{
|
|
||||||
var decomp = try decompressor(
|
|
||||||
testing.allocator,
|
|
||||||
divided_buf.reader(),
|
|
||||||
null,
|
|
||||||
);
|
|
||||||
defer decomp.deinit();
|
|
||||||
|
|
||||||
// Write first half of the input and flush()
|
|
||||||
const half: usize = (input.len + 1) / 2;
|
|
||||||
var half_len: usize = half - 0;
|
|
||||||
{
|
|
||||||
_ = try comp.writer().writeAll(input[0..half]);
|
|
||||||
|
|
||||||
// Flush
|
|
||||||
try comp.flush();
|
|
||||||
|
|
||||||
// Read back
|
|
||||||
const decompressed = try testing.allocator.alloc(u8, half_len);
|
|
||||||
defer testing.allocator.free(decompressed);
|
|
||||||
|
|
||||||
const read = try decomp.reader().readAll(decompressed); // read at least half
|
|
||||||
try testing.expectEqual(half_len, read);
|
|
||||||
try testing.expectEqualSlices(u8, input[0..half], decompressed);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write last half of the input and close()
|
|
||||||
half_len = input.len - half;
|
|
||||||
{
|
|
||||||
_ = try comp.writer().writeAll(input[half..]);
|
|
||||||
|
|
||||||
// Close
|
|
||||||
try comp.close();
|
|
||||||
|
|
||||||
// Read back
|
|
||||||
const decompressed = try testing.allocator.alloc(u8, half_len);
|
|
||||||
defer testing.allocator.free(decompressed);
|
|
||||||
|
|
||||||
var read = try decomp.reader().readAll(decompressed);
|
|
||||||
try testing.expectEqual(half_len, read);
|
|
||||||
try testing.expectEqualSlices(u8, input[half..], decompressed);
|
|
||||||
|
|
||||||
// Extra read
|
|
||||||
var final: [10]u8 = undefined;
|
|
||||||
read = try decomp.reader().readAll(&final);
|
|
||||||
try testing.expectEqual(@as(usize, 0), read); // expect ended stream to return 0 bytes
|
|
||||||
|
|
||||||
try decomp.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
_ = try comp.writer().writeAll(input);
|
|
||||||
try comp.close();
|
|
||||||
|
|
||||||
// stream should work for ordinary reader too (reading whole_buf in one go)
|
|
||||||
const whole_buf_reader = io.fixedBufferStream(whole_buf.items).reader();
|
|
||||||
var decomp = try decompressor(testing.allocator, whole_buf_reader, null);
|
|
||||||
defer decomp.deinit();
|
|
||||||
|
|
||||||
const decompressed = try testing.allocator.alloc(u8, input.len);
|
|
||||||
defer testing.allocator.free(decompressed);
|
|
||||||
|
|
||||||
_ = try decomp.reader().readAll(decompressed);
|
|
||||||
try decomp.close();
|
|
||||||
|
|
||||||
try testing.expectEqualSlices(u8, input, decompressed);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn testToFromWithLevelAndLimit(level: deflate.Compression, input: []const u8, limit: u32) !void {
|
|
||||||
var compressed = std.ArrayList(u8).init(testing.allocator);
|
|
||||||
defer compressed.deinit();
|
|
||||||
|
|
||||||
var comp = try compressor(testing.allocator, compressed.writer(), .{ .level = level });
|
|
||||||
defer comp.deinit();
|
|
||||||
|
|
||||||
try comp.writer().writeAll(input);
|
|
||||||
try comp.close();
|
|
||||||
|
|
||||||
if (limit > 0) {
|
|
||||||
try expect(compressed.items.len <= limit);
|
|
||||||
}
|
|
||||||
|
|
||||||
var fib = io.fixedBufferStream(compressed.items);
|
|
||||||
var decomp = try decompressor(testing.allocator, fib.reader(), null);
|
|
||||||
defer decomp.deinit();
|
|
||||||
|
|
||||||
const decompressed = try testing.allocator.alloc(u8, input.len);
|
|
||||||
defer testing.allocator.free(decompressed);
|
|
||||||
|
|
||||||
const read: usize = try decomp.reader().readAll(decompressed);
|
|
||||||
try testing.expectEqual(input.len, read);
|
|
||||||
try testing.expectEqualSlices(u8, input, decompressed);
|
|
||||||
|
|
||||||
if (false) {
|
|
||||||
// TODO: this test has regressed
|
|
||||||
try testSync(level, input);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn testToFromWithLimit(input: []const u8, limit: [11]u32) !void {
|
|
||||||
try testToFromWithLevelAndLimit(.no_compression, input, limit[0]);
|
|
||||||
try testToFromWithLevelAndLimit(.best_speed, input, limit[1]);
|
|
||||||
try testToFromWithLevelAndLimit(.level_2, input, limit[2]);
|
|
||||||
try testToFromWithLevelAndLimit(.level_3, input, limit[3]);
|
|
||||||
try testToFromWithLevelAndLimit(.level_4, input, limit[4]);
|
|
||||||
try testToFromWithLevelAndLimit(.level_5, input, limit[5]);
|
|
||||||
try testToFromWithLevelAndLimit(.level_6, input, limit[6]);
|
|
||||||
try testToFromWithLevelAndLimit(.level_7, input, limit[7]);
|
|
||||||
try testToFromWithLevelAndLimit(.level_8, input, limit[8]);
|
|
||||||
try testToFromWithLevelAndLimit(.best_compression, input, limit[9]);
|
|
||||||
try testToFromWithLevelAndLimit(.huffman_only, input, limit[10]);
|
|
||||||
}
|
|
||||||
|
|
||||||
test "deflate/inflate" {
|
|
||||||
const limits = [_]u32{0} ** 11;
|
|
||||||
|
|
||||||
var test0 = [_]u8{};
|
|
||||||
var test1 = [_]u8{0x11};
|
|
||||||
var test2 = [_]u8{ 0x11, 0x12 };
|
|
||||||
var test3 = [_]u8{ 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11 };
|
|
||||||
var test4 = [_]u8{ 0x11, 0x10, 0x13, 0x41, 0x21, 0x21, 0x41, 0x13, 0x87, 0x78, 0x13 };
|
|
||||||
|
|
||||||
try testToFromWithLimit(&test0, limits);
|
|
||||||
try testToFromWithLimit(&test1, limits);
|
|
||||||
try testToFromWithLimit(&test2, limits);
|
|
||||||
try testToFromWithLimit(&test3, limits);
|
|
||||||
try testToFromWithLimit(&test4, limits);
|
|
||||||
|
|
||||||
var large_data_chunk = try testing.allocator.alloc(u8, 100_000);
|
|
||||||
defer testing.allocator.free(large_data_chunk);
|
|
||||||
// fill with random data
|
|
||||||
for (large_data_chunk, 0..) |_, i| {
|
|
||||||
large_data_chunk[i] = @as(u8, @truncate(i)) *% @as(u8, @truncate(i));
|
|
||||||
}
|
|
||||||
try testToFromWithLimit(large_data_chunk, limits);
|
|
||||||
}
|
|
||||||
|
|
||||||
test "very long sparse chunk" {
|
|
||||||
// A SparseReader returns a stream consisting of 0s ending with 65,536 (1<<16) 1s.
|
|
||||||
// This tests missing hash references in a very large input.
|
|
||||||
const SparseReader = struct {
|
|
||||||
l: usize, // length
|
|
||||||
cur: usize, // current position
|
|
||||||
|
|
||||||
const Self = @This();
|
|
||||||
const Error = error{};
|
|
||||||
|
|
||||||
pub const Reader = io.Reader(*Self, Error, read);
|
|
||||||
|
|
||||||
pub fn reader(self: *Self) Reader {
|
|
||||||
return .{ .context = self };
|
|
||||||
}
|
|
||||||
|
|
||||||
fn read(s: *Self, b: []u8) Error!usize {
|
|
||||||
var n: usize = 0; // amount read
|
|
||||||
|
|
||||||
if (s.cur >= s.l) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
n = b.len;
|
|
||||||
var cur = s.cur + n;
|
|
||||||
if (cur > s.l) {
|
|
||||||
n -= cur - s.l;
|
|
||||||
cur = s.l;
|
|
||||||
}
|
|
||||||
for (b[0..n], 0..) |_, i| {
|
|
||||||
if (s.cur + i >= s.l -| (1 << 16)) {
|
|
||||||
b[i] = 1;
|
|
||||||
} else {
|
|
||||||
b[i] = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
s.cur = cur;
|
|
||||||
return n;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
var comp = try compressor(
|
|
||||||
testing.allocator,
|
|
||||||
io.null_writer,
|
|
||||||
.{ .level = .best_speed },
|
|
||||||
);
|
|
||||||
defer comp.deinit();
|
|
||||||
var writer = comp.writer();
|
|
||||||
|
|
||||||
var sparse = SparseReader{ .l = 0x23e8, .cur = 0 };
|
|
||||||
var reader = sparse.reader();
|
|
||||||
|
|
||||||
var read: usize = 1;
|
|
||||||
var written: usize = 0;
|
|
||||||
while (read > 0) {
|
|
||||||
var buf: [1 << 15]u8 = undefined; // 32,768 bytes buffer
|
|
||||||
read = try reader.read(&buf);
|
|
||||||
written += try writer.write(buf[0..read]);
|
|
||||||
}
|
|
||||||
try testing.expectEqual(@as(usize, 0x23e8), written);
|
|
||||||
}
|
|
||||||
|
|
||||||
test "compressor reset" {
|
|
||||||
for (std.enums.values(deflate.Compression)) |c| {
|
|
||||||
try testWriterReset(c, null);
|
|
||||||
try testWriterReset(c, "dict");
|
|
||||||
try testWriterReset(c, "hello");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn testWriterReset(level: deflate.Compression, dict: ?[]const u8) !void {
|
|
||||||
const filler = struct {
|
|
||||||
fn writeData(c: anytype) !void {
|
|
||||||
const msg = "all your base are belong to us";
|
|
||||||
try c.writer().writeAll(msg);
|
|
||||||
try c.flush();
|
|
||||||
|
|
||||||
const hello = "hello world";
|
|
||||||
var i: usize = 0;
|
|
||||||
while (i < 1024) : (i += 1) {
|
|
||||||
try c.writer().writeAll(hello);
|
|
||||||
}
|
|
||||||
|
|
||||||
i = 0;
|
|
||||||
while (i < 65000) : (i += 1) {
|
|
||||||
try c.writer().writeAll("x");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
var buf1 = ArrayList(u8).init(testing.allocator);
|
|
||||||
defer buf1.deinit();
|
|
||||||
var buf2 = ArrayList(u8).init(testing.allocator);
|
|
||||||
defer buf2.deinit();
|
|
||||||
|
|
||||||
var comp = try compressor(
|
|
||||||
testing.allocator,
|
|
||||||
buf1.writer(),
|
|
||||||
.{ .level = level, .dictionary = dict },
|
|
||||||
);
|
|
||||||
defer comp.deinit();
|
|
||||||
|
|
||||||
try filler.writeData(&comp);
|
|
||||||
try comp.close();
|
|
||||||
|
|
||||||
comp.reset(buf2.writer());
|
|
||||||
try filler.writeData(&comp);
|
|
||||||
try comp.close();
|
|
||||||
|
|
||||||
try testing.expectEqualSlices(u8, buf1.items, buf2.items);
|
|
||||||
}
|
|
||||||
|
|
||||||
test "decompressor dictionary" {
|
|
||||||
const dict = "hello world"; // dictionary
|
|
||||||
const text = "hello again world";
|
|
||||||
|
|
||||||
var compressed = fifo
|
|
||||||
.LinearFifo(u8, fifo.LinearFifoBufferType.Dynamic)
|
|
||||||
.init(testing.allocator);
|
|
||||||
defer compressed.deinit();
|
|
||||||
|
|
||||||
var comp = try compressor(
|
|
||||||
testing.allocator,
|
|
||||||
compressed.writer(),
|
|
||||||
.{
|
|
||||||
.level = .level_5,
|
|
||||||
.dictionary = null, // no dictionary
|
|
||||||
},
|
|
||||||
);
|
|
||||||
defer comp.deinit();
|
|
||||||
|
|
||||||
// imitate a compressor with a dictionary
|
|
||||||
try comp.writer().writeAll(dict);
|
|
||||||
try comp.flush();
|
|
||||||
compressed.discard(compressed.readableLength()); // empty the output
|
|
||||||
try comp.writer().writeAll(text);
|
|
||||||
try comp.close();
|
|
||||||
|
|
||||||
const decompressed = try testing.allocator.alloc(u8, text.len);
|
|
||||||
defer testing.allocator.free(decompressed);
|
|
||||||
|
|
||||||
var decomp = try decompressor(
|
|
||||||
testing.allocator,
|
|
||||||
compressed.reader(),
|
|
||||||
dict,
|
|
||||||
);
|
|
||||||
defer decomp.deinit();
|
|
||||||
|
|
||||||
_ = try decomp.reader().readAll(decompressed);
|
|
||||||
try testing.expectEqualSlices(u8, "hello again world", decompressed);
|
|
||||||
}
|
|
||||||
|
|
||||||
test "compressor dictionary" {
|
|
||||||
const dict = "hello world";
|
|
||||||
const text = "hello again world";
|
|
||||||
|
|
||||||
var compressed_nd = fifo
|
|
||||||
.LinearFifo(u8, fifo.LinearFifoBufferType.Dynamic)
|
|
||||||
.init(testing.allocator); // compressed with no dictionary
|
|
||||||
defer compressed_nd.deinit();
|
|
||||||
|
|
||||||
var compressed_d = ArrayList(u8).init(testing.allocator); // compressed with a dictionary
|
|
||||||
defer compressed_d.deinit();
|
|
||||||
|
|
||||||
// imitate a compressor with a dictionary
|
|
||||||
var comp_nd = try compressor(
|
|
||||||
testing.allocator,
|
|
||||||
compressed_nd.writer(),
|
|
||||||
.{
|
|
||||||
.level = .level_5,
|
|
||||||
.dictionary = null, // no dictionary
|
|
||||||
},
|
|
||||||
);
|
|
||||||
defer comp_nd.deinit();
|
|
||||||
try comp_nd.writer().writeAll(dict);
|
|
||||||
try comp_nd.flush();
|
|
||||||
compressed_nd.discard(compressed_nd.readableLength()); // empty the output
|
|
||||||
try comp_nd.writer().writeAll(text);
|
|
||||||
try comp_nd.close();
|
|
||||||
|
|
||||||
// use a compressor with a dictionary
|
|
||||||
var comp_d = try compressor(
|
|
||||||
testing.allocator,
|
|
||||||
compressed_d.writer(),
|
|
||||||
.{
|
|
||||||
.level = .level_5,
|
|
||||||
.dictionary = dict, // with a dictionary
|
|
||||||
},
|
|
||||||
);
|
|
||||||
defer comp_d.deinit();
|
|
||||||
try comp_d.writer().writeAll(text);
|
|
||||||
try comp_d.close();
|
|
||||||
|
|
||||||
try testing.expectEqualSlices(u8, compressed_d.items, compressed_nd.readableSlice(0));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update the hash for best_speed only if d.index < d.maxInsertIndex
|
|
||||||
// See https://golang.org/issue/2508
|
|
||||||
test "Go non-regression test for 2508" {
|
|
||||||
var comp = try compressor(
|
|
||||||
testing.allocator,
|
|
||||||
io.null_writer,
|
|
||||||
.{ .level = .best_speed },
|
|
||||||
);
|
|
||||||
defer comp.deinit();
|
|
||||||
|
|
||||||
var buf = [_]u8{0} ** 1024;
|
|
||||||
|
|
||||||
var i: usize = 0;
|
|
||||||
while (i < 131_072) : (i += 1) {
|
|
||||||
try comp.writer().writeAll(&buf);
|
|
||||||
try comp.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
test "deflate/inflate string" {
|
|
||||||
const StringTest = struct {
|
|
||||||
filename: []const u8,
|
|
||||||
limit: [11]u32,
|
|
||||||
};
|
|
||||||
|
|
||||||
const deflate_inflate_string_tests = [_]StringTest{
|
|
||||||
.{
|
|
||||||
.filename = "compress-e.txt",
|
|
||||||
.limit = [11]u32{
|
|
||||||
100_018, // no_compression
|
|
||||||
50_650, // best_speed
|
|
||||||
50_960, // 2
|
|
||||||
51_150, // 3
|
|
||||||
50_930, // 4
|
|
||||||
50_790, // 5
|
|
||||||
50_790, // 6
|
|
||||||
50_790, // 7
|
|
||||||
50_790, // 8
|
|
||||||
50_790, // best_compression
|
|
||||||
43_683, // huffman_only
|
|
||||||
},
|
|
||||||
},
|
|
||||||
.{
|
|
||||||
.filename = "rfc1951.txt",
|
|
||||||
.limit = [11]u32{
|
|
||||||
36_954, // no_compression
|
|
||||||
12_952, // best_speed
|
|
||||||
12_228, // 2
|
|
||||||
12_016, // 3
|
|
||||||
11_466, // 4
|
|
||||||
11_191, // 5
|
|
||||||
11_129, // 6
|
|
||||||
11_120, // 7
|
|
||||||
11_112, // 8
|
|
||||||
11_109, // best_compression
|
|
||||||
20_273, // huffman_only
|
|
||||||
},
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
inline for (deflate_inflate_string_tests) |t| {
|
|
||||||
const golden = @embedFile("testdata/" ++ t.filename);
|
|
||||||
try testToFromWithLimit(golden, t.limit);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
test "inflate reset" {
|
|
||||||
const strings = [_][]const u8{
|
|
||||||
"lorem ipsum izzle fo rizzle",
|
|
||||||
"the quick brown fox jumped over",
|
|
||||||
};
|
|
||||||
|
|
||||||
var compressed_strings = [_]ArrayList(u8){
|
|
||||||
ArrayList(u8).init(testing.allocator),
|
|
||||||
ArrayList(u8).init(testing.allocator),
|
|
||||||
};
|
|
||||||
defer compressed_strings[0].deinit();
|
|
||||||
defer compressed_strings[1].deinit();
|
|
||||||
|
|
||||||
for (strings, 0..) |s, i| {
|
|
||||||
var comp = try compressor(
|
|
||||||
testing.allocator,
|
|
||||||
compressed_strings[i].writer(),
|
|
||||||
.{ .level = .level_6 },
|
|
||||||
);
|
|
||||||
defer comp.deinit();
|
|
||||||
|
|
||||||
try comp.writer().writeAll(s);
|
|
||||||
try comp.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
var fib = io.fixedBufferStream(compressed_strings[0].items);
|
|
||||||
var decomp = try decompressor(testing.allocator, fib.reader(), null);
|
|
||||||
defer decomp.deinit();
|
|
||||||
|
|
||||||
const decompressed_0: []u8 = try decomp.reader()
|
|
||||||
.readAllAlloc(testing.allocator, math.maxInt(usize));
|
|
||||||
defer testing.allocator.free(decompressed_0);
|
|
||||||
|
|
||||||
fib = io.fixedBufferStream(compressed_strings[1].items);
|
|
||||||
try decomp.reset(fib.reader(), null);
|
|
||||||
|
|
||||||
const decompressed_1: []u8 = try decomp.reader()
|
|
||||||
.readAllAlloc(testing.allocator, math.maxInt(usize));
|
|
||||||
defer testing.allocator.free(decompressed_1);
|
|
||||||
|
|
||||||
try decomp.close();
|
|
||||||
|
|
||||||
try testing.expectEqualSlices(u8, strings[0], decompressed_0);
|
|
||||||
try testing.expectEqualSlices(u8, strings[1], decompressed_1);
|
|
||||||
}
|
|
||||||
|
|
||||||
test "inflate reset dictionary" {
|
|
||||||
const dict = "the lorem fox";
|
|
||||||
const strings = [_][]const u8{
|
|
||||||
"lorem ipsum izzle fo rizzle",
|
|
||||||
"the quick brown fox jumped over",
|
|
||||||
};
|
|
||||||
|
|
||||||
var compressed_strings = [_]ArrayList(u8){
|
|
||||||
ArrayList(u8).init(testing.allocator),
|
|
||||||
ArrayList(u8).init(testing.allocator),
|
|
||||||
};
|
|
||||||
defer compressed_strings[0].deinit();
|
|
||||||
defer compressed_strings[1].deinit();
|
|
||||||
|
|
||||||
for (strings, 0..) |s, i| {
|
|
||||||
var comp = try compressor(
|
|
||||||
testing.allocator,
|
|
||||||
compressed_strings[i].writer(),
|
|
||||||
.{ .level = .level_6 },
|
|
||||||
);
|
|
||||||
defer comp.deinit();
|
|
||||||
|
|
||||||
try comp.writer().writeAll(s);
|
|
||||||
try comp.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
var fib = io.fixedBufferStream(compressed_strings[0].items);
|
|
||||||
var decomp = try decompressor(testing.allocator, fib.reader(), dict);
|
|
||||||
defer decomp.deinit();
|
|
||||||
|
|
||||||
const decompressed_0: []u8 = try decomp.reader()
|
|
||||||
.readAllAlloc(testing.allocator, math.maxInt(usize));
|
|
||||||
defer testing.allocator.free(decompressed_0);
|
|
||||||
|
|
||||||
fib = io.fixedBufferStream(compressed_strings[1].items);
|
|
||||||
try decomp.reset(fib.reader(), dict);
|
|
||||||
|
|
||||||
const decompressed_1: []u8 = try decomp.reader()
|
|
||||||
.readAllAlloc(testing.allocator, math.maxInt(usize));
|
|
||||||
defer testing.allocator.free(decompressed_1);
|
|
||||||
|
|
||||||
try decomp.close();
|
|
||||||
|
|
||||||
try testing.expectEqualSlices(u8, strings[0], decompressed_0);
|
|
||||||
try testing.expectEqualSlices(u8, strings[1], decompressed_1);
|
|
||||||
}
|
|
||||||
File diff suppressed because it is too large
Load Diff
@ -1,28 +0,0 @@
|
|||||||
// Deflate
|
|
||||||
|
|
||||||
// Biggest block size for uncompressed block.
|
|
||||||
pub const max_store_block_size = 65535;
|
|
||||||
// The special code used to mark the end of a block.
|
|
||||||
pub const end_block_marker = 256;
|
|
||||||
|
|
||||||
// LZ77
|
|
||||||
|
|
||||||
// The smallest match length per the RFC section 3.2.5
|
|
||||||
pub const base_match_length = 3;
|
|
||||||
// The smallest match offset.
|
|
||||||
pub const base_match_offset = 1;
|
|
||||||
// The largest match length.
|
|
||||||
pub const max_match_length = 258;
|
|
||||||
// The largest match offset.
|
|
||||||
pub const max_match_offset = 1 << 15;
|
|
||||||
|
|
||||||
// Huffman Codes
|
|
||||||
|
|
||||||
// The largest offset code.
|
|
||||||
pub const offset_code_count = 30;
|
|
||||||
// Max number of frequencies used for a Huffman Code
|
|
||||||
// Possible lengths are codegenCodeCount (19), offset_code_count (30) and max_num_lit (286).
|
|
||||||
// The largest of these is max_num_lit.
|
|
||||||
pub const max_num_frequencies = max_num_lit;
|
|
||||||
// Maximum number of literals.
|
|
||||||
pub const max_num_lit = 286;
|
|
||||||
@ -1,728 +0,0 @@
|
|||||||
// This encoding algorithm, which prioritizes speed over output size, is
|
|
||||||
// based on Snappy's LZ77-style encoder: github.com/golang/snappy
|
|
||||||
|
|
||||||
const std = @import("std");
|
|
||||||
const math = std.math;
|
|
||||||
const mem = std.mem;
|
|
||||||
|
|
||||||
const Allocator = std.mem.Allocator;
|
|
||||||
|
|
||||||
const deflate_const = @import("deflate_const.zig");
|
|
||||||
const deflate = @import("compressor.zig");
|
|
||||||
const token = @import("token.zig");
|
|
||||||
|
|
||||||
const base_match_length = deflate_const.base_match_length;
|
|
||||||
const base_match_offset = deflate_const.base_match_offset;
|
|
||||||
const max_match_length = deflate_const.max_match_length;
|
|
||||||
const max_match_offset = deflate_const.max_match_offset;
|
|
||||||
const max_store_block_size = deflate_const.max_store_block_size;
|
|
||||||
|
|
||||||
const table_bits = 14; // Bits used in the table.
|
|
||||||
const table_mask = table_size - 1; // Mask for table indices. Redundant, but can eliminate bounds checks.
|
|
||||||
const table_shift = 32 - table_bits; // Right-shift to get the table_bits most significant bits of a uint32.
|
|
||||||
const table_size = 1 << table_bits; // Size of the table.
|
|
||||||
|
|
||||||
// Reset the buffer offset when reaching this.
|
|
||||||
// Offsets are stored between blocks as i32 values.
|
|
||||||
// Since the offset we are checking against is at the beginning
|
|
||||||
// of the buffer, we need to subtract the current and input
|
|
||||||
// buffer to not risk overflowing the i32.
|
|
||||||
const buffer_reset = math.maxInt(i32) - max_store_block_size * 2;
|
|
||||||
|
|
||||||
fn load32(b: []u8, i: i32) u32 {
|
|
||||||
const s = b[@as(usize, @intCast(i)) .. @as(usize, @intCast(i)) + 4];
|
|
||||||
return @as(u32, @intCast(s[0])) |
|
|
||||||
@as(u32, @intCast(s[1])) << 8 |
|
|
||||||
@as(u32, @intCast(s[2])) << 16 |
|
|
||||||
@as(u32, @intCast(s[3])) << 24;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn load64(b: []u8, i: i32) u64 {
|
|
||||||
const s = b[@as(usize, @intCast(i))..@as(usize, @intCast(i + 8))];
|
|
||||||
return @as(u64, @intCast(s[0])) |
|
|
||||||
@as(u64, @intCast(s[1])) << 8 |
|
|
||||||
@as(u64, @intCast(s[2])) << 16 |
|
|
||||||
@as(u64, @intCast(s[3])) << 24 |
|
|
||||||
@as(u64, @intCast(s[4])) << 32 |
|
|
||||||
@as(u64, @intCast(s[5])) << 40 |
|
|
||||||
@as(u64, @intCast(s[6])) << 48 |
|
|
||||||
@as(u64, @intCast(s[7])) << 56;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn hash(u: u32) u32 {
|
|
||||||
return (u *% 0x1e35a7bd) >> table_shift;
|
|
||||||
}
|
|
||||||
|
|
||||||
// These constants are defined by the Snappy implementation so that its
|
|
||||||
// assembly implementation can fast-path some 16-bytes-at-a-time copies.
|
|
||||||
// They aren't necessary in the pure Go implementation, and may not be
|
|
||||||
// necessary in Zig, but using the same thresholds doesn't really hurt.
|
|
||||||
const input_margin = 16 - 1;
|
|
||||||
const min_non_literal_block_size = 1 + 1 + input_margin;
|
|
||||||
|
|
||||||
const TableEntry = struct {
|
|
||||||
val: u32, // Value at destination
|
|
||||||
offset: i32,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub fn deflateFast() DeflateFast {
|
|
||||||
return DeflateFast{
|
|
||||||
.table = [_]TableEntry{.{ .val = 0, .offset = 0 }} ** table_size,
|
|
||||||
.prev = undefined,
|
|
||||||
.prev_len = 0,
|
|
||||||
.cur = max_store_block_size,
|
|
||||||
.allocator = undefined,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
// DeflateFast maintains the table for matches,
|
|
||||||
// and the previous byte block for cross block matching.
|
|
||||||
pub const DeflateFast = struct {
|
|
||||||
table: [table_size]TableEntry,
|
|
||||||
prev: []u8, // Previous block, zero length if unknown.
|
|
||||||
prev_len: u32, // Previous block length
|
|
||||||
cur: i32, // Current match offset.
|
|
||||||
allocator: Allocator,
|
|
||||||
|
|
||||||
const Self = @This();
|
|
||||||
|
|
||||||
pub fn init(self: *Self, allocator: Allocator) !void {
|
|
||||||
self.allocator = allocator;
|
|
||||||
self.prev = try allocator.alloc(u8, max_store_block_size);
|
|
||||||
self.prev_len = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn deinit(self: *Self) void {
|
|
||||||
self.allocator.free(self.prev);
|
|
||||||
self.prev_len = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Encodes a block given in `src` and appends tokens to `dst` and returns the result.
|
|
||||||
pub fn encode(self: *Self, dst: []token.Token, tokens_count: *u16, src: []u8) void {
|
|
||||||
|
|
||||||
// Ensure that self.cur doesn't wrap.
|
|
||||||
if (self.cur >= buffer_reset) {
|
|
||||||
self.shiftOffsets();
|
|
||||||
}
|
|
||||||
|
|
||||||
// This check isn't in the Snappy implementation, but there, the caller
|
|
||||||
// instead of the callee handles this case.
|
|
||||||
if (src.len < min_non_literal_block_size) {
|
|
||||||
self.cur += max_store_block_size;
|
|
||||||
self.prev_len = 0;
|
|
||||||
emitLiteral(dst, tokens_count, src);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// s_limit is when to stop looking for offset/length copies. The input_margin
|
|
||||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
|
||||||
// looking for copies.
|
|
||||||
const s_limit = @as(i32, @intCast(src.len - input_margin));
|
|
||||||
|
|
||||||
// next_emit is where in src the next emitLiteral should start from.
|
|
||||||
var next_emit: i32 = 0;
|
|
||||||
var s: i32 = 0;
|
|
||||||
var cv: u32 = load32(src, s);
|
|
||||||
var next_hash: u32 = hash(cv);
|
|
||||||
|
|
||||||
outer: while (true) {
|
|
||||||
// Copied from the C++ snappy implementation:
|
|
||||||
//
|
|
||||||
// Heuristic match skipping: If 32 bytes are scanned with no matches
|
|
||||||
// found, start looking only at every other byte. If 32 more bytes are
|
|
||||||
// scanned (or skipped), look at every third byte, etc.. When a match
|
|
||||||
// is found, immediately go back to looking at every byte. This is a
|
|
||||||
// small loss (~5% performance, ~0.1% density) for compressible data
|
|
||||||
// due to more bookkeeping, but for non-compressible data (such as
|
|
||||||
// JPEG) it's a huge win since the compressor quickly "realizes" the
|
|
||||||
// data is incompressible and doesn't bother looking for matches
|
|
||||||
// everywhere.
|
|
||||||
//
|
|
||||||
// The "skip" variable keeps track of how many bytes there are since
|
|
||||||
// the last match; dividing it by 32 (ie. right-shifting by five) gives
|
|
||||||
// the number of bytes to move ahead for each iteration.
|
|
||||||
var skip: i32 = 32;
|
|
||||||
|
|
||||||
var next_s: i32 = s;
|
|
||||||
var candidate: TableEntry = undefined;
|
|
||||||
while (true) {
|
|
||||||
s = next_s;
|
|
||||||
const bytes_between_hash_lookups = skip >> 5;
|
|
||||||
next_s = s + bytes_between_hash_lookups;
|
|
||||||
skip += bytes_between_hash_lookups;
|
|
||||||
if (next_s > s_limit) {
|
|
||||||
break :outer;
|
|
||||||
}
|
|
||||||
candidate = self.table[next_hash & table_mask];
|
|
||||||
const now = load32(src, next_s);
|
|
||||||
self.table[next_hash & table_mask] = .{ .offset = s + self.cur, .val = cv };
|
|
||||||
next_hash = hash(now);
|
|
||||||
|
|
||||||
const offset = s - (candidate.offset - self.cur);
|
|
||||||
if (offset > max_match_offset or cv != candidate.val) {
|
|
||||||
// Out of range or not matched.
|
|
||||||
cv = now;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
|
||||||
// match. But, prior to the match, src[next_emit..s] are unmatched. Emit
|
|
||||||
// them as literal bytes.
|
|
||||||
emitLiteral(dst, tokens_count, src[@as(usize, @intCast(next_emit))..@as(usize, @intCast(s))]);
|
|
||||||
|
|
||||||
// Call emitCopy, and then see if another emitCopy could be our next
|
|
||||||
// move. Repeat until we find no match for the input immediately after
|
|
||||||
// what was consumed by the last emitCopy call.
|
|
||||||
//
|
|
||||||
// If we exit this loop normally then we need to call emitLiteral next,
|
|
||||||
// though we don't yet know how big the literal will be. We handle that
|
|
||||||
// by proceeding to the next iteration of the main loop. We also can
|
|
||||||
// exit this loop via goto if we get close to exhausting the input.
|
|
||||||
while (true) {
|
|
||||||
// Invariant: we have a 4-byte match at s, and no need to emit any
|
|
||||||
// literal bytes prior to s.
|
|
||||||
|
|
||||||
// Extend the 4-byte match as long as possible.
|
|
||||||
//
|
|
||||||
s += 4;
|
|
||||||
const t = candidate.offset - self.cur + 4;
|
|
||||||
const l = self.matchLen(s, t, src);
|
|
||||||
|
|
||||||
// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
|
|
||||||
dst[tokens_count.*] = token.matchToken(
|
|
||||||
@as(u32, @intCast(l + 4 - base_match_length)),
|
|
||||||
@as(u32, @intCast(s - t - base_match_offset)),
|
|
||||||
);
|
|
||||||
tokens_count.* += 1;
|
|
||||||
s += l;
|
|
||||||
next_emit = s;
|
|
||||||
if (s >= s_limit) {
|
|
||||||
break :outer;
|
|
||||||
}
|
|
||||||
|
|
||||||
// We could immediately start working at s now, but to improve
|
|
||||||
// compression we first update the hash table at s-1 and at s. If
|
|
||||||
// another emitCopy is not our next move, also calculate next_hash
|
|
||||||
// at s+1. At least on amd64 architecture, these three hash calculations
|
|
||||||
// are faster as one load64 call (with some shifts) instead of
|
|
||||||
// three load32 calls.
|
|
||||||
var x = load64(src, s - 1);
|
|
||||||
const prev_hash = hash(@as(u32, @truncate(x)));
|
|
||||||
self.table[prev_hash & table_mask] = TableEntry{
|
|
||||||
.offset = self.cur + s - 1,
|
|
||||||
.val = @as(u32, @truncate(x)),
|
|
||||||
};
|
|
||||||
x >>= 8;
|
|
||||||
const curr_hash = hash(@as(u32, @truncate(x)));
|
|
||||||
candidate = self.table[curr_hash & table_mask];
|
|
||||||
self.table[curr_hash & table_mask] = TableEntry{
|
|
||||||
.offset = self.cur + s,
|
|
||||||
.val = @as(u32, @truncate(x)),
|
|
||||||
};
|
|
||||||
|
|
||||||
const offset = s - (candidate.offset - self.cur);
|
|
||||||
if (offset > max_match_offset or @as(u32, @truncate(x)) != candidate.val) {
|
|
||||||
cv = @as(u32, @truncate(x >> 8));
|
|
||||||
next_hash = hash(cv);
|
|
||||||
s += 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (@as(u32, @intCast(next_emit)) < src.len) {
|
|
||||||
emitLiteral(dst, tokens_count, src[@as(usize, @intCast(next_emit))..]);
|
|
||||||
}
|
|
||||||
self.cur += @as(i32, @intCast(src.len));
|
|
||||||
self.prev_len = @as(u32, @intCast(src.len));
|
|
||||||
@memcpy(self.prev[0..self.prev_len], src);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn emitLiteral(dst: []token.Token, tokens_count: *u16, lit: []u8) void {
|
|
||||||
for (lit) |v| {
|
|
||||||
dst[tokens_count.*] = token.literalToken(@as(u32, @intCast(v)));
|
|
||||||
tokens_count.* += 1;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// matchLen returns the match length between src[s..] and src[t..].
|
|
||||||
// t can be negative to indicate the match is starting in self.prev.
|
|
||||||
// We assume that src[s-4 .. s] and src[t-4 .. t] already match.
|
|
||||||
fn matchLen(self: *Self, s: i32, t: i32, src: []u8) i32 {
|
|
||||||
var s1 = @as(u32, @intCast(s)) + max_match_length - 4;
|
|
||||||
if (s1 > src.len) {
|
|
||||||
s1 = @as(u32, @intCast(src.len));
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we are inside the current block
|
|
||||||
if (t >= 0) {
|
|
||||||
var b = src[@as(usize, @intCast(t))..];
|
|
||||||
const a = src[@as(usize, @intCast(s))..@as(usize, @intCast(s1))];
|
|
||||||
b = b[0..a.len];
|
|
||||||
// Extend the match to be as long as possible.
|
|
||||||
for (a, 0..) |_, i| {
|
|
||||||
if (a[i] != b[i]) {
|
|
||||||
return @as(i32, @intCast(i));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return @as(i32, @intCast(a.len));
|
|
||||||
}
|
|
||||||
|
|
||||||
// We found a match in the previous block.
|
|
||||||
const tp = @as(i32, @intCast(self.prev_len)) + t;
|
|
||||||
if (tp < 0) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extend the match to be as long as possible.
|
|
||||||
var a = src[@as(usize, @intCast(s))..@as(usize, @intCast(s1))];
|
|
||||||
var b = self.prev[@as(usize, @intCast(tp))..@as(usize, @intCast(self.prev_len))];
|
|
||||||
if (b.len > a.len) {
|
|
||||||
b = b[0..a.len];
|
|
||||||
}
|
|
||||||
a = a[0..b.len];
|
|
||||||
for (b, 0..) |_, i| {
|
|
||||||
if (a[i] != b[i]) {
|
|
||||||
return @as(i32, @intCast(i));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we reached our limit, we matched everything we are
|
|
||||||
// allowed to in the previous block and we return.
|
|
||||||
const n = @as(i32, @intCast(b.len));
|
|
||||||
if (@as(u32, @intCast(s + n)) == s1) {
|
|
||||||
return n;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Continue looking for more matches in the current block.
|
|
||||||
a = src[@as(usize, @intCast(s + n))..@as(usize, @intCast(s1))];
|
|
||||||
b = src[0..a.len];
|
|
||||||
for (a, 0..) |_, i| {
|
|
||||||
if (a[i] != b[i]) {
|
|
||||||
return @as(i32, @intCast(i)) + n;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return @as(i32, @intCast(a.len)) + n;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reset resets the encoding history.
|
|
||||||
// This ensures that no matches are made to the previous block.
|
|
||||||
pub fn reset(self: *Self) void {
|
|
||||||
self.prev_len = 0;
|
|
||||||
// Bump the offset, so all matches will fail distance check.
|
|
||||||
// Nothing should be >= self.cur in the table.
|
|
||||||
self.cur += max_match_offset;
|
|
||||||
|
|
||||||
// Protect against self.cur wraparound.
|
|
||||||
if (self.cur >= buffer_reset) {
|
|
||||||
self.shiftOffsets();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// shiftOffsets will shift down all match offset.
|
|
||||||
// This is only called in rare situations to prevent integer overflow.
|
|
||||||
//
|
|
||||||
// See https://golang.org/issue/18636 and https://golang.org/issues/34121.
|
|
||||||
fn shiftOffsets(self: *Self) void {
|
|
||||||
if (self.prev_len == 0) {
|
|
||||||
// We have no history; just clear the table.
|
|
||||||
for (self.table, 0..) |_, i| {
|
|
||||||
self.table[i] = TableEntry{ .val = 0, .offset = 0 };
|
|
||||||
}
|
|
||||||
self.cur = max_match_offset + 1;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Shift down everything in the table that isn't already too far away.
|
|
||||||
for (self.table, 0..) |_, i| {
|
|
||||||
var v = self.table[i].offset - self.cur + max_match_offset + 1;
|
|
||||||
if (v < 0) {
|
|
||||||
// We want to reset self.cur to max_match_offset + 1, so we need to shift
|
|
||||||
// all table entries down by (self.cur - (max_match_offset + 1)).
|
|
||||||
// Because we ignore matches > max_match_offset, we can cap
|
|
||||||
// any negative offsets at 0.
|
|
||||||
v = 0;
|
|
||||||
}
|
|
||||||
self.table[i].offset = v;
|
|
||||||
}
|
|
||||||
self.cur = max_match_offset + 1;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
test "best speed match 1/3" {
|
|
||||||
if (@import("builtin").os.tag == .wasi) {
|
|
||||||
// https://github.com/ziglang/zig/issues/18885
|
|
||||||
return error.SkipZigTest;
|
|
||||||
}
|
|
||||||
const expectEqual = std.testing.expectEqual;
|
|
||||||
|
|
||||||
{
|
|
||||||
var previous = [_]u8{ 0, 0, 0, 1, 2 };
|
|
||||||
var e = DeflateFast{
|
|
||||||
.prev = &previous,
|
|
||||||
.prev_len = previous.len,
|
|
||||||
.table = undefined,
|
|
||||||
.allocator = undefined,
|
|
||||||
.cur = 0,
|
|
||||||
};
|
|
||||||
var current = [_]u8{ 3, 4, 5, 0, 1, 2, 3, 4, 5 };
|
|
||||||
const got: i32 = e.matchLen(3, -3, ¤t);
|
|
||||||
try expectEqual(@as(i32, 6), got);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
var previous = [_]u8{ 0, 0, 0, 1, 2 };
|
|
||||||
var e = DeflateFast{
|
|
||||||
.prev = &previous,
|
|
||||||
.prev_len = previous.len,
|
|
||||||
.table = undefined,
|
|
||||||
.allocator = undefined,
|
|
||||||
.cur = 0,
|
|
||||||
};
|
|
||||||
var current = [_]u8{ 2, 4, 5, 0, 1, 2, 3, 4, 5 };
|
|
||||||
const got: i32 = e.matchLen(3, -3, ¤t);
|
|
||||||
try expectEqual(@as(i32, 3), got);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
var previous = [_]u8{ 0, 0, 0, 1, 1 };
|
|
||||||
var e = DeflateFast{
|
|
||||||
.prev = &previous,
|
|
||||||
.prev_len = previous.len,
|
|
||||||
.table = undefined,
|
|
||||||
.allocator = undefined,
|
|
||||||
.cur = 0,
|
|
||||||
};
|
|
||||||
var current = [_]u8{ 3, 4, 5, 0, 1, 2, 3, 4, 5 };
|
|
||||||
const got: i32 = e.matchLen(3, -3, ¤t);
|
|
||||||
try expectEqual(@as(i32, 2), got);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
var previous = [_]u8{ 0, 0, 0, 1, 2 };
|
|
||||||
var e = DeflateFast{
|
|
||||||
.prev = &previous,
|
|
||||||
.prev_len = previous.len,
|
|
||||||
.table = undefined,
|
|
||||||
.allocator = undefined,
|
|
||||||
.cur = 0,
|
|
||||||
};
|
|
||||||
var current = [_]u8{ 2, 2, 2, 2, 1, 2, 3, 4, 5 };
|
|
||||||
const got: i32 = e.matchLen(0, -1, ¤t);
|
|
||||||
try expectEqual(@as(i32, 4), got);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
var previous = [_]u8{ 0, 0, 0, 1, 2, 3, 4, 5, 2, 2 };
|
|
||||||
var e = DeflateFast{
|
|
||||||
.prev = &previous,
|
|
||||||
.prev_len = previous.len,
|
|
||||||
.table = undefined,
|
|
||||||
.allocator = undefined,
|
|
||||||
.cur = 0,
|
|
||||||
};
|
|
||||||
var current = [_]u8{ 2, 2, 2, 2, 1, 2, 3, 4, 5 };
|
|
||||||
const got: i32 = e.matchLen(4, -7, ¤t);
|
|
||||||
try expectEqual(@as(i32, 5), got);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
var previous = [_]u8{ 9, 9, 9, 9, 9 };
|
|
||||||
var e = DeflateFast{
|
|
||||||
.prev = &previous,
|
|
||||||
.prev_len = previous.len,
|
|
||||||
.table = undefined,
|
|
||||||
.allocator = undefined,
|
|
||||||
.cur = 0,
|
|
||||||
};
|
|
||||||
var current = [_]u8{ 2, 2, 2, 2, 1, 2, 3, 4, 5 };
|
|
||||||
const got: i32 = e.matchLen(0, -1, ¤t);
|
|
||||||
try expectEqual(@as(i32, 0), got);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
var previous = [_]u8{ 9, 9, 9, 9, 9 };
|
|
||||||
var e = DeflateFast{
|
|
||||||
.prev = &previous,
|
|
||||||
.prev_len = previous.len,
|
|
||||||
.table = undefined,
|
|
||||||
.allocator = undefined,
|
|
||||||
.cur = 0,
|
|
||||||
};
|
|
||||||
var current = [_]u8{ 9, 2, 2, 2, 1, 2, 3, 4, 5 };
|
|
||||||
const got: i32 = e.matchLen(1, 0, ¤t);
|
|
||||||
try expectEqual(@as(i32, 0), got);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
test "best speed match 2/3" {
|
|
||||||
if (@import("builtin").os.tag == .wasi) {
|
|
||||||
// https://github.com/ziglang/zig/issues/18885
|
|
||||||
return error.SkipZigTest;
|
|
||||||
}
|
|
||||||
const expectEqual = std.testing.expectEqual;
|
|
||||||
|
|
||||||
{
|
|
||||||
var previous = [_]u8{};
|
|
||||||
var e = DeflateFast{
|
|
||||||
.prev = &previous,
|
|
||||||
.prev_len = previous.len,
|
|
||||||
.table = undefined,
|
|
||||||
.allocator = undefined,
|
|
||||||
.cur = 0,
|
|
||||||
};
|
|
||||||
var current = [_]u8{ 9, 2, 2, 2, 1, 2, 3, 4, 5 };
|
|
||||||
const got: i32 = e.matchLen(1, -5, ¤t);
|
|
||||||
try expectEqual(@as(i32, 0), got);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
var previous = [_]u8{};
|
|
||||||
var e = DeflateFast{
|
|
||||||
.prev = &previous,
|
|
||||||
.prev_len = previous.len,
|
|
||||||
.table = undefined,
|
|
||||||
.allocator = undefined,
|
|
||||||
.cur = 0,
|
|
||||||
};
|
|
||||||
var current = [_]u8{ 9, 2, 2, 2, 1, 2, 3, 4, 5 };
|
|
||||||
const got: i32 = e.matchLen(1, -1, ¤t);
|
|
||||||
try expectEqual(@as(i32, 0), got);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
var previous = [_]u8{};
|
|
||||||
var e = DeflateFast{
|
|
||||||
.prev = &previous,
|
|
||||||
.prev_len = previous.len,
|
|
||||||
.table = undefined,
|
|
||||||
.allocator = undefined,
|
|
||||||
.cur = 0,
|
|
||||||
};
|
|
||||||
var current = [_]u8{ 2, 2, 2, 2, 1, 2, 3, 4, 5 };
|
|
||||||
const got: i32 = e.matchLen(1, 0, ¤t);
|
|
||||||
try expectEqual(@as(i32, 3), got);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
var previous = [_]u8{ 3, 4, 5 };
|
|
||||||
var e = DeflateFast{
|
|
||||||
.prev = &previous,
|
|
||||||
.prev_len = previous.len,
|
|
||||||
.table = undefined,
|
|
||||||
.allocator = undefined,
|
|
||||||
.cur = 0,
|
|
||||||
};
|
|
||||||
var current = [_]u8{ 3, 4, 5 };
|
|
||||||
const got: i32 = e.matchLen(0, -3, ¤t);
|
|
||||||
try expectEqual(@as(i32, 3), got);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
test "best speed match 2/2" {
|
|
||||||
const testing = std.testing;
|
|
||||||
const expectEqual = testing.expectEqual;
|
|
||||||
|
|
||||||
const Case = struct {
|
|
||||||
previous: u32,
|
|
||||||
current: u32,
|
|
||||||
s: i32,
|
|
||||||
t: i32,
|
|
||||||
expected: i32,
|
|
||||||
};
|
|
||||||
|
|
||||||
const cases = [_]Case{
|
|
||||||
.{
|
|
||||||
.previous = 1000,
|
|
||||||
.current = 1000,
|
|
||||||
.s = 0,
|
|
||||||
.t = -1000,
|
|
||||||
.expected = max_match_length - 4,
|
|
||||||
},
|
|
||||||
.{
|
|
||||||
.previous = 200,
|
|
||||||
.s = 0,
|
|
||||||
.t = -200,
|
|
||||||
.current = 500,
|
|
||||||
.expected = max_match_length - 4,
|
|
||||||
},
|
|
||||||
.{
|
|
||||||
.previous = 200,
|
|
||||||
.s = 1,
|
|
||||||
.t = 0,
|
|
||||||
.current = 500,
|
|
||||||
.expected = max_match_length - 4,
|
|
||||||
},
|
|
||||||
.{
|
|
||||||
.previous = max_match_length - 4,
|
|
||||||
.s = 0,
|
|
||||||
.t = -(max_match_length - 4),
|
|
||||||
.current = 500,
|
|
||||||
.expected = max_match_length - 4,
|
|
||||||
},
|
|
||||||
.{
|
|
||||||
.previous = 200,
|
|
||||||
.s = 400,
|
|
||||||
.t = -200,
|
|
||||||
.current = 500,
|
|
||||||
.expected = 100,
|
|
||||||
},
|
|
||||||
.{
|
|
||||||
.previous = 10,
|
|
||||||
.s = 400,
|
|
||||||
.t = 200,
|
|
||||||
.current = 500,
|
|
||||||
.expected = 100,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
for (cases) |c| {
|
|
||||||
const previous = try testing.allocator.alloc(u8, c.previous);
|
|
||||||
defer testing.allocator.free(previous);
|
|
||||||
@memset(previous, 0);
|
|
||||||
|
|
||||||
const current = try testing.allocator.alloc(u8, c.current);
|
|
||||||
defer testing.allocator.free(current);
|
|
||||||
@memset(current, 0);
|
|
||||||
|
|
||||||
var e = DeflateFast{
|
|
||||||
.prev = previous,
|
|
||||||
.prev_len = @as(u32, @intCast(previous.len)),
|
|
||||||
.table = undefined,
|
|
||||||
.allocator = undefined,
|
|
||||||
.cur = 0,
|
|
||||||
};
|
|
||||||
const got: i32 = e.matchLen(c.s, c.t, current);
|
|
||||||
try expectEqual(@as(i32, c.expected), got);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
test "best speed shift offsets" {
|
|
||||||
const testing = std.testing;
|
|
||||||
const expect = std.testing.expect;
|
|
||||||
|
|
||||||
// Test if shiftoffsets properly preserves matches and resets out-of-range matches
|
|
||||||
// seen in https://github.com/golang/go/issues/4142
|
|
||||||
var enc = deflateFast();
|
|
||||||
try enc.init(testing.allocator);
|
|
||||||
defer enc.deinit();
|
|
||||||
|
|
||||||
// test_data may not generate internal matches.
|
|
||||||
var test_data = [32]u8{
|
|
||||||
0xf5, 0x25, 0xf2, 0x55, 0xf6, 0xc1, 0x1f, 0x0b, 0x10, 0xa1,
|
|
||||||
0xd0, 0x77, 0x56, 0x38, 0xf1, 0x9c, 0x7f, 0x85, 0xc5, 0xbd,
|
|
||||||
0x16, 0x28, 0xd4, 0xf9, 0x03, 0xd4, 0xc0, 0xa1, 0x1e, 0x58,
|
|
||||||
0x5b, 0xc9,
|
|
||||||
};
|
|
||||||
|
|
||||||
var tokens = [_]token.Token{0} ** 32;
|
|
||||||
var tokens_count: u16 = 0;
|
|
||||||
|
|
||||||
// Encode the testdata with clean state.
|
|
||||||
// Second part should pick up matches from the first block.
|
|
||||||
tokens_count = 0;
|
|
||||||
enc.encode(&tokens, &tokens_count, &test_data);
|
|
||||||
const want_first_tokens = tokens_count;
|
|
||||||
tokens_count = 0;
|
|
||||||
enc.encode(&tokens, &tokens_count, &test_data);
|
|
||||||
const want_second_tokens = tokens_count;
|
|
||||||
|
|
||||||
try expect(want_first_tokens > want_second_tokens);
|
|
||||||
|
|
||||||
// Forward the current indicator to before wraparound.
|
|
||||||
enc.cur = buffer_reset - @as(i32, @intCast(test_data.len));
|
|
||||||
|
|
||||||
// Part 1 before wrap, should match clean state.
|
|
||||||
tokens_count = 0;
|
|
||||||
enc.encode(&tokens, &tokens_count, &test_data);
|
|
||||||
var got = tokens_count;
|
|
||||||
try testing.expectEqual(want_first_tokens, got);
|
|
||||||
|
|
||||||
// Verify we are about to wrap.
|
|
||||||
try testing.expectEqual(@as(i32, buffer_reset), enc.cur);
|
|
||||||
|
|
||||||
// Part 2 should match clean state as well even if wrapped.
|
|
||||||
tokens_count = 0;
|
|
||||||
enc.encode(&tokens, &tokens_count, &test_data);
|
|
||||||
got = tokens_count;
|
|
||||||
try testing.expectEqual(want_second_tokens, got);
|
|
||||||
|
|
||||||
// Verify that we wrapped.
|
|
||||||
try expect(enc.cur < buffer_reset);
|
|
||||||
|
|
||||||
// Forward the current buffer, leaving the matches at the bottom.
|
|
||||||
enc.cur = buffer_reset;
|
|
||||||
enc.shiftOffsets();
|
|
||||||
|
|
||||||
// Ensure that no matches were picked up.
|
|
||||||
tokens_count = 0;
|
|
||||||
enc.encode(&tokens, &tokens_count, &test_data);
|
|
||||||
got = tokens_count;
|
|
||||||
try testing.expectEqual(want_first_tokens, got);
|
|
||||||
}
|
|
||||||
|
|
||||||
test "best speed reset" {
|
|
||||||
// test that encoding is consistent across a warparound of the table offset.
|
|
||||||
// See https://github.com/golang/go/issues/34121
|
|
||||||
const fmt = std.fmt;
|
|
||||||
const testing = std.testing;
|
|
||||||
|
|
||||||
const ArrayList = std.ArrayList;
|
|
||||||
|
|
||||||
const input_size = 65536;
|
|
||||||
const input = try testing.allocator.alloc(u8, input_size);
|
|
||||||
defer testing.allocator.free(input);
|
|
||||||
|
|
||||||
var i: usize = 0;
|
|
||||||
while (i < input_size) : (i += 1) {
|
|
||||||
_ = try fmt.bufPrint(input, "asdfasdfasdfasdf{d}{d}fghfgujyut{d}yutyu\n", .{ i, i, i });
|
|
||||||
}
|
|
||||||
// This is specific to level 1 (best_speed).
|
|
||||||
const level = .best_speed;
|
|
||||||
const offset: usize = 1;
|
|
||||||
|
|
||||||
// We do an encode with a clean buffer to compare.
|
|
||||||
var want = ArrayList(u8).init(testing.allocator);
|
|
||||||
defer want.deinit();
|
|
||||||
var clean_comp = try deflate.compressor(
|
|
||||||
testing.allocator,
|
|
||||||
want.writer(),
|
|
||||||
.{ .level = level },
|
|
||||||
);
|
|
||||||
defer clean_comp.deinit();
|
|
||||||
|
|
||||||
// Write 3 times, close.
|
|
||||||
try clean_comp.writer().writeAll(input);
|
|
||||||
try clean_comp.writer().writeAll(input);
|
|
||||||
try clean_comp.writer().writeAll(input);
|
|
||||||
try clean_comp.close();
|
|
||||||
|
|
||||||
var o = offset;
|
|
||||||
while (o <= 256) : (o *= 2) {
|
|
||||||
var discard = ArrayList(u8).init(testing.allocator);
|
|
||||||
defer discard.deinit();
|
|
||||||
|
|
||||||
var comp = try deflate.compressor(
|
|
||||||
testing.allocator,
|
|
||||||
discard.writer(),
|
|
||||||
.{ .level = level },
|
|
||||||
);
|
|
||||||
defer comp.deinit();
|
|
||||||
|
|
||||||
// Reset until we are right before the wraparound.
|
|
||||||
// Each reset adds max_match_offset to the offset.
|
|
||||||
i = 0;
|
|
||||||
const limit = (buffer_reset - input.len - o - max_match_offset) / max_match_offset;
|
|
||||||
while (i < limit) : (i += 1) {
|
|
||||||
// skip ahead to where we are close to wrap around...
|
|
||||||
comp.reset(discard.writer());
|
|
||||||
}
|
|
||||||
var got = ArrayList(u8).init(testing.allocator);
|
|
||||||
defer got.deinit();
|
|
||||||
comp.reset(got.writer());
|
|
||||||
|
|
||||||
// Write 3 times, close.
|
|
||||||
try comp.writer().writeAll(input);
|
|
||||||
try comp.writer().writeAll(input);
|
|
||||||
try comp.writer().writeAll(input);
|
|
||||||
try comp.close();
|
|
||||||
|
|
||||||
// output must match at wraparound
|
|
||||||
try testing.expectEqualSlices(u8, want.items, got.items);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@ -1,160 +0,0 @@
|
|||||||
const std = @import("std");
|
const expect = std.testing.expect;
const io = std.io;
const mem = std.mem;
const testing = std.testing;

const ArrayList = std.ArrayList;

const deflate = @import("compressor.zig");
const inflate = @import("decompressor.zig");
const deflate_const = @import("deflate_const.zig");

test "best speed" {
    // Tests that round-tripping through deflate and then inflate recovers the original input.
    // The Write sizes are near the thresholds in the compressor.encSpeed method (0, 16, 128), as well
    // as near `deflate_const.max_store_block_size` (65535).

    var abcabc = try testing.allocator.alloc(u8, 131_072);
    defer testing.allocator.free(abcabc);

    for (abcabc, 0..) |_, i| {
        abcabc[i] = @as(u8, @intCast(i % 128));
    }

    var tc_01 = [_]u32{ 65536, 0 };
    var tc_02 = [_]u32{ 65536, 1 };
    var tc_03 = [_]u32{ 65536, 1, 256 };
    var tc_04 = [_]u32{ 65536, 1, 65536 };
    var tc_05 = [_]u32{ 65536, 14 };
    var tc_06 = [_]u32{ 65536, 15 };
    var tc_07 = [_]u32{ 65536, 16 };
    var tc_08 = [_]u32{ 65536, 16, 256 };
    var tc_09 = [_]u32{ 65536, 16, 65536 };
    var tc_10 = [_]u32{ 65536, 127 };
    var tc_11 = [_]u32{ 65536, 127 };
    var tc_12 = [_]u32{ 65536, 128 };
    var tc_13 = [_]u32{ 65536, 128, 256 };
    var tc_14 = [_]u32{ 65536, 128, 65536 };
    var tc_15 = [_]u32{ 65536, 129 };
    var tc_16 = [_]u32{ 65536, 65536, 256 };
    var tc_17 = [_]u32{ 65536, 65536, 65536 };
    const test_cases = [_][]u32{
        &tc_01, &tc_02, &tc_03, &tc_04, &tc_05, &tc_06, &tc_07, &tc_08, &tc_09, &tc_10,
        &tc_11, &tc_12, &tc_13, &tc_14, &tc_15, &tc_16, &tc_17,
    };

    for (test_cases) |tc| {
        const firsts = [_]u32{ 1, 65534, 65535, 65536, 65537, 131072 };

        for (firsts) |first_n| {
            tc[0] = first_n;

            const to_flush = [_]bool{ false, true };
            for (to_flush) |flush| {
                var compressed = ArrayList(u8).init(testing.allocator);
                defer compressed.deinit();

                var want = ArrayList(u8).init(testing.allocator);
                defer want.deinit();

                var comp = try deflate.compressor(
                    testing.allocator,
                    compressed.writer(),
                    .{ .level = .best_speed },
                );
                defer comp.deinit();

                for (tc) |n| {
                    try want.appendSlice(abcabc[0..n]);
                    try comp.writer().writeAll(abcabc[0..n]);
                    if (flush) {
                        try comp.flush();
                    }
                }

                try comp.close();

                const decompressed = try testing.allocator.alloc(u8, want.items.len);
                defer testing.allocator.free(decompressed);

                var fib = io.fixedBufferStream(compressed.items);
                var decomp = try inflate.decompressor(testing.allocator, fib.reader(), null);
                defer decomp.deinit();

                const read = try decomp.reader().readAll(decompressed);
                try decomp.close();

                try testing.expectEqual(want.items.len, read);
                try testing.expectEqualSlices(u8, want.items, decompressed);
            }
        }
    }
}

test "best speed max match offset" {
    const abc = "abcdefgh";
    const xyz = "stuvwxyz";
    const input_margin = 16 - 1;

    const match_before = [_]bool{ false, true };
    for (match_before) |do_match_before| {
        const extras = [_]u32{
            0,
            input_margin - 1,
            input_margin,
            input_margin + 1,
            2 * input_margin,
        };
        for (extras) |extra| {
            var offset_adj: i32 = -5;
            while (offset_adj <= 5) : (offset_adj += 1) {
                const offset = deflate_const.max_match_offset + offset_adj;

                // Make src to be a []u8 of the form
                //   fmt("{s}{s}{s}{s}{s}", .{abc, zeros0, xyzMaybe, abc, zeros1})
                // where:
                //   zeros0 is approximately max_match_offset zeros.
                //   xyzMaybe is either xyz or the empty string.
                //   zeros1 is between 0 and 30 zeros.
                // The difference between the two abc's will be offset, which
                // is max_match_offset plus or minus a small adjustment.
                const src_len: usize = @as(usize, @intCast(offset + @as(i32, abc.len) + @as(i32, @intCast(extra))));
                var src = try testing.allocator.alloc(u8, src_len);
                defer testing.allocator.free(src);

                @memcpy(src[0..abc.len], abc);
                if (!do_match_before) {
                    const src_offset: usize = @as(usize, @intCast(offset - @as(i32, xyz.len)));
                    @memcpy(src[src_offset..][0..xyz.len], xyz);
                }
                const src_offset: usize = @as(usize, @intCast(offset));
                @memcpy(src[src_offset..][0..abc.len], abc);

                var compressed = ArrayList(u8).init(testing.allocator);
                defer compressed.deinit();

                var comp = try deflate.compressor(
                    testing.allocator,
                    compressed.writer(),
                    .{ .level = .best_speed },
                );
                defer comp.deinit();
                try comp.writer().writeAll(src);
                _ = try comp.close();

                const decompressed = try testing.allocator.alloc(u8, src.len);
                defer testing.allocator.free(decompressed);

                var fib = io.fixedBufferStream(compressed.items);
                var decomp = try inflate.decompressor(testing.allocator, fib.reader(), null);
                defer decomp.deinit();
                const read = try decomp.reader().readAll(decompressed);
                try decomp.close();

                try testing.expectEqual(src.len, read);
                try testing.expectEqualSlices(u8, src, decompressed);
            }
        }
    }
}
@ -1,423 +0,0 @@
const std = @import("std");
const assert = std.debug.assert;
const mem = std.mem;

const Allocator = std.mem.Allocator;

// Implements the LZ77 sliding dictionary as used in decompression.
// LZ77 decompresses data through sequences of two forms of commands:
//
//  * Literal insertions: Runs of one or more symbols are inserted into the data
//    stream as is. This is accomplished through the writeByte method for a
//    single symbol, or combinations of writeSlice/writeMark for multiple symbols.
//    Any valid stream must start with a literal insertion if no preset dictionary
//    is used.
//
//  * Backward copies: Runs of one or more symbols are copied from previously
//    emitted data. Backward copies come as the tuple (dist, length) where dist
//    determines how far back in the stream to copy from and length determines how
//    many bytes to copy. Note that it is valid for the length to be greater than
//    the distance. Since LZ77 uses forward copies, that situation is used to
//    perform a form of run-length encoding on repeated runs of symbols.
//    The writeCopy and tryWriteCopy are used to implement this command.
//
// For performance reasons, this implementation performs little to no sanity
// checks about the arguments. As such, the invariants documented for each
// method call must be respected.
pub const DictDecoder = struct {
    const Self = @This();

    allocator: Allocator = undefined,

    hist: []u8 = undefined, // Sliding window history

    // Invariant: 0 <= rd_pos <= wr_pos <= hist.len
    wr_pos: u32 = 0, // Current output position in buffer
    rd_pos: u32 = 0, // Have emitted hist[0..rd_pos] already
    full: bool = false, // Has a full window length been written yet?

    // init initializes DictDecoder to have a sliding window dictionary of the given
    // size. If a preset dict is provided, it will initialize the dictionary with
    // the contents of dict.
    pub fn init(self: *Self, allocator: Allocator, size: u32, dict: ?[]const u8) !void {
        self.allocator = allocator;

        self.hist = try allocator.alloc(u8, size);

        self.wr_pos = 0;

        if (dict != null) {
            const src = dict.?[dict.?.len -| self.hist.len..];
            @memcpy(self.hist[0..src.len], src);
            self.wr_pos = @as(u32, @intCast(dict.?.len));
        }

        if (self.wr_pos == self.hist.len) {
            self.wr_pos = 0;
            self.full = true;
        }
        self.rd_pos = self.wr_pos;
    }

    pub fn deinit(self: *Self) void {
        self.allocator.free(self.hist);
    }

    // Reports the total amount of historical data in the dictionary.
    pub fn histSize(self: *Self) u32 {
        if (self.full) {
            return @as(u32, @intCast(self.hist.len));
        }
        return self.wr_pos;
    }

    // Reports the number of bytes that can be flushed by readFlush.
    pub fn availRead(self: *Self) u32 {
        return self.wr_pos - self.rd_pos;
    }

    // Reports the available amount of output buffer space.
    pub fn availWrite(self: *Self) u32 {
        return @as(u32, @intCast(self.hist.len - self.wr_pos));
    }

    // Returns a slice of the available buffer to write data to.
    //
    // This invariant will be kept: s.len <= availWrite()
    pub fn writeSlice(self: *Self) []u8 {
        return self.hist[self.wr_pos..];
    }

    // Advances the writer pointer by `count`.
    //
    // This invariant must be kept: 0 <= count <= availWrite()
    pub fn writeMark(self: *Self, count: u32) void {
        assert(0 <= count and count <= self.availWrite());
        self.wr_pos += count;
    }

    // Writes a single byte to the dictionary.
    //
    // This invariant must be kept: 0 < availWrite()
    pub fn writeByte(self: *Self, byte: u8) void {
        self.hist[self.wr_pos] = byte;
        self.wr_pos += 1;
    }

    /// TODO: eliminate this function because the callsites should care about whether
    /// or not their arguments alias and then they should directly call `@memcpy` or
    /// `mem.copyForwards`.
    fn copy(dst: []u8, src: []const u8) u32 {
        if (src.len > dst.len) {
            mem.copyForwards(u8, dst, src[0..dst.len]);
            return @as(u32, @intCast(dst.len));
        }
        mem.copyForwards(u8, dst[0..src.len], src);
        return @as(u32, @intCast(src.len));
    }

    // Copies a string at a given (dist, length) to the output.
    // This returns the number of bytes copied and may be less than the requested
    // length if the available space in the output buffer is too small.
    //
    // This invariant must be kept: 0 < dist <= histSize()
    pub fn writeCopy(self: *Self, dist: u32, length: u32) u32 {
        assert(0 < dist and dist <= self.histSize());
        const dst_base = self.wr_pos;
        var dst_pos = dst_base;
        var src_pos: i32 = @as(i32, @intCast(dst_pos)) - @as(i32, @intCast(dist));
        var end_pos = dst_pos + length;
        if (end_pos > self.hist.len) {
            end_pos = @as(u32, @intCast(self.hist.len));
        }

        // Copy non-overlapping section after destination position.
        //
        // This section is non-overlapping in that the copy length for this section
        // is always less than or equal to the backwards distance. This can occur
        // if a distance refers to data that wraps-around in the buffer.
        // Thus, a backwards copy is performed here; that is, the exact bytes in
        // the source prior to the copy are placed in the destination.
        if (src_pos < 0) {
            src_pos += @as(i32, @intCast(self.hist.len));
            dst_pos += copy(self.hist[dst_pos..end_pos], self.hist[@as(usize, @intCast(src_pos))..]);
            src_pos = 0;
        }

        // Copy possibly overlapping section before destination position.
        //
        // This section can overlap if the copy length for this section is larger
        // than the backwards distance. This is allowed by LZ77 so that repeated
        // strings can be succinctly represented using (dist, length) pairs.
        // Thus, a forwards copy is performed here; that is, the bytes copied are
        // possibly dependent on the resulting bytes in the destination as the copy
        // progresses along. This is functionally equivalent to the following:
        //
        //   var i = 0;
        //   while (i < end_pos - dst_pos) : (i += 1) {
        //       self.hist[dst_pos + i] = self.hist[src_pos + i];
        //   }
        //   dst_pos = end_pos;
        //
        while (dst_pos < end_pos) {
            dst_pos += copy(self.hist[dst_pos..end_pos], self.hist[@as(usize, @intCast(src_pos))..dst_pos]);
        }

        self.wr_pos = dst_pos;
        return dst_pos - dst_base;
    }

    // Tries to copy a string at a given (distance, length) to the
    // output. This specialized version is optimized for short distances.
    //
    // This method is designed to be inlined for performance reasons.
    //
    // This invariant must be kept: 0 < dist <= histSize()
    pub fn tryWriteCopy(self: *Self, dist: u32, length: u32) u32 {
        var dst_pos = self.wr_pos;
        const end_pos = dst_pos + length;
        if (dst_pos < dist or end_pos > self.hist.len) {
            return 0;
        }
        const dst_base = dst_pos;
        const src_pos = dst_pos - dist;

        // Copy possibly overlapping section before destination position.
        while (dst_pos < end_pos) {
            dst_pos += copy(self.hist[dst_pos..end_pos], self.hist[src_pos..dst_pos]);
        }

        self.wr_pos = dst_pos;
        return dst_pos - dst_base;
    }

    // Returns a slice of the historical buffer that is ready to be
    // emitted to the user. The data returned by readFlush must be fully consumed
    // before calling any other DictDecoder methods.
    pub fn readFlush(self: *Self) []u8 {
        const to_read = self.hist[self.rd_pos..self.wr_pos];
        self.rd_pos = self.wr_pos;
        if (self.wr_pos == self.hist.len) {
            self.wr_pos = 0;
            self.rd_pos = 0;
            self.full = true;
        }
        return to_read;
    }
};

// tests

test "dictionary decoder" {
    const ArrayList = std.ArrayList;
    const testing = std.testing;

    const abc = "ABC\n";
    const fox = "The quick brown fox jumped over the lazy dog!\n";
    const poem: []const u8 =
        \\The Road Not Taken
        \\Robert Frost
        \\
        \\Two roads diverged in a yellow wood,
        \\And sorry I could not travel both
        \\And be one traveler, long I stood
        \\And looked down one as far as I could
        \\To where it bent in the undergrowth;
        \\
        \\Then took the other, as just as fair,
        \\And having perhaps the better claim,
        \\Because it was grassy and wanted wear;
        \\Though as for that the passing there
        \\Had worn them really about the same,
        \\
        \\And both that morning equally lay
        \\In leaves no step had trodden black.
        \\Oh, I kept the first for another day!
        \\Yet knowing how way leads on to way,
        \\I doubted if I should ever come back.
        \\
        \\I shall be telling this with a sigh
        \\Somewhere ages and ages hence:
        \\Two roads diverged in a wood, and I-
        \\I took the one less traveled by,
        \\And that has made all the difference.
        \\
    ;

    const uppercase: []const u8 =
        \\THE ROAD NOT TAKEN
        \\ROBERT FROST
        \\
        \\TWO ROADS DIVERGED IN A YELLOW WOOD,
        \\AND SORRY I COULD NOT TRAVEL BOTH
        \\AND BE ONE TRAVELER, LONG I STOOD
        \\AND LOOKED DOWN ONE AS FAR AS I COULD
        \\TO WHERE IT BENT IN THE UNDERGROWTH;
        \\
        \\THEN TOOK THE OTHER, AS JUST AS FAIR,
        \\AND HAVING PERHAPS THE BETTER CLAIM,
        \\BECAUSE IT WAS GRASSY AND WANTED WEAR;
        \\THOUGH AS FOR THAT THE PASSING THERE
        \\HAD WORN THEM REALLY ABOUT THE SAME,
        \\
        \\AND BOTH THAT MORNING EQUALLY LAY
        \\IN LEAVES NO STEP HAD TRODDEN BLACK.
        \\OH, I KEPT THE FIRST FOR ANOTHER DAY!
        \\YET KNOWING HOW WAY LEADS ON TO WAY,
        \\I DOUBTED IF I SHOULD EVER COME BACK.
        \\
        \\I SHALL BE TELLING THIS WITH A SIGH
        \\SOMEWHERE AGES AND AGES HENCE:
        \\TWO ROADS DIVERGED IN A WOOD, AND I-
        \\I TOOK THE ONE LESS TRAVELED BY,
        \\AND THAT HAS MADE ALL THE DIFFERENCE.
        \\
    ;

    const PoemRefs = struct {
        dist: u32, // Backward distance (0 if this is an insertion)
        length: u32, // Length of copy or insertion
    };

    const poem_refs = [_]PoemRefs{
        .{ .dist = 0, .length = 38 }, .{ .dist = 33, .length = 3 }, .{ .dist = 0, .length = 48 },
        .{ .dist = 79, .length = 3 }, .{ .dist = 0, .length = 11 }, .{ .dist = 34, .length = 5 },
        .{ .dist = 0, .length = 6 }, .{ .dist = 23, .length = 7 }, .{ .dist = 0, .length = 8 },
        .{ .dist = 50, .length = 3 }, .{ .dist = 0, .length = 2 }, .{ .dist = 69, .length = 3 },
        .{ .dist = 34, .length = 5 }, .{ .dist = 0, .length = 4 }, .{ .dist = 97, .length = 3 },
        .{ .dist = 0, .length = 4 }, .{ .dist = 43, .length = 5 }, .{ .dist = 0, .length = 6 },
        .{ .dist = 7, .length = 4 }, .{ .dist = 88, .length = 7 }, .{ .dist = 0, .length = 12 },
        .{ .dist = 80, .length = 3 }, .{ .dist = 0, .length = 2 }, .{ .dist = 141, .length = 4 },
        .{ .dist = 0, .length = 1 }, .{ .dist = 196, .length = 3 }, .{ .dist = 0, .length = 3 },
        .{ .dist = 157, .length = 3 }, .{ .dist = 0, .length = 6 }, .{ .dist = 181, .length = 3 },
        .{ .dist = 0, .length = 2 }, .{ .dist = 23, .length = 3 }, .{ .dist = 77, .length = 3 },
        .{ .dist = 28, .length = 5 }, .{ .dist = 128, .length = 3 }, .{ .dist = 110, .length = 4 },
        .{ .dist = 70, .length = 3 }, .{ .dist = 0, .length = 4 }, .{ .dist = 85, .length = 6 },
        .{ .dist = 0, .length = 2 }, .{ .dist = 182, .length = 6 }, .{ .dist = 0, .length = 4 },
        .{ .dist = 133, .length = 3 }, .{ .dist = 0, .length = 7 }, .{ .dist = 47, .length = 5 },
        .{ .dist = 0, .length = 20 }, .{ .dist = 112, .length = 5 }, .{ .dist = 0, .length = 1 },
        .{ .dist = 58, .length = 3 }, .{ .dist = 0, .length = 8 }, .{ .dist = 59, .length = 3 },
        .{ .dist = 0, .length = 4 }, .{ .dist = 173, .length = 3 }, .{ .dist = 0, .length = 5 },
        .{ .dist = 114, .length = 3 }, .{ .dist = 0, .length = 4 }, .{ .dist = 92, .length = 5 },
        .{ .dist = 0, .length = 2 }, .{ .dist = 71, .length = 3 }, .{ .dist = 0, .length = 2 },
        .{ .dist = 76, .length = 5 }, .{ .dist = 0, .length = 1 }, .{ .dist = 46, .length = 3 },
        .{ .dist = 96, .length = 4 }, .{ .dist = 130, .length = 4 }, .{ .dist = 0, .length = 3 },
        .{ .dist = 360, .length = 3 }, .{ .dist = 0, .length = 3 }, .{ .dist = 178, .length = 5 },
        .{ .dist = 0, .length = 7 }, .{ .dist = 75, .length = 3 }, .{ .dist = 0, .length = 3 },
        .{ .dist = 45, .length = 6 }, .{ .dist = 0, .length = 6 }, .{ .dist = 299, .length = 6 },
        .{ .dist = 180, .length = 3 }, .{ .dist = 70, .length = 6 }, .{ .dist = 0, .length = 1 },
        .{ .dist = 48, .length = 3 }, .{ .dist = 66, .length = 4 }, .{ .dist = 0, .length = 3 },
        .{ .dist = 47, .length = 5 }, .{ .dist = 0, .length = 9 }, .{ .dist = 325, .length = 3 },
        .{ .dist = 0, .length = 1 }, .{ .dist = 359, .length = 3 }, .{ .dist = 318, .length = 3 },
        .{ .dist = 0, .length = 2 }, .{ .dist = 199, .length = 3 }, .{ .dist = 0, .length = 1 },
        .{ .dist = 344, .length = 3 }, .{ .dist = 0, .length = 3 }, .{ .dist = 248, .length = 3 },
        .{ .dist = 0, .length = 10 }, .{ .dist = 310, .length = 3 }, .{ .dist = 0, .length = 3 },
        .{ .dist = 93, .length = 6 }, .{ .dist = 0, .length = 3 }, .{ .dist = 252, .length = 3 },
        .{ .dist = 157, .length = 4 }, .{ .dist = 0, .length = 2 }, .{ .dist = 273, .length = 5 },
        .{ .dist = 0, .length = 14 }, .{ .dist = 99, .length = 4 }, .{ .dist = 0, .length = 1 },
        .{ .dist = 464, .length = 4 }, .{ .dist = 0, .length = 2 }, .{ .dist = 92, .length = 4 },
        .{ .dist = 495, .length = 3 }, .{ .dist = 0, .length = 1 }, .{ .dist = 322, .length = 4 },
        .{ .dist = 16, .length = 4 }, .{ .dist = 0, .length = 3 }, .{ .dist = 402, .length = 3 },
        .{ .dist = 0, .length = 2 }, .{ .dist = 237, .length = 4 }, .{ .dist = 0, .length = 2 },
        .{ .dist = 432, .length = 4 }, .{ .dist = 0, .length = 1 }, .{ .dist = 483, .length = 5 },
        .{ .dist = 0, .length = 2 }, .{ .dist = 294, .length = 4 }, .{ .dist = 0, .length = 2 },
        .{ .dist = 306, .length = 3 }, .{ .dist = 113, .length = 5 }, .{ .dist = 0, .length = 1 },
        .{ .dist = 26, .length = 4 }, .{ .dist = 164, .length = 3 }, .{ .dist = 488, .length = 4 },
        .{ .dist = 0, .length = 1 }, .{ .dist = 542, .length = 3 }, .{ .dist = 248, .length = 6 },
        .{ .dist = 0, .length = 5 }, .{ .dist = 205, .length = 3 }, .{ .dist = 0, .length = 8 },
        .{ .dist = 48, .length = 3 }, .{ .dist = 449, .length = 6 }, .{ .dist = 0, .length = 2 },
        .{ .dist = 192, .length = 3 }, .{ .dist = 328, .length = 4 }, .{ .dist = 9, .length = 5 },
        .{ .dist = 433, .length = 3 }, .{ .dist = 0, .length = 3 }, .{ .dist = 622, .length = 25 },
        .{ .dist = 615, .length = 5 }, .{ .dist = 46, .length = 5 }, .{ .dist = 0, .length = 2 },
        .{ .dist = 104, .length = 3 }, .{ .dist = 475, .length = 10 }, .{ .dist = 549, .length = 3 },
        .{ .dist = 0, .length = 4 }, .{ .dist = 597, .length = 8 }, .{ .dist = 314, .length = 3 },
        .{ .dist = 0, .length = 1 }, .{ .dist = 473, .length = 6 }, .{ .dist = 317, .length = 5 },
        .{ .dist = 0, .length = 1 }, .{ .dist = 400, .length = 3 }, .{ .dist = 0, .length = 3 },
        .{ .dist = 109, .length = 3 }, .{ .dist = 151, .length = 3 }, .{ .dist = 48, .length = 4 },
        .{ .dist = 0, .length = 4 }, .{ .dist = 125, .length = 3 }, .{ .dist = 108, .length = 3 },
        .{ .dist = 0, .length = 2 },
    };

    var got_list = ArrayList(u8).init(testing.allocator);
    defer got_list.deinit();
    var got = got_list.writer();

    var want_list = ArrayList(u8).init(testing.allocator);
    defer want_list.deinit();
    var want = want_list.writer();

    var dd = DictDecoder{};
    try dd.init(testing.allocator, 1 << 11, null);
    defer dd.deinit();

    const util = struct {
        fn writeCopy(dst_dd: *DictDecoder, dst: anytype, dist: u32, length: u32) !void {
            var len = length;
            while (len > 0) {
                var n = dst_dd.tryWriteCopy(dist, len);
                if (n == 0) {
                    n = dst_dd.writeCopy(dist, len);
                }

                len -= n;
                if (dst_dd.availWrite() == 0) {
                    _ = try dst.write(dst_dd.readFlush());
                }
            }
        }
        fn writeString(dst_dd: *DictDecoder, dst: anytype, str: []const u8) !void {
            var string = str;
            while (string.len > 0) {
                const cnt = DictDecoder.copy(dst_dd.writeSlice(), string);
                dst_dd.writeMark(cnt);
                string = string[cnt..];
                if (dst_dd.availWrite() == 0) {
                    _ = try dst.write(dst_dd.readFlush());
                }
            }
        }
    };

    try util.writeString(&dd, got, ".");
    _ = try want.write(".");

    var str = poem;
    for (poem_refs, 0..) |ref, i| {
        _ = i;
        if (ref.dist == 0) {
            try util.writeString(&dd, got, str[0..ref.length]);
        } else {
            try util.writeCopy(&dd, got, ref.dist, ref.length);
        }
        str = str[ref.length..];
    }
    _ = try want.write(poem);

    try util.writeCopy(&dd, got, dd.histSize(), 33);
    _ = try want.write(want_list.items[0..33]);

    try util.writeString(&dd, got, abc);
    try util.writeCopy(&dd, got, abc.len, 59 * abc.len);
    _ = try want.write(abc ** 60);

    try util.writeString(&dd, got, fox);
    try util.writeCopy(&dd, got, fox.len, 9 * fox.len);
    _ = try want.write(fox ** 10);

    try util.writeString(&dd, got, ".");
    try util.writeCopy(&dd, got, 1, 9);
    _ = try want.write("." ** 10);

    try util.writeString(&dd, got, uppercase);
    try util.writeCopy(&dd, got, uppercase.len, 7 * uppercase.len);
    var i: u8 = 0;
    while (i < 8) : (i += 1) {
        _ = try want.write(uppercase);
    }

    try util.writeCopy(&dd, got, dd.histSize(), 10);
    _ = try want.write(want_list.items[want_list.items.len - dd.histSize() ..][0..10]);

    _ = try got.write(dd.readFlush());
    try testing.expectEqualSlices(u8, want_list.items, got_list.items);
}
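
// A minimal usage sketch (not part of the original file) of the API above, assuming a
// tiny 16-byte window: a single literal byte followed by a (dist = 1, length = 7) copy
// run-length-expands into eight identical bytes, because writeCopy copies forward
// through bytes it has just written.
test "writeCopy run-length expansion (illustrative sketch)" {
    var dd = DictDecoder{};
    try dd.init(std.testing.allocator, 16, null);
    defer dd.deinit();

    dd.writeByte('x'); // literal insertion
    _ = dd.writeCopy(1, 7); // copy with length > dist acts as run-length encoding

    try std.testing.expectEqualSlices(u8, "xxxxxxxx", dd.readFlush());
}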
File diff suppressed because it is too large
@ -1,432 +0,0 @@
const std = @import("std");
const assert = std.debug.assert;
const math = std.math;
const mem = std.mem;
const sort = std.sort;
const testing = std.testing;

const Allocator = std.mem.Allocator;

const bu = @import("bits_utils.zig");
const deflate_const = @import("deflate_const.zig");

const max_bits_limit = 16;

const LiteralNode = struct {
    literal: u16,
    freq: u16,
};

// Describes the state of the constructed tree for a given depth.
const LevelInfo = struct {
    // Our level, for better printing.
    level: u32,

    // The frequency of the last node at this level
    last_freq: u32,

    // The frequency of the next character to add to this level
    next_char_freq: u32,

    // The frequency of the next pair (from level below) to add to this level.
    // Only valid if the "needed" value of the next lower level is 0.
    next_pair_freq: u32,

    // The number of chains remaining to generate for this level before moving
    // up to the next level
    needed: u32,
};

// hcode is a huffman code with a bit code and bit length.
pub const HuffCode = struct {
    code: u16 = 0,
    len: u16 = 0,

    // set sets the code and length of an hcode.
    fn set(self: *HuffCode, code: u16, length: u16) void {
        self.len = length;
        self.code = code;
    }
};

pub const HuffmanEncoder = struct {
    codes: []HuffCode,
    freq_cache: []LiteralNode = undefined,
    bit_count: [17]u32 = undefined,
    lns: []LiteralNode = undefined, // sorted by literal, stored to avoid repeated allocation in generate
    lfs: []LiteralNode = undefined, // sorted by frequency, stored to avoid repeated allocation in generate
    allocator: Allocator,

    pub fn deinit(self: *HuffmanEncoder) void {
        self.allocator.free(self.codes);
        self.allocator.free(self.freq_cache);
    }

    // Update this Huffman Code object to be the minimum code for the specified frequency count.
    //
    // freq: An array of frequencies, in which frequency[i] gives the frequency of literal i.
    // max_bits: The maximum number of bits to use for any literal.
    pub fn generate(self: *HuffmanEncoder, freq: []u16, max_bits: u32) void {
        var list = self.freq_cache[0 .. freq.len + 1];
        // Number of non-zero literals
        var count: u32 = 0;
        // Set list to be the set of all non-zero literals and their frequencies
        for (freq, 0..) |f, i| {
            if (f != 0) {
                list[count] = LiteralNode{ .literal = @as(u16, @intCast(i)), .freq = f };
                count += 1;
            } else {
                list[count] = LiteralNode{ .literal = 0x00, .freq = 0 };
                self.codes[i].len = 0;
            }
        }
        list[freq.len] = LiteralNode{ .literal = 0x00, .freq = 0 };

        list = list[0..count];
        if (count <= 2) {
            // Handle the small cases here, because they are awkward for the general case code. With
            // two or fewer literals, everything has bit length 1.
            for (list, 0..) |node, i| {
                // "list" is in order of increasing literal value.
                self.codes[node.literal].set(@as(u16, @intCast(i)), 1);
            }
            return;
        }
        self.lfs = list;
        mem.sort(LiteralNode, self.lfs, {}, byFreq);

        // Get the number of literals for each bit count
        const bit_count = self.bitCounts(list, max_bits);
        // And do the assignment
        self.assignEncodingAndSize(bit_count, list);
    }

    pub fn bitLength(self: *HuffmanEncoder, freq: []u16) u32 {
        var total: u32 = 0;
        for (freq, 0..) |f, i| {
            if (f != 0) {
                total += @as(u32, @intCast(f)) * @as(u32, @intCast(self.codes[i].len));
            }
        }
        return total;
    }

    // Return the number of literals assigned to each bit size in the Huffman encoding
    //
    // This method is only called when list.len >= 3
    // The cases of 0, 1, and 2 literals are handled by special case code.
    //
    // list: An array of the literals with non-zero frequencies
    // and their associated frequencies. The array is in order of increasing
    // frequency, and has as its last element a special element with frequency
    // std.math.maxInt(i32)
    //
    // max_bits: The maximum number of bits that should be used to encode any literal.
    // Must be less than 16.
    //
    // Returns an integer array in which array[i] indicates the number of literals
    // that should be encoded in i bits.
    fn bitCounts(self: *HuffmanEncoder, list: []LiteralNode, max_bits_to_use: usize) []u32 {
        var max_bits = max_bits_to_use;
        const n = list.len;

        assert(max_bits < max_bits_limit);

        // The tree can't have greater depth than n - 1, no matter what. This
        // saves a little bit of work in some small cases
        max_bits = @min(max_bits, n - 1);

        // Create information about each of the levels.
        // A bogus "Level 0" whose sole purpose is so that
        // level1.prev.needed == 0. This makes level1.next_pair_freq
        // be a legitimate value that never gets chosen.
        var levels: [max_bits_limit]LevelInfo = mem.zeroes([max_bits_limit]LevelInfo);
        // leaf_counts[i] counts the number of literals at the left
        // of ancestors of the rightmost node at level i.
        // leaf_counts[i][j] is the number of literals at the left
        // of the level j ancestor.
        var leaf_counts: [max_bits_limit][max_bits_limit]u32 = mem.zeroes([max_bits_limit][max_bits_limit]u32);

        {
            var level = @as(u32, 1);
            while (level <= max_bits) : (level += 1) {
                // For every level, the first two items are the first two characters.
                // We initialize the levels as if we had already figured this out.
                levels[level] = LevelInfo{
                    .level = level,
                    .last_freq = list[1].freq,
                    .next_char_freq = list[2].freq,
                    .next_pair_freq = list[0].freq + list[1].freq,
                    .needed = 0,
                };
                leaf_counts[level][level] = 2;
                if (level == 1) {
                    levels[level].next_pair_freq = math.maxInt(i32);
                }
            }
        }

        // We need a total of 2*n - 2 items at top level and have already generated 2.
        levels[max_bits].needed = 2 * @as(u32, @intCast(n)) - 4;

        {
            var level = max_bits;
            while (true) {
                var l = &levels[level];
                if (l.next_pair_freq == math.maxInt(i32) and l.next_char_freq == math.maxInt(i32)) {
                    // We've run out of both leafs and pairs.
                    // End all calculations for this level.
                    // To make sure we never come back to this level or any lower level,
                    // set next_pair_freq impossibly large.
                    l.needed = 0;
                    levels[level + 1].next_pair_freq = math.maxInt(i32);
                    level += 1;
                    continue;
                }

                const prev_freq = l.last_freq;
                if (l.next_char_freq < l.next_pair_freq) {
                    // The next item on this row is a leaf node.
                    const next = leaf_counts[level][level] + 1;
                    l.last_freq = l.next_char_freq;
                    // Lower leaf_counts are the same as the previous node.
                    leaf_counts[level][level] = next;
                    if (next >= list.len) {
                        l.next_char_freq = maxNode().freq;
                    } else {
                        l.next_char_freq = list[next].freq;
                    }
                } else {
                    // The next item on this row is a pair from the previous row.
                    // next_pair_freq isn't valid until we generate two
                    // more values in the level below
                    l.last_freq = l.next_pair_freq;
                    // Take leaf counts from the lower level, except counts[level] remains the same.
                    @memcpy(leaf_counts[level][0..level], leaf_counts[level - 1][0..level]);
                    levels[l.level - 1].needed = 2;
                }

                l.needed -= 1;
                if (l.needed == 0) {
                    // We've done everything we need to do for this level.
                    // Continue calculating one level up. Fill in next_pair_freq
                    // of that level with the sum of the two nodes we've just calculated on
                    // this level.
                    if (l.level == max_bits) {
                        // All done!
                        break;
                    }
                    levels[l.level + 1].next_pair_freq = prev_freq + l.last_freq;
                    level += 1;
                } else {
                    // If we stole from below, move down temporarily to replenish it.
                    while (levels[level - 1].needed > 0) {
                        level -= 1;
                        if (level == 0) {
                            break;
                        }
                    }
                }
            }
        }

        // Something is wrong if, at the end, the top level is null or hasn't used
        // all of the leaves.
        assert(leaf_counts[max_bits][max_bits] == n);

        var bit_count = self.bit_count[0 .. max_bits + 1];
        var bits: u32 = 1;
        const counts = &leaf_counts[max_bits];
        {
            var level = max_bits;
            while (level > 0) : (level -= 1) {
                // counts[level] gives the number of literals requiring at least "bits"
                // bits to encode.
                bit_count[bits] = counts[level] - counts[level - 1];
                bits += 1;
                if (level == 0) {
                    break;
                }
            }
        }
        return bit_count;
    }

    // Look at the leaves and assign them a bit count and an encoding as specified
    // in RFC 1951 3.2.2
    fn assignEncodingAndSize(self: *HuffmanEncoder, bit_count: []u32, list_arg: []LiteralNode) void {
        var code = @as(u16, 0);
        var list = list_arg;

        for (bit_count, 0..) |bits, n| {
            code <<= 1;
            if (n == 0 or bits == 0) {
                continue;
            }
            // The literals list[list.len-bits] .. list[list.len-bits]
            // are encoded using "bits" bits, and get the values
            // code, code + 1, .... The code values are
            // assigned in literal order (not frequency order).
            const chunk = list[list.len - @as(u32, @intCast(bits)) ..];

            self.lns = chunk;
            mem.sort(LiteralNode, self.lns, {}, byLiteral);

            for (chunk) |node| {
                self.codes[node.literal] = HuffCode{
                    .code = bu.bitReverse(u16, code, @as(u5, @intCast(n))),
                    .len = @as(u16, @intCast(n)),
                };
                code += 1;
            }
            list = list[0 .. list.len - @as(u32, @intCast(bits))];
        }
    }
};

fn maxNode() LiteralNode {
    return LiteralNode{
        .literal = math.maxInt(u16),
        .freq = math.maxInt(u16),
    };
}

pub fn newHuffmanEncoder(allocator: Allocator, size: u32) !HuffmanEncoder {
    return HuffmanEncoder{
        .codes = try allocator.alloc(HuffCode, size),
        // Allocate a reusable buffer with the longest possible frequency table.
        // (deflate_const.max_num_frequencies).
        .freq_cache = try allocator.alloc(LiteralNode, deflate_const.max_num_frequencies + 1),
        .allocator = allocator,
    };
}

// Generates a HuffmanCode corresponding to the fixed literal table
pub fn generateFixedLiteralEncoding(allocator: Allocator) !HuffmanEncoder {
    const h = try newHuffmanEncoder(allocator, deflate_const.max_num_frequencies);
    var codes = h.codes;
    var ch: u16 = 0;

    while (ch < deflate_const.max_num_frequencies) : (ch += 1) {
        var bits: u16 = undefined;
        var size: u16 = undefined;
        switch (ch) {
            0...143 => {
                // size 8, 00110000 .. 10111111
                bits = ch + 48;
                size = 8;
            },
            144...255 => {
                // size 9, 110010000 .. 111111111
                bits = ch + 400 - 144;
                size = 9;
            },
            256...279 => {
                // size 7, 0000000 .. 0010111
                bits = ch - 256;
                size = 7;
            },
            else => {
                // size 8, 11000000 .. 11000111
                bits = ch + 192 - 280;
                size = 8;
            },
        }
        codes[ch] = HuffCode{ .code = bu.bitReverse(u16, bits, @as(u5, @intCast(size))), .len = size };
    }
    return h;
}

pub fn generateFixedOffsetEncoding(allocator: Allocator) !HuffmanEncoder {
    const h = try newHuffmanEncoder(allocator, 30);
    var codes = h.codes;
    for (codes, 0..) |_, ch| {
        codes[ch] = HuffCode{ .code = bu.bitReverse(u16, @as(u16, @intCast(ch)), 5), .len = 5 };
    }
    return h;
}

fn byLiteral(context: void, a: LiteralNode, b: LiteralNode) bool {
    _ = context;
    return a.literal < b.literal;
}

fn byFreq(context: void, a: LiteralNode, b: LiteralNode) bool {
    _ = context;
    if (a.freq == b.freq) {
        return a.literal < b.literal;
    }
    return a.freq < b.freq;
}

test "generate a Huffman code from an array of frequencies" {
    var freqs: [19]u16 = [_]u16{
        8, // 0
        1, // 1
        1, // 2
        2, // 3
        5, // 4
        10, // 5
        9, // 6
        1, // 7
        0, // 8
        0, // 9
        0, // 10
        0, // 11
        0, // 12
        0, // 13
        0, // 14
        0, // 15
        1, // 16
        3, // 17
        5, // 18
    };

    var enc = try newHuffmanEncoder(testing.allocator, freqs.len);
    defer enc.deinit();
    enc.generate(freqs[0..], 7);

    try testing.expectEqual(@as(u32, 141), enc.bitLength(freqs[0..]));

    try testing.expectEqual(@as(usize, 3), enc.codes[0].len);
    try testing.expectEqual(@as(usize, 6), enc.codes[1].len);
    try testing.expectEqual(@as(usize, 6), enc.codes[2].len);
    try testing.expectEqual(@as(usize, 5), enc.codes[3].len);
    try testing.expectEqual(@as(usize, 3), enc.codes[4].len);
    try testing.expectEqual(@as(usize, 2), enc.codes[5].len);
    try testing.expectEqual(@as(usize, 2), enc.codes[6].len);
    try testing.expectEqual(@as(usize, 6), enc.codes[7].len);
    try testing.expectEqual(@as(usize, 0), enc.codes[8].len);
    try testing.expectEqual(@as(usize, 0), enc.codes[9].len);
    try testing.expectEqual(@as(usize, 0), enc.codes[10].len);
    try testing.expectEqual(@as(usize, 0), enc.codes[11].len);
    try testing.expectEqual(@as(usize, 0), enc.codes[12].len);
    try testing.expectEqual(@as(usize, 0), enc.codes[13].len);
    try testing.expectEqual(@as(usize, 0), enc.codes[14].len);
    try testing.expectEqual(@as(usize, 0), enc.codes[15].len);
    try testing.expectEqual(@as(usize, 6), enc.codes[16].len);
    try testing.expectEqual(@as(usize, 5), enc.codes[17].len);
    try testing.expectEqual(@as(usize, 3), enc.codes[18].len);

    try testing.expectEqual(@as(u16, 0x0), enc.codes[5].code);
    try testing.expectEqual(@as(u16, 0x2), enc.codes[6].code);
    try testing.expectEqual(@as(u16, 0x1), enc.codes[0].code);
    try testing.expectEqual(@as(u16, 0x5), enc.codes[4].code);
    try testing.expectEqual(@as(u16, 0x3), enc.codes[18].code);
    try testing.expectEqual(@as(u16, 0x7), enc.codes[3].code);
    try testing.expectEqual(@as(u16, 0x17), enc.codes[17].code);
    try testing.expectEqual(@as(u16, 0x0f), enc.codes[1].code);
    try testing.expectEqual(@as(u16, 0x2f), enc.codes[2].code);
    try testing.expectEqual(@as(u16, 0x1f), enc.codes[7].code);
    try testing.expectEqual(@as(u16, 0x3f), enc.codes[16].code);
}

test "generate a Huffman code for the fixed literal table specific to Deflate" {
    var enc = try generateFixedLiteralEncoding(testing.allocator);
    defer enc.deinit();
}

test "generate a Huffman code for the 30 possible relative offsets (LZ77 distances) of Deflate" {
    var enc = try generateFixedOffsetEncoding(testing.allocator);
    defer enc.deinit();
}
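
// A compact sketch (not part of the original file) of the canonical code construction
// from RFC 1951, section 3.2.2, which assignEncodingAndSize above applies one
// bit-length chunk at a time: count the codes of each length, derive the first code
// of each length, then hand out consecutive codes in symbol order. Note that the
// encoder above additionally stores each code bit-reversed (bu.bitReverse) because
// deflate emits Huffman codes LSB-first.
fn canonicalCodes(lens: []const u4, codes: []u16) void {
    // Step 1: count how many symbols use each code length (bl_count[0] stays 0).
    var bl_count = [_]u16{0} ** 16;
    for (lens) |l| {
        if (l != 0) bl_count[l] += 1;
    }
    // Step 2: compute the smallest code for each code length.
    var next_code = [_]u16{0} ** 16;
    var code: u16 = 0;
    var bits: usize = 1;
    while (bits < 16) : (bits += 1) {
        code = (code + bl_count[bits - 1]) << 1;
        next_code[bits] = code;
    }
    // Step 3: assign consecutive codes to symbols of the same length, in symbol order.
    for (lens, 0..) |l, i| {
        if (l != 0) {
            codes[i] = next_code[l];
            next_code[l] += 1;
        }
    }
}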
File diff suppressed because one or more lines are too long
@ -1,29 +0,0 @@
Four score and seven years ago our fathers brought forth on
this continent, a new nation, conceived in Liberty, and dedicated
to the proposition that all men are created equal.
Now we are engaged in a great Civil War, testing whether that
nation, or any nation so conceived and so dedicated, can long
endure.
We are met on a great battle-field of that war.
We have come to dedicate a portion of that field, as a final
resting place for those who here gave their lives that that
nation might live. It is altogether fitting and proper that
we should do this.
But, in a larger sense, we can not dedicate - we can not
consecrate - we can not hallow - this ground.
The brave men, living and dead, who struggled here, have
consecrated it, far above our poor power to add or detract.
The world will little note, nor long remember what we say here,
but it can never forget what they did here.
It is for us the living, rather, to be dedicated here to the
unfinished work which they who fought here have thus far so
nobly advanced. It is rather for us to be here dedicated to
the great task remaining before us - that from these honored
dead we take increased devotion to that cause for which they
gave the last full measure of devotion -
that we here highly resolve that these dead shall not have
died in vain - that this nation, under God, shall have a new
birth of freedom - and that government of the people, by the
people, for the people, shall not perish from this earth.

Abraham Lincoln, November 19, 1863, Gettysburg, Pennsylvania
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN lib/std/compress/deflate/testdata/huffman-pi.golden vendored
Binary file not shown.
@ -1 +0,0 @@
3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679821480865132823066470938446095505822317253594081284811174502841027019385211055596446229489549303819644288109756659334461284756482337867831652712019091456485669234603486104543266482133936072602491412737245870066063155881748815209209628292540917153643678925903600113305305488204665213841469519415116094330572703657595919530921861173819326117931051185480744623799627495673518857527248912279381830119491298336733624406566430860213949463952247371907021798609437027705392171762931767523846748184676694051320005681271452635608277857713427577896091736371787214684409012249534301465495853710507922796892589235420199561121290219608640344181598136297747713099605187072113499999983729780499510597317328160963185950244594553469083026425223082533446850352619311881710100031378387528865875332083814206171776691473035982534904287554687311595628638823537875937519577818577805321712268066130019278766111959092164201989380952572010654858632788659361533818279682303019520353018529689957736225994138912497217752834791315155748572424541506959508295331168617278558890750983817546374649393192550604009277016711390098488240128583616035637076601047101819429555961989467678374494482553797747268471040475346462080466842590694912933136770289891521047521620569660240580381501935112533824300355876402474964732639141992726042699227967823547816360093417216412199245863150302861829745557067498385054945885869269956909272107975093029553211653449872027559602364806654991198818347977535663698074265425278625518184175746728909777727938000816470600161452491921732172147723501414419735685481613611573525521334757418494684385233239073941433345477624168625189835694855620992192221842725502542568876717904946016534668049886272327917860857843838279679766814541009538837863609506800642251252051173929848960841284886269456042419652850222106611863067442786220391949450471237137869609563643719172874677646575739624138908658326459958133904780275900994657640789512694683983525957098258226205224894077267194782684826014769909026401363944374553050682034962524517493996514314298091906592509372216964615157098583874105978859597729754989301617539284681382686838689427741559918559252459539594310499725246808459872736446958486538367362226260991246080512438843904512441365497627807977156914359977001296160894416948685558484063534220722258284886481584560285060168427394522674676788952521385225499546667278239864565961163548862305774564980355936345681743241125150760694794510965960940252288797108931456691368672287489405601015033086179286809208747609178249385890097149096759852613655497818931297848216829989487226588048575640142704775551323796414515237462343645428584447952658678210511413547357395231134271661021359695362314429524849371871101457654035902799344037420073105785390621983874478084784896833214457138687519435064302184531910484810053706146806749192781911979399520614196634287544406437451237181921799983910159195618146751426912397489409071864942319615679452080951465502252316038819301420937621378559566389377870830390697920773467221825625996615014215030680384477345492026054146659252014974428507325186660021324340881907104863317346496514539057962685610055081066587969981635747363840525714591028970641401109712062804390397595156771577004203378699360072305587631763594218731251471205329281918261861258673215791984148488291644706095752706957220917567116722910981690915280173506712748583222871835209353965725121083579151369882091444210067510334671103141267111369908658516398315019701651511685171437657618351
55650884909989859982387345528331635507647918535893226185489632132933089857064204675259070915481416549859461637180
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,4 +0,0 @@
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
ř‹–vH
…”%€ŻÂţŤč ë†É·ĹŢę}‹ç>Úß˙lsŢĚçmŤIGH°čžň1YŢ4´[ĺŕ 0Â<30>[|]o#©
Ľ-#ľŮíul™ßýpfćîٱžn<C5BE>YŐÔ€Y<E282AC>w‰C8ÉŻ02š F=gn×ržN!OĆŕÔ{ŤĄö›kÜ*“w(ý´bÚ ç«kQC9/ ’lu>ô5ýC.÷¤uÚę›
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,2 +0,0 @@
101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101
010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010
232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,14 +0,0 @@
//Copyright2009ThGoAuthor.Allrightrrvd.
//UofthiourccodigovrndbyBSD-tyl
//licnthtcnbfoundinthLICENSEfil.

pckgmin

import"o"

funcmin(){
vrb=mk([]byt,65535)
f,_:=o.Crt("huffmn-null-mx.in")
f.Writ(b)
}
ABCDEFGHIJKLMNOPQRSTUVXxyz!"#¤%&/?"
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,14 +0,0 @@
// zig v0.10.0
// create a file filled with 0x00
const std = @import("std");

pub fn main() !void {
    var b = [1]u8{0} ** 65535;
    const f = try std.fs.cwd().createFile(
        "huffman-null-max.in",
        .{ .read = true },
    );
    defer f.close();

    _ = try f.writeAll(b[0..]);
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1 +0,0 @@
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
955 lib/std/compress/deflate/testdata/rfc1951.txt vendored
@ -1,955 +0,0 @@
[The deleted file is the vendored copy of RFC 1951, "DEFLATE Compressed Data Format Specification version 1.3" (P. Deutsch, May 1996), 955 lines; the full text is not reproduced here.]
@ -1,103 +0,0 @@
|
|||||||
// 2 bits: type, can be 0 (literal), 1 (EOF), 2 (Match) or 3 (Unused).
|
|
||||||
// 8 bits: xlength (length - MIN_MATCH_LENGTH).
|
|
||||||
// 22 bits: xoffset (offset - MIN_OFFSET_SIZE), or literal.
|
|
||||||
const length_shift = 22;
|
|
||||||
const offset_mask = (1 << length_shift) - 1; // 4_194_303
|
|
||||||
const literal_type = 0 << 30; // 0
|
|
||||||
pub const match_type = 1 << 30; // 1_073_741_824
|
|
||||||
|
|
||||||
// The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
|
|
||||||
// is length_codes[length - MIN_MATCH_LENGTH]
|
|
||||||
var length_codes = [_]u32{
|
|
||||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
|
|
||||||
9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
|
|
||||||
13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
|
|
||||||
15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
|
|
||||||
17, 17, 17, 17, 17, 17, 17, 17, 18, 18,
|
|
||||||
18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
|
|
||||||
19, 19, 19, 19, 20, 20, 20, 20, 20, 20,
|
|
||||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
|
||||||
21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
|
|
||||||
21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
|
|
||||||
22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
|
|
||||||
22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
|
|
||||||
23, 23, 23, 23, 23, 23, 23, 23, 24, 24,
|
|
||||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
|
||||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
|
||||||
24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
|
||||||
25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
|
|
||||||
25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
|
|
||||||
25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
|
|
||||||
25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
|
|
||||||
26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
|
|
||||||
26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
|
|
||||||
26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
|
|
||||||
27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
|
|
||||||
27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
|
|
||||||
27, 27, 27, 27, 27, 28,
|
|
||||||
};
|
|
||||||
|
|
||||||
var offset_codes = [_]u32{
|
|
||||||
0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
|
|
||||||
8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
|
|
||||||
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
|
||||||
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
|
||||||
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
|
||||||
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
|
||||||
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
|
|
||||||
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
|
|
||||||
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
|
||||||
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
|
||||||
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
|
||||||
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
|
||||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
|
||||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
|
||||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
|
||||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub const Token = u32;
|
|
||||||
|
|
||||||
// Convert a literal into a literal token.
|
|
||||||
pub fn literalToken(lit: u32) Token {
|
|
||||||
return literal_type + lit;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Convert a < xlength, xoffset > pair into a match token.
|
|
||||||
pub fn matchToken(xlength: u32, xoffset: u32) Token {
|
|
||||||
return match_type + (xlength << length_shift) + xoffset;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns the literal of a literal token
|
|
||||||
pub fn literal(t: Token) u32 {
|
|
||||||
return @as(u32, @intCast(t - literal_type));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns the extra offset of a match token
|
|
||||||
pub fn offset(t: Token) u32 {
|
|
||||||
return @as(u32, @intCast(t)) & offset_mask;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn length(t: Token) u32 {
|
|
||||||
return @as(u32, @intCast((t - match_type) >> length_shift));
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn lengthCode(len: u32) u32 {
|
|
||||||
return length_codes[len];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns the offset code corresponding to a specific offset
|
|
||||||
pub fn offsetCode(off: u32) u32 {
|
|
||||||
if (off < @as(u32, @intCast(offset_codes.len))) {
|
|
||||||
return offset_codes[off];
|
|
||||||
}
|
|
||||||
if (off >> 7 < @as(u32, @intCast(offset_codes.len))) {
|
|
||||||
return offset_codes[off >> 7] + 14;
|
|
||||||
}
|
|
||||||
return offset_codes[off >> 14] + 28;
|
|
||||||
}
|
|
||||||
|
|
||||||
test {
|
|
||||||
const std = @import("std");
|
|
||||||
try std.testing.expectEqual(@as(Token, 3_401_581_099), matchToken(555, 555));
|
|
||||||
}
|
|
||||||
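The removed v1 token encoding above packs every LZ77 element into a single u32: 2 bits of type, 8 bits of xlength and 22 bits of xoffset. A minimal standalone sketch of that packing, reusing the constants and the expected value from the test above (the test name is illustrative only):

const std = @import("std");

// Same layout as the removed v1 token: 2-bit type | 8-bit xlength | 22-bit xoffset.
const length_shift = 22;
const offset_mask: u32 = (1 << length_shift) - 1;
const match_type: u32 = 1 << 30;

fn matchToken(xlength: u32, xoffset: u32) u32 {
    return match_type + (xlength << length_shift) + xoffset;
}

test "match token round-trips length and offset" {
    const t = matchToken(555, 555);
    try std.testing.expectEqual(@as(u32, 3_401_581_099), t);
    try std.testing.expectEqual(@as(u32, 555), t & offset_mask); // xoffset
    try std.testing.expectEqual(@as(u32, 555), (t - match_type) >> length_shift); // xlength
}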
@ -1,382 +0,0 @@
|
|||||||
//
|
|
||||||
// Compressor/Decompressor for GZIP data streams (RFC1952)
|
|
||||||
|
|
||||||
const std = @import("../std.zig");
|
|
||||||
const io = std.io;
|
|
||||||
const fs = std.fs;
|
|
||||||
const testing = std.testing;
|
|
||||||
const mem = std.mem;
|
|
||||||
const deflate = @import("deflate.zig");
|
|
||||||
|
|
||||||
const magic = &[2]u8{ 0x1f, 0x8b };
|
|
||||||
|
|
||||||
// Flags for the FLG field in the header
|
|
||||||
const FTEXT = 1 << 0;
|
|
||||||
const FHCRC = 1 << 1;
|
|
||||||
const FEXTRA = 1 << 2;
|
|
||||||
const FNAME = 1 << 3;
|
|
||||||
const FCOMMENT = 1 << 4;
|
|
||||||
|
|
||||||
const max_string_len = 1024;
|
|
||||||
|
|
||||||
pub const Header = struct {
|
|
||||||
extra: ?[]const u8 = null,
|
|
||||||
filename: ?[]const u8 = null,
|
|
||||||
comment: ?[]const u8 = null,
|
|
||||||
modification_time: u32 = 0,
|
|
||||||
operating_system: u8 = 255,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub fn Decompress(comptime ReaderType: type) type {
|
|
||||||
return struct {
|
|
||||||
const Self = @This();
|
|
||||||
|
|
||||||
pub const Error = ReaderType.Error ||
|
|
||||||
deflate.Decompressor(ReaderType).Error ||
|
|
||||||
error{ CorruptedData, WrongChecksum };
|
|
||||||
pub const Reader = io.Reader(*Self, Error, read);
|
|
||||||
|
|
||||||
allocator: mem.Allocator,
|
|
||||||
inflater: deflate.Decompressor(ReaderType),
|
|
||||||
in_reader: ReaderType,
|
|
||||||
hasher: std.hash.Crc32,
|
|
||||||
read_amt: u32,
|
|
||||||
|
|
||||||
info: Header,
|
|
||||||
|
|
||||||
fn init(allocator: mem.Allocator, in_reader: ReaderType) !Self {
|
|
||||||
var hasher = std.compress.hashedReader(in_reader, std.hash.Crc32.init());
|
|
||||||
const hashed_reader = hasher.reader();
|
|
||||||
|
|
||||||
// gzip header format is specified in RFC1952
|
|
||||||
const header = try hashed_reader.readBytesNoEof(10);
|
|
||||||
|
|
||||||
// Check the ID1/ID2 fields
|
|
||||||
if (!std.mem.eql(u8, header[0..2], magic))
|
|
||||||
return error.BadHeader;
|
|
||||||
|
|
||||||
const CM = header[2];
|
|
||||||
// The CM field must be 8 to indicate the use of DEFLATE
|
|
||||||
if (CM != 8) return error.InvalidCompression;
|
|
||||||
// Flags
|
|
||||||
const FLG = header[3];
|
|
||||||
// Modification time, as a Unix timestamp.
|
|
||||||
// If zero there's no timestamp available.
|
|
||||||
const MTIME = mem.readInt(u32, header[4..8], .little);
|
|
||||||
// Extra flags
|
|
||||||
const XFL = header[8];
|
|
||||||
// Operating system where the compression took place
|
|
||||||
const OS = header[9];
|
|
||||||
_ = XFL;
|
|
||||||
|
|
||||||
const extra = if (FLG & FEXTRA != 0) blk: {
|
|
||||||
const len = try hashed_reader.readInt(u16, .little);
|
|
||||||
const tmp_buf = try allocator.alloc(u8, len);
|
|
||||||
errdefer allocator.free(tmp_buf);
|
|
||||||
|
|
||||||
try hashed_reader.readNoEof(tmp_buf);
|
|
||||||
break :blk tmp_buf;
|
|
||||||
} else null;
|
|
||||||
errdefer if (extra) |p| allocator.free(p);
|
|
||||||
|
|
||||||
const filename = if (FLG & FNAME != 0)
|
|
||||||
try hashed_reader.readUntilDelimiterAlloc(allocator, 0, max_string_len)
|
|
||||||
else
|
|
||||||
null;
|
|
||||||
errdefer if (filename) |p| allocator.free(p);
|
|
||||||
|
|
||||||
const comment = if (FLG & FCOMMENT != 0)
|
|
||||||
try hashed_reader.readUntilDelimiterAlloc(allocator, 0, max_string_len)
|
|
||||||
else
|
|
||||||
null;
|
|
||||||
errdefer if (comment) |p| allocator.free(p);
|
|
||||||
|
|
||||||
if (FLG & FHCRC != 0) {
|
|
||||||
const hash = try in_reader.readInt(u16, .little);
|
|
||||||
if (hash != @as(u16, @truncate(hasher.hasher.final())))
|
|
||||||
return error.WrongChecksum;
|
|
||||||
}
|
|
||||||
|
|
||||||
return .{
|
|
||||||
.allocator = allocator,
|
|
||||||
.inflater = try deflate.decompressor(allocator, in_reader, null),
|
|
||||||
.in_reader = in_reader,
|
|
||||||
.hasher = std.hash.Crc32.init(),
|
|
||||||
.info = .{
|
|
||||||
.filename = filename,
|
|
||||||
.comment = comment,
|
|
||||||
.extra = extra,
|
|
||||||
.modification_time = MTIME,
|
|
||||||
.operating_system = OS,
|
|
||||||
},
|
|
||||||
.read_amt = 0,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn deinit(self: *Self) void {
|
|
||||||
self.inflater.deinit();
|
|
||||||
if (self.info.extra) |extra|
|
|
||||||
self.allocator.free(extra);
|
|
||||||
if (self.info.filename) |filename|
|
|
||||||
self.allocator.free(filename);
|
|
||||||
if (self.info.comment) |comment|
|
|
||||||
self.allocator.free(comment);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Implements the io.Reader interface
|
|
||||||
pub fn read(self: *Self, buffer: []u8) Error!usize {
|
|
||||||
if (buffer.len == 0)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
// Read from the compressed stream and update the computed checksum
|
|
||||||
const r = try self.inflater.read(buffer);
|
|
||||||
if (r != 0) {
|
|
||||||
self.hasher.update(buffer[0..r]);
|
|
||||||
self.read_amt +%= @truncate(r);
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
|
|
||||||
try self.inflater.close();
|
|
||||||
|
|
||||||
// We've reached the end of stream, check if the checksum matches
|
|
||||||
const hash = try self.in_reader.readInt(u32, .little);
|
|
||||||
if (hash != self.hasher.final())
|
|
||||||
return error.WrongChecksum;
|
|
||||||
|
|
||||||
// The ISIZE field is the size of the uncompressed input modulo 2^32
|
|
||||||
const input_size = try self.in_reader.readInt(u32, .little);
|
|
||||||
if (self.read_amt != input_size)
|
|
||||||
return error.CorruptedData;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn reader(self: *Self) Reader {
|
|
||||||
return .{ .context = self };
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn decompress(allocator: mem.Allocator, reader: anytype) !Decompress(@TypeOf(reader)) {
|
|
||||||
return Decompress(@TypeOf(reader)).init(allocator, reader);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub const CompressOptions = struct {
|
|
||||||
header: Header = .{},
|
|
||||||
hash_header: bool = true,
|
|
||||||
level: deflate.Compression = .default_compression,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub fn Compress(comptime WriterType: type) type {
|
|
||||||
return struct {
|
|
||||||
const Self = @This();
|
|
||||||
|
|
||||||
pub const Error = WriterType.Error ||
|
|
||||||
deflate.Compressor(WriterType).Error;
|
|
||||||
pub const Writer = io.Writer(*Self, Error, write);
|
|
||||||
|
|
||||||
allocator: mem.Allocator,
|
|
||||||
deflater: deflate.Compressor(WriterType),
|
|
||||||
out_writer: WriterType,
|
|
||||||
hasher: std.hash.Crc32,
|
|
||||||
write_amt: u32,
|
|
||||||
|
|
||||||
fn init(allocator: mem.Allocator, out_writer: WriterType, options: CompressOptions) !Self {
|
|
||||||
var hasher = std.compress.hashedWriter(out_writer, std.hash.Crc32.init());
|
|
||||||
const hashed_writer = hasher.writer();
|
|
||||||
|
|
||||||
// ID1/ID2
|
|
||||||
try hashed_writer.writeAll(magic);
|
|
||||||
// CM
|
|
||||||
try hashed_writer.writeByte(8);
|
|
||||||
// Flags
|
|
||||||
try hashed_writer.writeByte(
|
|
||||||
@as(u8, if (options.hash_header) FHCRC else 0) |
|
|
||||||
@as(u8, if (options.header.extra) |_| FEXTRA else 0) |
|
|
||||||
@as(u8, if (options.header.filename) |_| FNAME else 0) |
|
|
||||||
@as(u8, if (options.header.comment) |_| FCOMMENT else 0),
|
|
||||||
);
|
|
||||||
// Modification time
|
|
||||||
try hashed_writer.writeInt(u32, options.header.modification_time, .little);
|
|
||||||
// Extra flags
|
|
||||||
try hashed_writer.writeByte(0);
|
|
||||||
// Operating system
|
|
||||||
try hashed_writer.writeByte(options.header.operating_system);
|
|
||||||
|
|
||||||
if (options.header.extra) |extra| {
|
|
||||||
try hashed_writer.writeInt(u16, @intCast(extra.len), .little);
|
|
||||||
try hashed_writer.writeAll(extra);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (options.header.filename) |filename| {
|
|
||||||
try hashed_writer.writeAll(filename);
|
|
||||||
try hashed_writer.writeByte(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (options.header.comment) |comment| {
|
|
||||||
try hashed_writer.writeAll(comment);
|
|
||||||
try hashed_writer.writeByte(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (options.hash_header) {
|
|
||||||
try out_writer.writeInt(
|
|
||||||
u16,
|
|
||||||
@truncate(hasher.hasher.final()),
|
|
||||||
.little,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
return .{
|
|
||||||
.allocator = allocator,
|
|
||||||
.deflater = try deflate.compressor(allocator, out_writer, .{ .level = options.level }),
|
|
||||||
.out_writer = out_writer,
|
|
||||||
.hasher = std.hash.Crc32.init(),
|
|
||||||
.write_amt = 0,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn deinit(self: *Self) void {
|
|
||||||
self.deflater.deinit();
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Implements the io.Writer interface
|
|
||||||
pub fn write(self: *Self, buffer: []const u8) Error!usize {
|
|
||||||
if (buffer.len == 0)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
// Write to the compressed stream and update the computed checksum
|
|
||||||
const r = try self.deflater.write(buffer);
|
|
||||||
self.hasher.update(buffer[0..r]);
|
|
||||||
self.write_amt +%= @truncate(r);
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn writer(self: *Self) Writer {
|
|
||||||
return .{ .context = self };
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn flush(self: *Self) Error!void {
|
|
||||||
try self.deflater.flush();
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn close(self: *Self) Error!void {
|
|
            try self.deflater.close();
            try self.out_writer.writeInt(u32, self.hasher.final(), .little);
            try self.out_writer.writeInt(u32, self.write_amt, .little);
        }
    };
}

pub fn compress(allocator: mem.Allocator, writer: anytype, options: CompressOptions) !Compress(@TypeOf(writer)) {
    return Compress(@TypeOf(writer)).init(allocator, writer, options);
}

fn testReader(expected: []const u8, data: []const u8) !void {
    var in_stream = io.fixedBufferStream(data);

    var gzip_stream = try decompress(testing.allocator, in_stream.reader());
    defer gzip_stream.deinit();

    // Read and decompress the whole file
    const buf = try gzip_stream.reader().readAllAlloc(testing.allocator, std.math.maxInt(usize));
    defer testing.allocator.free(buf);

    // Check against the reference
    try testing.expectEqualSlices(u8, expected, buf);
}

fn testWriter(expected: []const u8, data: []const u8, options: CompressOptions) !void {
    var actual = std.ArrayList(u8).init(testing.allocator);
    defer actual.deinit();

    var gzip_stream = try compress(testing.allocator, actual.writer(), options);
    defer gzip_stream.deinit();

    // Write and compress the whole file
    try gzip_stream.writer().writeAll(data);
    try gzip_stream.close();

    // Check against the reference
    try testing.expectEqualSlices(u8, expected, actual.items);
}

// All the test cases are obtained by compressing the RFC1952 text
//
// https://tools.ietf.org/rfc/rfc1952.txt length=25037 bytes
// SHA256=164ef0897b4cbec63abf1b57f069f3599bd0fb7c72c2a4dee21bd7e03ec9af67
test "compressed data" {
    const plain = @embedFile("testdata/rfc1952.txt");
    const compressed = @embedFile("testdata/rfc1952.txt.gz");
    try testReader(plain, compressed);
    try testWriter(compressed, plain, .{
        .header = .{
            .filename = "rfc1952.txt",
            .modification_time = 1706533053,
            .operating_system = 3,
        },
    });
}

test "sanity checks" {
    // Truncated header
    try testing.expectError(
        error.EndOfStream,
        testReader(undefined, &[_]u8{ 0x1f, 0x8B }),
    );
    // Wrong CM
    try testing.expectError(
        error.InvalidCompression,
        testReader(undefined, &[_]u8{
            0x1f, 0x8b, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x03,
        }),
    );
    // Wrong checksum
    try testing.expectError(
        error.WrongChecksum,
        testReader(undefined, &[_]u8{
            0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01,
            0x00, 0x00, 0x00, 0x00,
        }),
    );
    // Truncated checksum
    try testing.expectError(
        error.EndOfStream,
        testReader(undefined, &[_]u8{
            0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00,
        }),
    );
    // Wrong initial size
    try testing.expectError(
        error.CorruptedData,
        testReader(undefined, &[_]u8{
            0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x01,
        }),
    );
    // Truncated initial size field
    try testing.expectError(
        error.EndOfStream,
        testReader(undefined, &[_]u8{
            0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, 0x00,
        }),
    );
}

test "header checksum" {
    try testReader("", &[_]u8{
        // GZIP header
        0x1f, 0x8b, 0x08, 0x12, 0x00, 0x09, 0x6e, 0x88, 0x00, 0xff, 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x00,

        // header.FHCRC (should cover entire header)
        0x99, 0xd6,

        // GZIP data
        0x01, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    });
}
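For comparison, a similar round trip against the flate-based replacement might look like the sketch below. The std.compress.gzip.compressor()/decompressor() calls, the finish() step, and the empty options literal are assumptions about the new API introduced by the flate rewrite, not verified usage; check the current std sources before relying on them.

const std = @import("std");

test "gzip round trip with the flate-based API (sketch)" {
    const plain = "hello, flate";

    // Compress into an in-memory buffer.
    var compressed = std.ArrayList(u8).init(std.testing.allocator);
    defer compressed.deinit();

    var cmp = try std.compress.gzip.compressor(compressed.writer(), .{});
    try cmp.writer().writeAll(plain);
    try cmp.finish();

    // Decompress from that buffer and compare with the original.
    var in = std.io.fixedBufferStream(compressed.items);
    var dcp = std.compress.gzip.decompressor(in.reader());
    const out = try dcp.reader().readAllAlloc(std.testing.allocator, 1 << 20);
    defer std.testing.allocator.free(out);

    try std.testing.expectEqualSlices(u8, plain, out);
}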
955	lib/std/compress/testdata/rfc1951.txt (vendored)
@ -1,955 +0,0 @@
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Network Working Group P. Deutsch
|
|
||||||
Request for Comments: 1951 Aladdin Enterprises
|
|
||||||
Category: Informational May 1996
|
|
||||||
|
|
||||||
|
|
||||||
DEFLATE Compressed Data Format Specification version 1.3
|
|
||||||
|
|
||||||
Status of This Memo
|
|
||||||
|
|
||||||
This memo provides information for the Internet community. This memo
|
|
||||||
does not specify an Internet standard of any kind. Distribution of
|
|
||||||
this memo is unlimited.
|
|
||||||
|
|
||||||
IESG Note:
|
|
||||||
|
|
||||||
The IESG takes no position on the validity of any Intellectual
|
|
||||||
Property Rights statements contained in this document.
|
|
||||||
|
|
||||||
Notices
|
|
||||||
|
|
||||||
Copyright (c) 1996 L. Peter Deutsch
|
|
||||||
|
|
||||||
Permission is granted to copy and distribute this document for any
|
|
||||||
purpose and without charge, including translations into other
|
|
||||||
languages and incorporation into compilations, provided that the
|
|
||||||
copyright notice and this notice are preserved, and that any
|
|
||||||
substantive changes or deletions from the original are clearly
|
|
||||||
marked.
|
|
||||||
|
|
||||||
A pointer to the latest version of this and related documentation in
|
|
||||||
HTML format can be found at the URL
|
|
||||||
<ftp://ftp.uu.net/graphics/png/documents/zlib/zdoc-index.html>.
|
|
||||||
|
|
||||||
Abstract
|
|
||||||
|
|
||||||
This specification defines a lossless compressed data format that
|
|
||||||
compresses data using a combination of the LZ77 algorithm and Huffman
|
|
||||||
coding, with efficiency comparable to the best currently available
|
|
||||||
general-purpose compression methods. The data can be produced or
|
|
||||||
consumed, even for an arbitrarily long sequentially presented input
|
|
||||||
data stream, using only an a priori bounded amount of intermediate
|
|
||||||
storage. The format can be implemented readily in a manner not
|
|
||||||
covered by patents.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 1]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
Table of Contents
|
|
||||||
|
|
||||||
1. Introduction ................................................... 2
|
|
||||||
1.1. Purpose ................................................... 2
|
|
||||||
1.2. Intended audience ......................................... 3
|
|
||||||
1.3. Scope ..................................................... 3
|
|
||||||
1.4. Compliance ................................................ 3
|
|
||||||
1.5. Definitions of terms and conventions used ................ 3
|
|
||||||
1.6. Changes from previous versions ............................ 4
|
|
||||||
2. Compressed representation overview ............................. 4
|
|
||||||
3. Detailed specification ......................................... 5
|
|
||||||
3.1. Overall conventions ....................................... 5
|
|
||||||
3.1.1. Packing into bytes .................................. 5
|
|
||||||
3.2. Compressed block format ................................... 6
|
|
||||||
3.2.1. Synopsis of prefix and Huffman coding ............... 6
|
|
||||||
3.2.2. Use of Huffman coding in the "deflate" format ....... 7
|
|
||||||
3.2.3. Details of block format ............................. 9
|
|
||||||
3.2.4. Non-compressed blocks (BTYPE=00) ................... 11
|
|
||||||
3.2.5. Compressed blocks (length and distance codes) ...... 11
|
|
||||||
3.2.6. Compression with fixed Huffman codes (BTYPE=01) .... 12
|
|
||||||
3.2.7. Compression with dynamic Huffman codes (BTYPE=10) .. 13
|
|
||||||
3.3. Compliance ............................................... 14
|
|
||||||
4. Compression algorithm details ................................. 14
|
|
||||||
5. References .................................................... 16
|
|
||||||
6. Security Considerations ....................................... 16
|
|
||||||
7. Source code ................................................... 16
|
|
||||||
8. Acknowledgements .............................................. 16
|
|
||||||
9. Author's Address .............................................. 17
|
|
||||||
|
|
||||||
1. Introduction
|
|
||||||
|
|
||||||
1.1. Purpose
|
|
||||||
|
|
||||||
The purpose of this specification is to define a lossless
|
|
||||||
compressed data format that:
|
|
||||||
* Is independent of CPU type, operating system, file system,
|
|
||||||
and character set, and hence can be used for interchange;
|
|
||||||
* Can be produced or consumed, even for an arbitrarily long
|
|
||||||
sequentially presented input data stream, using only an a
|
|
||||||
priori bounded amount of intermediate storage, and hence
|
|
||||||
can be used in data communications or similar structures
|
|
||||||
such as Unix filters;
|
|
||||||
* Compresses data with efficiency comparable to the best
|
|
||||||
currently available general-purpose compression methods,
|
|
||||||
and in particular considerably better than the "compress"
|
|
||||||
program;
|
|
||||||
* Can be implemented readily in a manner not covered by
|
|
||||||
patents, and hence can be practiced freely;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 2]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
* Is compatible with the file format produced by the current
|
|
||||||
widely used gzip utility, in that conforming decompressors
|
|
||||||
will be able to read data produced by the existing gzip
|
|
||||||
compressor.
|
|
||||||
|
|
||||||
The data format defined by this specification does not attempt to:
|
|
||||||
|
|
||||||
* Allow random access to compressed data;
|
|
||||||
* Compress specialized data (e.g., raster graphics) as well
|
|
||||||
as the best currently available specialized algorithms.
|
|
||||||
|
|
||||||
A simple counting argument shows that no lossless compression
|
|
||||||
algorithm can compress every possible input data set. For the
|
|
||||||
format defined here, the worst case expansion is 5 bytes per 32K-
|
|
||||||
byte block, i.e., a size increase of 0.015% for large data sets.
|
|
||||||
English text usually compresses by a factor of 2.5 to 3;
|
|
||||||
executable files usually compress somewhat less; graphical data
|
|
||||||
such as raster images may compress much more.
|
|
||||||
|
|
||||||
1.2. Intended audience
|
|
||||||
|
|
||||||
This specification is intended for use by implementors of software
|
|
||||||
to compress data into "deflate" format and/or decompress data from
|
|
||||||
"deflate" format.
|
|
||||||
|
|
||||||
The text of the specification assumes a basic background in
|
|
||||||
programming at the level of bits and other primitive data
|
|
||||||
representations. Familiarity with the technique of Huffman coding
|
|
||||||
is helpful but not required.
|
|
||||||
|
|
||||||
1.3. Scope
|
|
||||||
|
|
||||||
The specification specifies a method for representing a sequence
|
|
||||||
of bytes as a (usually shorter) sequence of bits, and a method for
|
|
||||||
packing the latter bit sequence into bytes.
|
|
||||||
|
|
||||||
1.4. Compliance
|
|
||||||
|
|
||||||
Unless otherwise indicated below, a compliant decompressor must be
|
|
||||||
able to accept and decompress any data set that conforms to all
|
|
||||||
the specifications presented here; a compliant compressor must
|
|
||||||
produce data sets that conform to all the specifications presented
|
|
||||||
here.
|
|
||||||
|
|
||||||
1.5. Definitions of terms and conventions used
|
|
||||||
|
|
||||||
Byte: 8 bits stored or transmitted as a unit (same as an octet).
|
|
||||||
For this specification, a byte is exactly 8 bits, even on machines
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 3]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
which store a character on a number of bits different from eight.
|
|
||||||
See below, for the numbering of bits within a byte.
|
|
||||||
|
|
||||||
String: a sequence of arbitrary bytes.
|
|
||||||
|
|
||||||
1.6. Changes from previous versions
|
|
||||||
|
|
||||||
There have been no technical changes to the deflate format since
|
|
||||||
version 1.1 of this specification. In version 1.2, some
|
|
||||||
terminology was changed. Version 1.3 is a conversion of the
|
|
||||||
specification to RFC style.
|
|
||||||
|
|
||||||
2. Compressed representation overview
|
|
||||||
|
|
||||||
A compressed data set consists of a series of blocks, corresponding
|
|
||||||
to successive blocks of input data. The block sizes are arbitrary,
|
|
||||||
except that non-compressible blocks are limited to 65,535 bytes.
|
|
||||||
|
|
||||||
Each block is compressed using a combination of the LZ77 algorithm
|
|
||||||
and Huffman coding. The Huffman trees for each block are independent
|
|
||||||
of those for previous or subsequent blocks; the LZ77 algorithm may
|
|
||||||
use a reference to a duplicated string occurring in a previous block,
|
|
||||||
up to 32K input bytes before.
|
|
||||||
|
|
||||||
Each block consists of two parts: a pair of Huffman code trees that
|
|
||||||
describe the representation of the compressed data part, and a
|
|
||||||
compressed data part. (The Huffman trees themselves are compressed
|
|
||||||
using Huffman encoding.) The compressed data consists of a series of
|
|
||||||
elements of two types: literal bytes (of strings that have not been
|
|
||||||
detected as duplicated within the previous 32K input bytes), and
|
|
||||||
pointers to duplicated strings, where a pointer is represented as a
|
|
||||||
pair <length, backward distance>. The representation used in the
|
|
||||||
"deflate" format limits distances to 32K bytes and lengths to 258
|
|
||||||
bytes, but does not limit the size of a block, except for
|
|
||||||
uncompressible blocks, which are limited as noted above.
|
|
||||||
|
|
||||||
Each type of value (literals, distances, and lengths) in the
|
|
||||||
compressed data is represented using a Huffman code, using one code
|
|
||||||
tree for literals and lengths and a separate code tree for distances.
|
|
||||||
The code trees for each block appear in a compact form just before
|
|
||||||
the compressed data for that block.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 4]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
3. Detailed specification
|
|
||||||
|
|
||||||
3.1. Overall conventions In the diagrams below, a box like this:
|
|
||||||
|
|
||||||
+---+
|
|
||||||
| | <-- the vertical bars might be missing
|
|
||||||
+---+
|
|
||||||
|
|
||||||
represents one byte; a box like this:
|
|
||||||
|
|
||||||
+==============+
|
|
||||||
| |
|
|
||||||
+==============+
|
|
||||||
|
|
||||||
represents a variable number of bytes.
|
|
||||||
|
|
||||||
Bytes stored within a computer do not have a "bit order", since
|
|
||||||
they are always treated as a unit. However, a byte considered as
|
|
||||||
an integer between 0 and 255 does have a most- and least-
|
|
||||||
significant bit, and since we write numbers with the most-
|
|
||||||
significant digit on the left, we also write bytes with the most-
|
|
||||||
significant bit on the left. In the diagrams below, we number the
|
|
||||||
bits of a byte so that bit 0 is the least-significant bit, i.e.,
|
|
||||||
the bits are numbered:
|
|
||||||
|
|
||||||
+--------+
|
|
||||||
|76543210|
|
|
||||||
+--------+
|
|
||||||
|
|
||||||
Within a computer, a number may occupy multiple bytes. All
|
|
||||||
multi-byte numbers in the format described here are stored with
|
|
||||||
the least-significant byte first (at the lower memory address).
|
|
||||||
For example, the decimal number 520 is stored as:
|
|
||||||
|
|
||||||
0 1
|
|
||||||
+--------+--------+
|
|
||||||
|00001000|00000010|
|
|
||||||
+--------+--------+
|
|
||||||
^ ^
|
|
||||||
| |
|
|
||||||
| + more significant byte = 2 x 256
|
|
||||||
+ less significant byte = 8
|
|
||||||
|
|
||||||
3.1.1. Packing into bytes
|
|
||||||
|
|
||||||
This document does not address the issue of the order in which
|
|
||||||
bits of a byte are transmitted on a bit-sequential medium,
|
|
||||||
since the final data format described here is byte- rather than
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 5]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
bit-oriented. However, we describe the compressed block format
|
|
||||||
in below, as a sequence of data elements of various bit
|
|
||||||
lengths, not a sequence of bytes. We must therefore specify
|
|
||||||
how to pack these data elements into bytes to form the final
|
|
||||||
compressed byte sequence:
|
|
||||||
|
|
||||||
* Data elements are packed into bytes in order of
|
|
||||||
increasing bit number within the byte, i.e., starting
|
|
||||||
with the least-significant bit of the byte.
|
|
||||||
* Data elements other than Huffman codes are packed
|
|
||||||
starting with the least-significant bit of the data
|
|
||||||
element.
|
|
||||||
* Huffman codes are packed starting with the most-
|
|
||||||
significant bit of the code.
|
|
||||||
|
|
||||||
In other words, if one were to print out the compressed data as
|
|
||||||
a sequence of bytes, starting with the first byte at the
|
|
||||||
*right* margin and proceeding to the *left*, with the most-
|
|
||||||
significant bit of each byte on the left as usual, one would be
|
|
||||||
able to parse the result from right to left, with fixed-width
|
|
||||||
elements in the correct MSB-to-LSB order and Huffman codes in
|
|
||||||
bit-reversed order (i.e., with the first bit of the code in the
|
|
||||||
relative LSB position).
|
|
||||||
|
|
||||||
3.2. Compressed block format
|
|
||||||
|
|
||||||
3.2.1. Synopsis of prefix and Huffman coding
|
|
||||||
|
|
||||||
Prefix coding represents symbols from an a priori known
|
|
||||||
alphabet by bit sequences (codes), one code for each symbol, in
|
|
||||||
a manner such that different symbols may be represented by bit
|
|
||||||
sequences of different lengths, but a parser can always parse
|
|
||||||
an encoded string unambiguously symbol-by-symbol.
|
|
||||||
|
|
||||||
We define a prefix code in terms of a binary tree in which the
|
|
||||||
two edges descending from each non-leaf node are labeled 0 and
|
|
||||||
1 and in which the leaf nodes correspond one-for-one with (are
|
|
||||||
labeled with) the symbols of the alphabet; then the code for a
|
|
||||||
symbol is the sequence of 0's and 1's on the edges leading from
|
|
||||||
the root to the leaf labeled with that symbol. For example:
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 6]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
/\ Symbol Code
|
|
||||||
0 1 ------ ----
|
|
||||||
/ \ A 00
|
|
||||||
/\ B B 1
|
|
||||||
0 1 C 011
|
|
||||||
/ \ D 010
|
|
||||||
A /\
|
|
||||||
0 1
|
|
||||||
/ \
|
|
||||||
D C
|
|
||||||
|
|
||||||
A parser can decode the next symbol from an encoded input
|
|
||||||
stream by walking down the tree from the root, at each step
|
|
||||||
choosing the edge corresponding to the next input bit.
|
|
||||||
|
|
||||||
Given an alphabet with known symbol frequencies, the Huffman
|
|
||||||
algorithm allows the construction of an optimal prefix code
|
|
||||||
(one which represents strings with those symbol frequencies
|
|
||||||
using the fewest bits of any possible prefix codes for that
|
|
||||||
alphabet). Such a code is called a Huffman code. (See
|
|
||||||
reference [1] in Chapter 5, references for additional
|
|
||||||
information on Huffman codes.)
|
|
||||||
|
|
||||||
Note that in the "deflate" format, the Huffman codes for the
|
|
||||||
various alphabets must not exceed certain maximum code lengths.
|
|
||||||
This constraint complicates the algorithm for computing code
|
|
||||||
lengths from symbol frequencies. Again, see Chapter 5,
|
|
||||||
references for details.
|
|
||||||
|
|
||||||
3.2.2. Use of Huffman coding in the "deflate" format
|
|
||||||
|
|
||||||
The Huffman codes used for each alphabet in the "deflate"
|
|
||||||
format have two additional rules:
|
|
||||||
|
|
||||||
* All codes of a given bit length have lexicographically
|
|
||||||
consecutive values, in the same order as the symbols
|
|
||||||
they represent;
|
|
||||||
|
|
||||||
* Shorter codes lexicographically precede longer codes.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 7]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
We could recode the example above to follow this rule as
|
|
||||||
follows, assuming that the order of the alphabet is ABCD:
|
|
||||||
|
|
||||||
Symbol Code
|
|
||||||
------ ----
|
|
||||||
A 10
|
|
||||||
B 0
|
|
||||||
C 110
|
|
||||||
D 111
|
|
||||||
|
|
||||||
I.e., 0 precedes 10 which precedes 11x, and 110 and 111 are
|
|
||||||
lexicographically consecutive.
|
|
||||||
|
|
||||||
Given this rule, we can define the Huffman code for an alphabet
|
|
||||||
just by giving the bit lengths of the codes for each symbol of
|
|
||||||
the alphabet in order; this is sufficient to determine the
|
|
||||||
actual codes. In our example, the code is completely defined
|
|
||||||
by the sequence of bit lengths (2, 1, 3, 3). The following
|
|
||||||
algorithm generates the codes as integers, intended to be read
|
|
||||||
from most- to least-significant bit. The code lengths are
|
|
||||||
initially in tree[I].Len; the codes are produced in
|
|
||||||
tree[I].Code.
|
|
||||||
|
|
||||||
1) Count the number of codes for each code length. Let
|
|
||||||
bl_count[N] be the number of codes of length N, N >= 1.
|
|
||||||
|
|
||||||
2) Find the numerical value of the smallest code for each
|
|
||||||
code length:
|
|
||||||
|
|
||||||
code = 0;
|
|
||||||
bl_count[0] = 0;
|
|
||||||
for (bits = 1; bits <= MAX_BITS; bits++) {
|
|
||||||
code = (code + bl_count[bits-1]) << 1;
|
|
||||||
next_code[bits] = code;
|
|
||||||
}
|
|
||||||
|
|
||||||
3) Assign numerical values to all codes, using consecutive
|
|
||||||
values for all codes of the same length with the base
|
|
||||||
values determined at step 2. Codes that are never used
|
|
||||||
(which have a bit length of zero) must not be assigned a
|
|
||||||
value.
|
|
||||||
|
|
||||||
for (n = 0; n <= max_code; n++) {
|
|
||||||
len = tree[n].Len;
|
|
||||||
if (len != 0) {
|
|
||||||
tree[n].Code = next_code[len];
|
|
||||||
next_code[len]++;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 8]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
Consider the alphabet ABCDEFGH, with bit lengths (3, 3, 3, 3,
|
|
||||||
3, 2, 4, 4). After step 1, we have:
|
|
||||||
|
|
||||||
N bl_count[N]
|
|
||||||
- -----------
|
|
||||||
2 1
|
|
||||||
3 5
|
|
||||||
4 2
|
|
||||||
|
|
||||||
Step 2 computes the following next_code values:
|
|
||||||
|
|
||||||
N next_code[N]
|
|
||||||
- ------------
|
|
||||||
1 0
|
|
||||||
2 0
|
|
||||||
3 2
|
|
||||||
4 14
|
|
||||||
|
|
||||||
Step 3 produces the following code values:
|
|
||||||
|
|
||||||
Symbol Length Code
|
|
||||||
------ ------ ----
|
|
||||||
A 3 010
|
|
||||||
B 3 011
|
|
||||||
C 3 100
|
|
||||||
D 3 101
|
|
||||||
E 3 110
|
|
||||||
F 2 00
|
|
||||||
G 4 1110
|
|
||||||
H 4 1111
|
|
||||||
|
|
||||||
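The three steps above translate almost mechanically into code. The following sketch, in Zig since this commit targets the Zig standard library (the function and variable names are illustrative, not taken from std), reproduces the RFC's example alphabet:

const std = @import("std");

/// Build canonical codes from code lengths, per the three steps above.
/// `lens[n]` is the code length of symbol n; the code for symbol n is
/// written to `codes[n]`, to be read from most- to least-significant bit.
fn buildCodes(lens: []const u4, codes: []u16) void {
    const max_bits = 15;

    // Step 1: count the number of codes for each code length.
    var bl_count = [_]u16{0} ** (max_bits + 1);
    for (lens) |len| bl_count[len] += 1;
    bl_count[0] = 0;

    // Step 2: find the numerical value of the smallest code per length.
    var next_code = [_]u16{0} ** (max_bits + 1);
    var code: u16 = 0;
    var bits: usize = 1;
    while (bits <= max_bits) : (bits += 1) {
        code = (code + bl_count[bits - 1]) << 1;
        next_code[bits] = code;
    }

    // Step 3: assign consecutive values to all used codes of each length.
    for (lens, 0..) |len, n| {
        if (len != 0) {
            codes[n] = next_code[len];
            next_code[len] += 1;
        }
    }
}

test "RFC 1951 example alphabet ABCDEFGH" {
    const lens = [_]u4{ 3, 3, 3, 3, 3, 2, 4, 4 };
    var codes: [8]u16 = undefined;
    buildCodes(&lens, &codes);
    try std.testing.expectEqual(@as(u16, 0b010), codes[0]); // A
    try std.testing.expectEqual(@as(u16, 0b00), codes[5]); // F
    try std.testing.expectEqual(@as(u16, 0b1111), codes[7]); // H
}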
3.2.3. Details of block format
|
|
||||||
|
|
||||||
Each block of compressed data begins with 3 header bits
|
|
||||||
containing the following data:
|
|
||||||
|
|
||||||
first bit BFINAL
|
|
||||||
next 2 bits BTYPE
|
|
||||||
|
|
||||||
Note that the header bits do not necessarily begin on a byte
|
|
||||||
boundary, since a block does not necessarily occupy an integral
|
|
||||||
number of bytes.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 9]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
BFINAL is set if and only if this is the last block of the data
|
|
||||||
set.
|
|
||||||
|
|
||||||
BTYPE specifies how the data are compressed, as follows:
|
|
||||||
|
|
||||||
00 - no compression
|
|
||||||
01 - compressed with fixed Huffman codes
|
|
||||||
10 - compressed with dynamic Huffman codes
|
|
||||||
11 - reserved (error)
|
|
||||||
|
|
||||||
The only difference between the two compressed cases is how the
|
|
||||||
Huffman codes for the literal/length and distance alphabets are
|
|
||||||
defined.
|
|
||||||
|
|
||||||
In all cases, the decoding algorithm for the actual data is as
|
|
||||||
follows:
|
|
||||||
|
|
||||||
do
|
|
||||||
read block header from input stream.
|
|
||||||
if stored with no compression
|
|
||||||
skip any remaining bits in current partially
|
|
||||||
processed byte
|
|
||||||
read LEN and NLEN (see next section)
|
|
||||||
copy LEN bytes of data to output
|
|
||||||
otherwise
|
|
||||||
if compressed with dynamic Huffman codes
|
|
||||||
read representation of code trees (see
|
|
||||||
subsection below)
|
|
||||||
loop (until end of block code recognized)
|
|
||||||
decode literal/length value from input stream
|
|
||||||
if value < 256
|
|
||||||
copy value (literal byte) to output stream
|
|
||||||
otherwise
|
|
||||||
if value = end of block (256)
|
|
||||||
break from loop
|
|
||||||
otherwise (value = 257..285)
|
|
||||||
decode distance from input stream
|
|
||||||
|
|
||||||
move backwards distance bytes in the output
|
|
||||||
stream, and copy length bytes from this
|
|
||||||
position to the output stream.
|
|
||||||
end loop
|
|
||||||
while not last block
|
|
||||||
|
|
||||||
Note that a duplicated string reference may refer to a string
|
|
||||||
in a previous block; i.e., the backward distance may cross one
|
|
||||||
or more block boundaries. However a distance cannot refer past
|
|
||||||
the beginning of the output stream. (An application using a
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 10]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
preset dictionary might discard part of the output stream; a
|
|
||||||
distance can refer to that part of the output stream anyway)
|
|
||||||
Note also that the referenced string may overlap the current
|
|
||||||
position; for example, if the last 2 bytes decoded have values
|
|
||||||
X and Y, a string reference with <length = 5, distance = 2>
|
|
||||||
adds X,Y,X,Y,X to the output stream.
|
|
||||||
|
|
||||||
We now specify each compression method in turn.
|
|
||||||
|
|
||||||
3.2.4. Non-compressed blocks (BTYPE=00)
|
|
||||||
|
|
||||||
Any bits of input up to the next byte boundary are ignored.
|
|
||||||
The rest of the block consists of the following information:
|
|
||||||
|
|
||||||
0 1 2 3 4...
|
|
||||||
+---+---+---+---+================================+
|
|
||||||
| LEN | NLEN |... LEN bytes of literal data...|
|
|
||||||
+---+---+---+---+================================+
|
|
||||||
|
|
||||||
LEN is the number of data bytes in the block. NLEN is the
|
|
||||||
one's complement of LEN.
|
|
||||||
|
|
||||||
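A decompressor can reject a corrupt stored block cheaply by checking the LEN/NLEN relationship described above; a minimal, self-contained check (illustrative only) is:

const std = @import("std");

/// A stored (BTYPE=00) block is well formed only if NLEN is the
/// one's complement of LEN.
fn validStoredHeader(len: u16, nlen: u16) bool {
    return len == ~nlen;
}

test "stored block length check" {
    try std.testing.expect(validStoredHeader(0x0005, 0xfffa));
    try std.testing.expect(!validStoredHeader(0x0005, 0x0005));
}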
3.2.5. Compressed blocks (length and distance codes)
|
|
||||||
|
|
||||||
As noted above, encoded data blocks in the "deflate" format
|
|
||||||
consist of sequences of symbols drawn from three conceptually
|
|
||||||
distinct alphabets: either literal bytes, from the alphabet of
|
|
||||||
byte values (0..255), or <length, backward distance> pairs,
|
|
||||||
where the length is drawn from (3..258) and the distance is
|
|
||||||
drawn from (1..32,768). In fact, the literal and length
|
|
||||||
alphabets are merged into a single alphabet (0..285), where
|
|
||||||
values 0..255 represent literal bytes, the value 256 indicates
|
|
||||||
end-of-block, and values 257..285 represent length codes
|
|
||||||
(possibly in conjunction with extra bits following the symbol
|
|
||||||
code) as follows:
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 11]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
Extra Extra Extra
|
|
||||||
Code Bits Length(s) Code Bits Lengths Code Bits Length(s)
|
|
||||||
---- ---- ------ ---- ---- ------- ---- ---- -------
|
|
||||||
257 0 3 267 1 15,16 277 4 67-82
|
|
||||||
258 0 4 268 1 17,18 278 4 83-98
|
|
||||||
259 0 5 269 2 19-22 279 4 99-114
|
|
||||||
260 0 6 270 2 23-26 280 4 115-130
|
|
||||||
261 0 7 271 2 27-30 281 5 131-162
|
|
||||||
262 0 8 272 2 31-34 282 5 163-194
|
|
||||||
263 0 9 273 3 35-42 283 5 195-226
|
|
||||||
264 0 10 274 3 43-50 284 5 227-257
|
|
||||||
265 1 11,12 275 3 51-58 285 0 258
|
|
||||||
266 1 13,14 276 3 59-66
|
|
||||||
|
|
||||||
The extra bits should be interpreted as a machine integer
|
|
||||||
stored with the most-significant bit first, e.g., bits 1110
|
|
||||||
represent the value 14.
|
|
||||||
|
|
||||||
Extra Extra Extra
|
|
||||||
Code Bits Dist Code Bits Dist Code Bits Distance
|
|
||||||
---- ---- ---- ---- ---- ------ ---- ---- --------
|
|
||||||
0 0 1 10 4 33-48 20 9 1025-1536
|
|
||||||
1 0 2 11 4 49-64 21 9 1537-2048
|
|
||||||
2 0 3 12 5 65-96 22 10 2049-3072
|
|
||||||
3 0 4 13 5 97-128 23 10 3073-4096
|
|
||||||
4 1 5,6 14 6 129-192 24 11 4097-6144
|
|
||||||
5 1 7,8 15 6 193-256 25 11 6145-8192
|
|
||||||
6 2 9-12 16 7 257-384 26 12 8193-12288
|
|
||||||
7 2 13-16 17 7 385-512 27 12 12289-16384
|
|
||||||
8 3 17-24 18 8 513-768 28 13 16385-24576
|
|
||||||
9 3 25-32 19 8 769-1024 29 13 24577-32768
|
|
||||||
|
|
||||||
3.2.6. Compression with fixed Huffman codes (BTYPE=01)
|
|
||||||
|
|
||||||
The Huffman codes for the two alphabets are fixed, and are not
|
|
||||||
represented explicitly in the data. The Huffman code lengths
|
|
||||||
for the literal/length alphabet are:
|
|
||||||
|
|
||||||
Lit Value Bits Codes
|
|
||||||
--------- ---- -----
|
|
||||||
0 - 143 8 00110000 through
|
|
||||||
10111111
|
|
||||||
144 - 255 9 110010000 through
|
|
||||||
111111111
|
|
||||||
256 - 279 7 0000000 through
|
|
||||||
0010111
|
|
||||||
280 - 287 8 11000000 through
|
|
||||||
11000111
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 12]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
The code lengths are sufficient to generate the actual codes,
|
|
||||||
as described above; we show the codes in the table for added
|
|
||||||
clarity. Literal/length values 286-287 will never actually
|
|
||||||
occur in the compressed data, but participate in the code
|
|
||||||
construction.
|
|
||||||
|
|
||||||
Distance codes 0-31 are represented by (fixed-length) 5-bit
|
|
||||||
codes, with possible additional bits as shown in the table
|
|
||||||
shown in Paragraph 3.2.5, above. Note that distance codes 30-
|
|
||||||
31 will never actually occur in the compressed data.
|
|
||||||
|
|
||||||
3.2.7. Compression with dynamic Huffman codes (BTYPE=10)
|
|
||||||
|
|
||||||
The Huffman codes for the two alphabets appear in the block
|
|
||||||
immediately after the header bits and before the actual
|
|
||||||
compressed data, first the literal/length code and then the
|
|
||||||
distance code. Each code is defined by a sequence of code
|
|
||||||
lengths, as discussed in Paragraph 3.2.2, above. For even
|
|
||||||
greater compactness, the code length sequences themselves are
|
|
||||||
compressed using a Huffman code. The alphabet for code lengths
|
|
||||||
is as follows:
|
|
||||||
|
|
||||||
0 - 15: Represent code lengths of 0 - 15
|
|
||||||
16: Copy the previous code length 3 - 6 times.
|
|
||||||
The next 2 bits indicate repeat length
|
|
||||||
(0 = 3, ... , 3 = 6)
|
|
||||||
Example: Codes 8, 16 (+2 bits 11),
|
|
||||||
16 (+2 bits 10) will expand to
|
|
||||||
12 code lengths of 8 (1 + 6 + 5)
|
|
||||||
17: Repeat a code length of 0 for 3 - 10 times.
|
|
||||||
(3 bits of length)
|
|
||||||
18: Repeat a code length of 0 for 11 - 138 times
|
|
||||||
(7 bits of length)
|
|
||||||
|
|
||||||
A code length of 0 indicates that the corresponding symbol in
|
|
||||||
the literal/length or distance alphabet will not occur in the
|
|
||||||
block, and should not participate in the Huffman code
|
|
||||||
construction algorithm given earlier. If only one distance
|
|
||||||
code is used, it is encoded using one bit, not zero bits; in
|
|
||||||
this case there is a single code length of one, with one unused
|
|
||||||
code. One distance code of zero bits means that there are no
|
|
||||||
distance codes used at all (the data is all literals).
|
|
||||||
|
|
||||||
We can now define the format of the block:
|
|
||||||
|
|
||||||
5 Bits: HLIT, # of Literal/Length codes - 257 (257 - 286)
|
|
||||||
5 Bits: HDIST, # of Distance codes - 1 (1 - 32)
|
|
||||||
4 Bits: HCLEN, # of Code Length codes - 4 (4 - 19)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 13]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
(HCLEN + 4) x 3 bits: code lengths for the code length
|
|
||||||
alphabet given just above, in the order: 16, 17, 18,
|
|
||||||
0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
|
|
||||||
|
|
||||||
These code lengths are interpreted as 3-bit integers
|
|
||||||
(0-7); as above, a code length of 0 means the
|
|
||||||
corresponding symbol (literal/length or distance code
|
|
||||||
length) is not used.
|
|
||||||
|
|
||||||
HLIT + 257 code lengths for the literal/length alphabet,
|
|
||||||
encoded using the code length Huffman code
|
|
||||||
|
|
||||||
HDIST + 1 code lengths for the distance alphabet,
|
|
||||||
encoded using the code length Huffman code
|
|
||||||
|
|
||||||
The actual compressed data of the block,
|
|
||||||
encoded using the literal/length and distance Huffman
|
|
||||||
codes
|
|
||||||
|
|
||||||
The literal/length symbol 256 (end of data),
|
|
||||||
encoded using the literal/length Huffman code
|
|
||||||
|
|
||||||
The code length repeat codes can cross from HLIT + 257 to the
|
|
||||||
HDIST + 1 code lengths. In other words, all code lengths form
|
|
||||||
a single sequence of HLIT + HDIST + 258 values.
|
|
||||||
|
|
||||||
3.3. Compliance
|
|
||||||
|
|
||||||
A compressor may limit further the ranges of values specified in
|
|
||||||
the previous section and still be compliant; for example, it may
|
|
||||||
limit the range of backward pointers to some value smaller than
|
|
||||||
32K. Similarly, a compressor may limit the size of blocks so that
|
|
||||||
a compressible block fits in memory.
|
|
||||||
|
|
||||||
A compliant decompressor must accept the full range of possible
|
|
||||||
values defined in the previous section, and must accept blocks of
|
|
||||||
arbitrary size.
|
|
||||||
|
|
||||||
4. Compression algorithm details
|
|
||||||
|
|
||||||
While it is the intent of this document to define the "deflate"
|
|
||||||
compressed data format without reference to any particular
|
|
||||||
compression algorithm, the format is related to the compressed
|
|
||||||
formats produced by LZ77 (Lempel-Ziv 1977, see reference [2] below);
|
|
||||||
since many variations of LZ77 are patented, it is strongly
|
|
||||||
recommended that the implementor of a compressor follow the general
|
|
||||||
algorithm presented here, which is known not to be patented per se.
|
|
||||||
The material in this section is not part of the definition of the
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 14]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
specification per se, and a compressor need not follow it in order to
|
|
||||||
be compliant.
|
|
||||||
|
|
||||||
The compressor terminates a block when it determines that starting a
|
|
||||||
new block with fresh trees would be useful, or when the block size
|
|
||||||
fills up the compressor's block buffer.
|
|
||||||
|
|
||||||
The compressor uses a chained hash table to find duplicated strings,
|
|
||||||
using a hash function that operates on 3-byte sequences. At any
|
|
||||||
given point during compression, let XYZ be the next 3 input bytes to
|
|
||||||
be examined (not necessarily all different, of course). First, the
|
|
||||||
compressor examines the hash chain for XYZ. If the chain is empty,
|
|
||||||
the compressor simply writes out X as a literal byte and advances one
|
|
||||||
byte in the input. If the hash chain is not empty, indicating that
|
|
||||||
the sequence XYZ (or, if we are unlucky, some other 3 bytes with the
|
|
||||||
same hash function value) has occurred recently, the compressor
|
|
||||||
compares all strings on the XYZ hash chain with the actual input data
|
|
||||||
sequence starting at the current point, and selects the longest
|
|
||||||
match.
|
|
||||||
|
|
||||||
The compressor searches the hash chains starting with the most recent
|
|
||||||
strings, to favor small distances and thus take advantage of the
|
|
||||||
Huffman encoding. The hash chains are singly linked. There are no
|
|
||||||
deletions from the hash chains; the algorithm simply discards matches
|
|
||||||
that are too old. To avoid a worst-case situation, very long hash
|
|
||||||
chains are arbitrarily truncated at a certain length, determined by a
|
|
||||||
run-time parameter.
|
|
||||||
|
|
||||||
To improve overall compression, the compressor optionally defers the
|
|
||||||
selection of matches ("lazy matching"): after a match of length N has
|
|
||||||
been found, the compressor searches for a longer match starting at
|
|
||||||
the next input byte. If it finds a longer match, it truncates the
|
|
||||||
previous match to a length of one (thus producing a single literal
|
|
||||||
byte) and then emits the longer match. Otherwise, it emits the
|
|
||||||
original match, and, as described above, advances N bytes before
|
|
||||||
continuing.
|
|
||||||
|
|
||||||
Run-time parameters also control this "lazy match" procedure. If
|
|
||||||
compression ratio is most important, the compressor attempts a
|
|
||||||
complete second search regardless of the length of the first match.
|
|
||||||
In the normal case, if the current match is "long enough", the
|
|
||||||
compressor reduces the search for a longer match, thus speeding up
|
|
||||||
the process. If speed is most important, the compressor inserts new
|
|
||||||
strings in the hash table only when no match was found, or when the
|
|
||||||
match is not "too long". This degrades the compression ratio but
|
|
||||||
saves time since there are both fewer insertions and fewer searches.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 15]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
5. References
|
|
||||||
|
|
||||||
[1] Huffman, D. A., "A Method for the Construction of Minimum
|
|
||||||
Redundancy Codes", Proceedings of the Institute of Radio
|
|
||||||
Engineers, September 1952, Volume 40, Number 9, pp. 1098-1101.
|
|
||||||
|
|
||||||
[2] Ziv J., Lempel A., "A Universal Algorithm for Sequential Data
|
|
||||||
Compression", IEEE Transactions on Information Theory, Vol. 23,
|
|
||||||
No. 3, pp. 337-343.
|
|
||||||
|
|
||||||
[3] Gailly, J.-L., and Adler, M., ZLIB documentation and sources,
|
|
||||||
available in ftp://ftp.uu.net/pub/archiving/zip/doc/
|
|
||||||
|
|
||||||
[4] Gailly, J.-L., and Adler, M., GZIP documentation and sources,
|
|
||||||
available as gzip-*.tar in ftp://prep.ai.mit.edu/pub/gnu/
|
|
||||||
|
|
||||||
[5] Schwartz, E. S., and Kallick, B. "Generating a canonical prefix
|
|
||||||
encoding." Comm. ACM, 7,3 (Mar. 1964), pp. 166-169.
|
|
||||||
|
|
||||||
[6] Hirschberg and Lelewer, "Efficient decoding of prefix codes,"
|
|
||||||
Comm. ACM, 33,4, April 1990, pp. 449-459.
|
|
||||||
|
|
||||||
6. Security Considerations
|
|
||||||
|
|
||||||
Any data compression method involves the reduction of redundancy in
|
|
||||||
the data. Consequently, any corruption of the data is likely to have
|
|
||||||
severe effects and be difficult to correct. Uncompressed text, on
|
|
||||||
the other hand, will probably still be readable despite the presence
|
|
||||||
of some corrupted bytes.
|
|
||||||
|
|
||||||
It is recommended that systems using this data format provide some
|
|
||||||
means of validating the integrity of the compressed data. See
|
|
||||||
reference [3], for example.
|
|
||||||
|
|
||||||
7. Source code
|
|
||||||
|
|
||||||
Source code for a C language implementation of a "deflate" compliant
|
|
||||||
compressor and decompressor is available within the zlib package at
|
|
||||||
ftp://ftp.uu.net/pub/archiving/zip/zlib/.
|
|
||||||
|
|
||||||
8. Acknowledgements
|
|
||||||
|
|
||||||
Trademarks cited in this document are the property of their
|
|
||||||
respective owners.
|
|
||||||
|
|
||||||
Phil Katz designed the deflate format. Jean-Loup Gailly and Mark
|
|
||||||
Adler wrote the related software described in this specification.
|
|
||||||
Glenn Randers-Pehrson converted this document to RFC and HTML format.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 16]
|
|
||||||
|
|
||||||
RFC 1951 DEFLATE Compressed Data Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
9. Author's Address
|
|
||||||
|
|
||||||
L. Peter Deutsch
|
|
||||||
Aladdin Enterprises
|
|
||||||
203 Santa Margarita Ave.
|
|
||||||
Menlo Park, CA 94025
|
|
||||||
|
|
||||||
Phone: (415) 322-0103 (AM only)
|
|
||||||
FAX: (415) 322-1734
|
|
||||||
EMail: <ghost@aladdin.com>
|
|
||||||
|
|
||||||
Questions about the technical content of this specification can be
|
|
||||||
sent by email to:
|
|
||||||
|
|
||||||
Jean-Loup Gailly <gzip@prep.ai.mit.edu> and
|
|
||||||
Mark Adler <madler@alumni.caltech.edu>
|
|
||||||
|
|
||||||
Editorial comments on this specification can be sent by email to:
|
|
||||||
|
|
||||||
L. Peter Deutsch <ghost@aladdin.com> and
|
|
||||||
Glenn Randers-Pehrson <randeg@alumni.rpi.edu>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 17]
BIN	lib/std/compress/testdata/rfc1951.txt.fixed.z.9 (vendored)
Binary file not shown.
BIN	lib/std/compress/testdata/rfc1951.txt.z.0 (vendored)
Binary file not shown.
BIN	lib/std/compress/testdata/rfc1951.txt.z.9 (vendored)
Binary file not shown.
675	lib/std/compress/testdata/rfc1952.txt (vendored)
@ -1,675 +0,0 @@
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Network Working Group P. Deutsch
|
|
||||||
Request for Comments: 1952 Aladdin Enterprises
|
|
||||||
Category: Informational May 1996
|
|
||||||
|
|
||||||
|
|
||||||
GZIP file format specification version 4.3
|
|
||||||
|
|
||||||
Status of This Memo
|
|
||||||
|
|
||||||
This memo provides information for the Internet community. This memo
|
|
||||||
does not specify an Internet standard of any kind. Distribution of
|
|
||||||
this memo is unlimited.
|
|
||||||
|
|
||||||
IESG Note:
|
|
||||||
|
|
||||||
The IESG takes no position on the validity of any Intellectual
|
|
||||||
Property Rights statements contained in this document.
|
|
||||||
|
|
||||||
Notices
|
|
||||||
|
|
||||||
Copyright (c) 1996 L. Peter Deutsch
|
|
||||||
|
|
||||||
Permission is granted to copy and distribute this document for any
|
|
||||||
purpose and without charge, including translations into other
|
|
||||||
languages and incorporation into compilations, provided that the
|
|
||||||
copyright notice and this notice are preserved, and that any
|
|
||||||
substantive changes or deletions from the original are clearly
|
|
||||||
marked.
|
|
||||||
|
|
||||||
A pointer to the latest version of this and related documentation in
|
|
||||||
HTML format can be found at the URL
|
|
||||||
<ftp://ftp.uu.net/graphics/png/documents/zlib/zdoc-index.html>.
|
|
||||||
|
|
||||||
Abstract
|
|
||||||
|
|
||||||
This specification defines a lossless compressed data format that is
|
|
||||||
compatible with the widely used GZIP utility. The format includes a
|
|
||||||
cyclic redundancy check value for detecting data corruption. The
|
|
||||||
format presently uses the DEFLATE method of compression but can be
|
|
||||||
easily extended to use other compression methods. The format can be
|
|
||||||
implemented readily in a manner not covered by patents.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 1]
|
|
||||||
|
|
||||||
RFC 1952 GZIP File Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
Table of Contents
|
|
||||||
|
|
||||||
1. Introduction ................................................... 2
|
|
||||||
1.1. Purpose ................................................... 2
|
|
||||||
1.2. Intended audience ......................................... 3
|
|
||||||
1.3. Scope ..................................................... 3
|
|
||||||
1.4. Compliance ................................................ 3
|
|
||||||
1.5. Definitions of terms and conventions used ................. 3
|
|
||||||
1.6. Changes from previous versions ............................ 3
|
|
||||||
2. Detailed specification ......................................... 4
|
|
||||||
2.1. Overall conventions ....................................... 4
|
|
||||||
2.2. File format ............................................... 5
|
|
||||||
2.3. Member format ............................................. 5
|
|
||||||
2.3.1. Member header and trailer ........................... 6
|
|
||||||
2.3.1.1. Extra field ................................... 8
|
|
||||||
2.3.1.2. Compliance .................................... 9
|
|
||||||
3. References .................................................. 9
|
|
||||||
4. Security Considerations .................................... 10
|
|
||||||
5. Acknowledgements ........................................... 10
|
|
||||||
6. Author's Address ........................................... 10
|
|
||||||
7. Appendix: Jean-Loup Gailly's gzip utility .................. 11
|
|
||||||
8. Appendix: Sample CRC Code .................................. 11
|
|
||||||
|
|
||||||
1. Introduction
|
|
||||||
|
|
||||||
1.1. Purpose
|
|
||||||
|
|
||||||
The purpose of this specification is to define a lossless
|
|
||||||
compressed data format that:
|
|
||||||
|
|
||||||
* Is independent of CPU type, operating system, file system,
|
|
||||||
and character set, and hence can be used for interchange;
|
|
||||||
* Can compress or decompress a data stream (as opposed to a
|
|
||||||
randomly accessible file) to produce another data stream,
|
|
||||||
using only an a priori bounded amount of intermediate
|
|
||||||
storage, and hence can be used in data communications or
|
|
||||||
similar structures such as Unix filters;
|
|
||||||
* Compresses data with efficiency comparable to the best
|
|
||||||
currently available general-purpose compression methods,
|
|
||||||
and in particular considerably better than the "compress"
|
|
||||||
program;
|
|
||||||
* Can be implemented readily in a manner not covered by
|
|
||||||
patents, and hence can be practiced freely;
|
|
||||||
* Is compatible with the file format produced by the current
|
|
||||||
widely used gzip utility, in that conforming decompressors
|
|
||||||
will be able to read data produced by the existing gzip
|
|
||||||
compressor.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 2]
|
|
||||||
|
|
||||||
RFC 1952 GZIP File Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
The data format defined by this specification does not attempt to:
|
|
||||||
|
|
||||||
* Provide random access to compressed data;
|
|
||||||
* Compress specialized data (e.g., raster graphics) as well as
|
|
||||||
the best currently available specialized algorithms.
|
|
||||||
|
|
||||||
1.2. Intended audience
|
|
||||||
|
|
||||||
This specification is intended for use by implementors of software
|
|
||||||
to compress data into gzip format and/or decompress data from gzip
|
|
||||||
format.
|
|
||||||
|
|
||||||
The text of the specification assumes a basic background in
|
|
||||||
programming at the level of bits and other primitive data
|
|
||||||
representations.
|
|
||||||
|
|
||||||
1.3. Scope
|
|
||||||
|
|
||||||
The specification specifies a compression method and a file format
|
|
||||||
(the latter assuming only that a file can store a sequence of
|
|
||||||
arbitrary bytes). It does not specify any particular interface to
|
|
||||||
a file system or anything about character sets or encodings
|
|
||||||
(except for file names and comments, which are optional).
|
|
||||||
|
|
||||||
1.4. Compliance
|
|
||||||
|
|
||||||
Unless otherwise indicated below, a compliant decompressor must be
|
|
||||||
able to accept and decompress any file that conforms to all the
|
|
||||||
specifications presented here; a compliant compressor must produce
|
|
||||||
files that conform to all the specifications presented here. The
|
|
||||||
material in the appendices is not part of the specification per se
|
|
||||||
and is not relevant to compliance.
|
|
||||||
|
|
||||||
1.5. Definitions of terms and conventions used
|
|
||||||
|
|
||||||
byte: 8 bits stored or transmitted as a unit (same as an octet).
|
|
||||||
(For this specification, a byte is exactly 8 bits, even on
|
|
||||||
machines which store a character on a number of bits different
|
|
||||||
from 8.) See below for the numbering of bits within a byte.
|
|
||||||
|
|
||||||
1.6. Changes from previous versions
|
|
||||||
|
|
||||||
There have been no technical changes to the gzip format since
|
|
||||||
version 4.1 of this specification. In version 4.2, some
|
|
||||||
terminology was changed, and the sample CRC code was rewritten for
|
|
||||||
clarity and to eliminate the requirement for the caller to do pre-
|
|
||||||
and post-conditioning. Version 4.3 is a conversion of the
|
|
||||||
specification to RFC style.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 3]
|
|
||||||
|
|
||||||
RFC 1952 GZIP File Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
2. Detailed specification
|
|
||||||
|
|
||||||
2.1. Overall conventions
|
|
||||||
|
|
||||||
In the diagrams below, a box like this:
|
|
||||||
|
|
||||||
+---+
|
|
||||||
| | <-- the vertical bars might be missing
|
|
||||||
+---+
|
|
||||||
|
|
||||||
represents one byte; a box like this:
|
|
||||||
|
|
||||||
+==============+
|
|
||||||
| |
|
|
||||||
+==============+
|
|
||||||
|
|
||||||
represents a variable number of bytes.
|
|
||||||
|
|
||||||
Bytes stored within a computer do not have a "bit order", since
|
|
||||||
they are always treated as a unit. However, a byte considered as
|
|
||||||
an integer between 0 and 255 does have a most- and least-
|
|
||||||
significant bit, and since we write numbers with the most-
|
|
||||||
significant digit on the left, we also write bytes with the most-
|
|
||||||
significant bit on the left. In the diagrams below, we number the
|
|
||||||
bits of a byte so that bit 0 is the least-significant bit, i.e.,
|
|
||||||
the bits are numbered:
|
|
||||||
|
|
||||||
+--------+
|
|
||||||
|76543210|
|
|
||||||
+--------+
|
|
||||||
|
|
||||||
This document does not address the issue of the order in which
|
|
||||||
bits of a byte are transmitted on a bit-sequential medium, since
|
|
||||||
the data format described here is byte- rather than bit-oriented.
|
|
||||||
|
|
||||||
Within a computer, a number may occupy multiple bytes. All
|
|
||||||
multi-byte numbers in the format described here are stored with
|
|
||||||
the least-significant byte first (at the lower memory address).
|
|
||||||
For example, the decimal number 520 is stored as:
|
|
||||||
|
|
||||||
0 1
|
|
||||||
+--------+--------+
|
|
||||||
|00001000|00000010|
|
|
||||||
+--------+--------+
|
|
||||||
^ ^
|
|
||||||
| |
|
|
||||||
| + more significant byte = 2 x 256
|
|
||||||
+ less significant byte = 8
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 4]
|
|
||||||
|
|
||||||
RFC 1952 GZIP File Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
2.2. File format
|
|
||||||
|
|
||||||
A gzip file consists of a series of "members" (compressed data
|
|
||||||
sets). The format of each member is specified in the following
|
|
||||||
section. The members simply appear one after another in the file,
|
|
||||||
with no additional information before, between, or after them.
|
|
||||||
|
|
||||||
2.3. Member format
|
|
||||||
|
|
||||||
Each member has the following structure:
|
|
||||||
|
|
||||||
+---+---+---+---+---+---+---+---+---+---+
|
|
||||||
|ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->)
|
|
||||||
+---+---+---+---+---+---+---+---+---+---+
|
|
||||||
|
|
||||||
(if FLG.FEXTRA set)
|
|
||||||
|
|
||||||
+---+---+=================================+
|
|
||||||
| XLEN |...XLEN bytes of "extra field"...| (more-->)
|
|
||||||
+---+---+=================================+
|
|
||||||
|
|
||||||
(if FLG.FNAME set)
|
|
||||||
|
|
||||||
+=========================================+
|
|
||||||
|...original file name, zero-terminated...| (more-->)
|
|
||||||
+=========================================+
|
|
||||||
|
|
||||||
(if FLG.FCOMMENT set)
|
|
||||||
|
|
||||||
+===================================+
|
|
||||||
|...file comment, zero-terminated...| (more-->)
|
|
||||||
+===================================+
|
|
||||||
|
|
||||||
(if FLG.FHCRC set)
|
|
||||||
|
|
||||||
+---+---+
|
|
||||||
| CRC16 |
|
|
||||||
+---+---+
|
|
||||||
|
|
||||||
+=======================+
|
|
||||||
|...compressed blocks...| (more-->)
|
|
||||||
+=======================+
|
|
||||||
|
|
||||||
0 1 2 3 4 5 6 7
|
|
||||||
+---+---+---+---+---+---+---+---+
|
|
||||||
| CRC32 | ISIZE |
|
|
||||||
+---+---+---+---+---+---+---+---+
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 5]
|
|
||||||
|
|
||||||
RFC 1952 GZIP File Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
2.3.1. Member header and trailer
|
|
||||||
|
|
||||||
ID1 (IDentification 1)
|
|
||||||
ID2 (IDentification 2)
|
|
||||||
These have the fixed values ID1 = 31 (0x1f, \037), ID2 = 139
|
|
||||||
(0x8b, \213), to identify the file as being in gzip format.
|
|
||||||
|
|
||||||
CM (Compression Method)
|
|
||||||
This identifies the compression method used in the file. CM
|
|
||||||
= 0-7 are reserved. CM = 8 denotes the "deflate"
|
|
||||||
compression method, which is the one customarily used by
|
|
||||||
gzip and which is documented elsewhere.
|
|
||||||
|
|
||||||
FLG (FLaGs)
|
|
||||||
This flag byte is divided into individual bits as follows:
|
|
||||||
|
|
||||||
bit 0 FTEXT
|
|
||||||
bit 1 FHCRC
|
|
||||||
bit 2 FEXTRA
|
|
||||||
bit 3 FNAME
|
|
||||||
bit 4 FCOMMENT
|
|
||||||
bit 5 reserved
|
|
||||||
bit 6 reserved
|
|
||||||
bit 7 reserved
|
|
||||||
|
|
||||||
If FTEXT is set, the file is probably ASCII text. This is
|
|
||||||
an optional indication, which the compressor may set by
|
|
||||||
checking a small amount of the input data to see whether any
|
|
||||||
non-ASCII characters are present. In case of doubt, FTEXT
|
|
||||||
is cleared, indicating binary data. For systems which have
|
|
||||||
different file formats for ascii text and binary data, the
|
|
||||||
decompressor can use FTEXT to choose the appropriate format.
|
|
||||||
We deliberately do not specify the algorithm used to set
|
|
||||||
this bit, since a compressor always has the option of
|
|
||||||
leaving it cleared and a decompressor always has the option
|
|
||||||
of ignoring it and letting some other program handle issues
|
|
||||||
of data conversion.
|
|
||||||
|
|
||||||
If FHCRC is set, a CRC16 for the gzip header is present,
|
|
||||||
immediately before the compressed data. The CRC16 consists
|
|
||||||
of the two least significant bytes of the CRC32 for all
|
|
||||||
bytes of the gzip header up to and not including the CRC16.
|
|
||||||
[The FHCRC bit was never set by versions of gzip up to
|
|
||||||
1.2.4, even though it was documented with a different
|
|
||||||
meaning in gzip 1.2.4.]
|
|
||||||
|
|
||||||
If FEXTRA is set, optional extra fields are present, as
|
|
||||||
described in a following section.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Deutsch Informational [Page 6]
|
|
||||||
|
|
||||||
RFC 1952 GZIP File Format Specification May 1996
|
|
||||||
|
|
||||||
|
|
||||||
If FNAME is set, an original file name is present,
|
|
||||||
terminated by a zero byte. The name must consist of ISO
|
|
||||||
8859-1 (LATIN-1) characters; on operating systems using
|
|
||||||
EBCDIC or any other character set for file names, the name
|
|
||||||
must be translated to the ISO LATIN-1 character set. This
|
|
||||||
is the original name of the file being compressed, with any
|
|
||||||
directory components removed, and, if the file being
|
|
||||||
compressed is on a file system with case insensitive names,
|
|
||||||
forced to lower case. There is no original file name if the
|
|
||||||
data was compressed from a source other than a named file;
|
|
||||||
for example, if the source was stdin on a Unix system, there
|
|
||||||
is no file name.
|
|
||||||
|
|
||||||
If FCOMMENT is set, a zero-terminated file comment is
|
|
||||||
present. This comment is not interpreted; it is only
|
|
||||||
intended for human consumption. The comment must consist of
|
|
||||||
ISO 8859-1 (LATIN-1) characters. Line breaks should be
|
|
||||||
denoted by a single line feed character (10 decimal).
|
|
||||||
|
|
||||||
Reserved FLG bits must be zero.
|
|
||||||
|
|
   MTIME (Modification TIME)
      This gives the most recent modification time of the original
      file being compressed.  The time is in Unix format, i.e.,
      seconds since 00:00:00 GMT, Jan. 1, 1970.  (Note that this
      may cause problems for MS-DOS and other systems that use
      local rather than Universal time.)  If the compressed data
      did not come from a file, MTIME is set to the time at which
      compression started.  MTIME = 0 means no time stamp is
      available.

   XFL (eXtra FLags)
      These flags are available for use by specific compression
      methods.  The "deflate" method (CM = 8) sets these flags as
      follows:

         XFL = 2 - compressor used maximum compression,
                   slowest algorithm
         XFL = 4 - compressor used fastest algorithm

   OS (Operating System)
      This identifies the type of file system on which compression
      took place.  This may be useful in determining end-of-line
      convention for text files.  The currently defined values are
      as follows:

           0 - FAT filesystem (MS-DOS, OS/2, NT/Win32)
           1 - Amiga
           2 - VMS (or OpenVMS)
           3 - Unix
           4 - VM/CMS
           5 - Atari TOS
           6 - HPFS filesystem (OS/2, NT)
           7 - Macintosh
           8 - Z-System
           9 - CP/M
          10 - TOPS-20
          11 - NTFS filesystem (NT)
          12 - QDOS
          13 - Acorn RISCOS
         255 - unknown
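   Purely for illustration, these codes can be modelled as a non-exhaustive
   Zig enum; the name GzipOs is invented for this sketch and is not a
   library type:

      const GzipOs = enum(u8) {
          fat = 0,
          amiga = 1,
          vms = 2,
          unix = 3,
          vm_cms = 4,
          atari_tos = 5,
          hpfs = 6,
          macintosh = 7,
          z_system = 8,
          cp_m = 9,
          tops_20 = 10,
          ntfs = 11,
          qdos = 12,
          acorn_riscos = 13,
          unknown = 255,
          _, // other values are not defined by the specification
      };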
   XLEN (eXtra LENgth)
      If FLG.FEXTRA is set, this gives the length of the optional
      extra field.  See below for details.

   CRC32 (CRC-32)
      This contains a Cyclic Redundancy Check value of the
      uncompressed data computed according to the CRC-32 algorithm
      used in the ISO 3309 standard and in section 8.1.1.6.2 of
      ITU-T recommendation V.42.  (See http://www.iso.ch for
      ordering ISO documents. See gopher://info.itu.ch for an
      online version of ITU-T V.42.)

   ISIZE (Input SIZE)
      This contains the size of the original (uncompressed) input
      data modulo 2^32.
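   For illustration, a decompressor typically verifies both trailer fields
   once the deflate stream ends.  This Zig sketch uses invented function
   and error names and assumes a generic reader that provides readInt;
   both trailer fields are stored least significant byte first:

      fn checkTrailer(reader: anytype, computed_crc: u32, bytes_out: u64) !void {
          const stored_crc = try reader.readInt(u32, .little);
          const stored_isize = try reader.readInt(u32, .little);
          if (stored_crc != computed_crc) return error.WrongChecksum;
          // ISIZE is the uncompressed size modulo 2^32.
          if (stored_isize != @as(u32, @truncate(bytes_out))) return error.WrongSize;
      }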
2.3.1.1. Extra field

   If the FLG.FEXTRA bit is set, an "extra field" is present in
   the header, with total length XLEN bytes.  It consists of a
   series of subfields, each of the form:

      +---+---+---+---+==================================+
      |SI1|SI2|  LEN  |... LEN bytes of subfield data ...|
      +---+---+---+---+==================================+

   SI1 and SI2 provide a subfield ID, typically two ASCII letters
   with some mnemonic value.  Jean-Loup Gailly
   <gzip@prep.ai.mit.edu> is maintaining a registry of subfield
   IDs; please send him any subfield ID you wish to use.  Subfield
   IDs with SI2 = 0 are reserved for future use.  The following
   IDs are currently defined:

      SI1         SI2         Data
      ----------  ----------  ----
      0x41 ('A')  0x70 ('P')  Apollo file type information

   LEN gives the length of the subfield data, excluding the 4
   initial bytes.
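   As an illustrative sketch (the function and error names are invented),
   the subfields of an extra field can be walked like this in Zig, reading
   LEN least significant byte first per the gzip byte-order convention:

      fn iterateSubfields(extra: []const u8) !void {
          var i: usize = 0;
          while (i + 4 <= extra.len) {
              const si1 = extra[i];
              const si2 = extra[i + 1];
              const len = @as(u16, extra[i + 2]) | (@as(u16, extra[i + 3]) << 8);
              i += 4;
              if (i + len > extra.len) return error.TruncatedExtraField;
              const data = extra[i .. i + len];
              _ = .{ si1, si2, data }; // dispatch on the subfield ID here
              i += len;
          }
      }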
2.3.1.2. Compliance

   A compliant compressor must produce files with correct ID1,
   ID2, CM, CRC32, and ISIZE, but may set all the other fields in
   the fixed-length part of the header to default values (255 for
   OS, 0 for all others).  The compressor must set all reserved
   bits to zero.

   A compliant decompressor must check ID1, ID2, and CM, and
   provide an error indication if any of these have incorrect
   values.  It must examine FEXTRA/XLEN, FNAME, FCOMMENT and FHCRC
   at least so it can skip over the optional fields if they are
   present.  It need not examine any other part of the header or
   trailer; in particular, a decompressor may ignore FTEXT and OS
   and always produce binary output, and still be compliant.  A
   compliant decompressor must give an error indication if any
   reserved bit is non-zero, since such a bit could indicate the
   presence of a new field that would cause subsequent data to be
   interpreted incorrectly.
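   A minimal Zig sketch of those mandatory decompressor checks; the
   function and error names are invented, and the magic values 0x1f/0x8b
   for ID1/ID2 and CM = 8 come from the fixed-header definition earlier in
   this specification:

      fn checkFixedHeader(id1: u8, id2: u8, cm: u8, flg: u8) !void {
          if (id1 != 0x1f or id2 != 0x8b) return error.BadGzipHeader;
          if (cm != 8) return error.UnsupportedCompressionMethod;
          if ((flg & 0xe0) != 0) return error.ReservedFlagSet; // bits 5..7
      }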
3. References

   [1] "Information Processing - 8-bit single-byte coded graphic
       character sets - Part 1: Latin alphabet No.1" (ISO 8859-1:1987).
       The ISO 8859-1 (Latin-1) character set is a superset of 7-bit
       ASCII. Files defining this character set are available as
       iso_8859-1.* in ftp://ftp.uu.net/graphics/png/documents/

   [2] ISO 3309

   [3] ITU-T recommendation V.42

   [4] Deutsch, L.P., "DEFLATE Compressed Data Format Specification",
       available in ftp://ftp.uu.net/pub/archiving/zip/doc/

   [5] Gailly, J.-L., GZIP documentation, available as gzip-*.tar in
       ftp://prep.ai.mit.edu/pub/gnu/

   [6] Sarwate, D.V., "Computation of Cyclic Redundancy Checks via Table
       Look-Up", Communications of the ACM, 31(8), pp. 1008-1013.

   [7] Schwaderer, W.D., "CRC Calculation", April 85 PC Tech Journal,
       pp. 118-133.

   [8] ftp://ftp.adelaide.edu.au/pub/rocksoft/papers/crc_v3.txt,
       describing the CRC concept.

4. Security Considerations

   Any data compression method involves the reduction of redundancy in
   the data.  Consequently, any corruption of the data is likely to have
   severe effects and be difficult to correct.  Uncompressed text, on
   the other hand, will probably still be readable despite the presence
   of some corrupted bytes.

   It is recommended that systems using this data format provide some
   means of validating the integrity of the compressed data, such as by
   setting and checking the CRC-32 check value.

5. Acknowledgements

   Trademarks cited in this document are the property of their
   respective owners.

   Jean-Loup Gailly designed the gzip format and wrote, with Mark Adler,
   the related software described in this specification.  Glenn
   Randers-Pehrson converted this document to RFC and HTML format.

6. Author's Address

   L. Peter Deutsch
   Aladdin Enterprises
   203 Santa Margarita Ave.
   Menlo Park, CA 94025

   Phone: (415) 322-0103 (AM only)
   FAX:   (415) 322-1734
   EMail: <ghost@aladdin.com>

   Questions about the technical content of this specification can be
   sent by email to:

      Jean-Loup Gailly <gzip@prep.ai.mit.edu> and
      Mark Adler <madler@alumni.caltech.edu>

   Editorial comments on this specification can be sent by email to:

      L. Peter Deutsch <ghost@aladdin.com> and
      Glenn Randers-Pehrson <randeg@alumni.rpi.edu>

7. Appendix: Jean-Loup Gailly's gzip utility

   The most widely used implementation of gzip compression, and the
   original documentation on which this specification is based, were
   created by Jean-Loup Gailly <gzip@prep.ai.mit.edu>.  Since this
   implementation is a de facto standard, we mention some more of its
   features here.  Again, the material in this section is not part of
   the specification per se, and implementations need not follow it to
   be compliant.

   When compressing or decompressing a file, gzip preserves the
   protection, ownership, and modification time attributes on the local
   file system, since there is no provision for representing protection
   attributes in the gzip file format itself.  Since the file format
   includes a modification time, the gzip decompressor provides a
   command line switch that assigns the modification time from the file,
   rather than the local modification time of the compressed input, to
   the decompressed output.

8. Appendix: Sample CRC Code

   The following sample code represents a practical implementation of
   the CRC (Cyclic Redundancy Check).  (See also ISO 3309 and ITU-T V.42
   for a formal specification.)

   The sample code is in the ANSI C programming language.  Non C users
   may find it easier to read with these hints:

      &      Bitwise AND operator.
      ^      Bitwise exclusive-OR operator.
      >>     Bitwise right shift operator.  When applied to an
             unsigned quantity, as here, right shift inserts zero
             bit(s) at the left.
      !      Logical NOT operator.
      ++     "n++" increments the variable n.
      0xNNN  0x introduces a hexadecimal (base 16) constant.
             Suffix L indicates a long value (at least 32 bits).

      /* Table of CRCs of all 8-bit messages. */
      unsigned long crc_table[256];

      /* Flag: has the table been computed? Initially false. */
      int crc_table_computed = 0;

      /* Make the table for a fast CRC. */
      void make_crc_table(void)
      {
          unsigned long c;
          int n, k;

          for (n = 0; n < 256; n++) {
              c = (unsigned long) n;
              for (k = 0; k < 8; k++) {
                  if (c & 1) {
                      c = 0xedb88320L ^ (c >> 1);
                  } else {
                      c = c >> 1;
                  }
              }
              crc_table[n] = c;
          }
          crc_table_computed = 1;
      }

      /*
         Update a running crc with the bytes buf[0..len-1] and return
         the updated crc.  The crc should be initialized to zero.  Pre- and
         post-conditioning (one's complement) is performed within this
         function so it shouldn't be done by the caller.  Usage example:

            unsigned long crc = 0L;

            while (read_buffer(buffer, length) != EOF) {
                crc = update_crc(crc, buffer, length);
            }
            if (crc != original_crc) error();
      */
      unsigned long update_crc(unsigned long crc,
                               unsigned char *buf, int len)
      {
          unsigned long c = crc ^ 0xffffffffL;
          int n;

          if (!crc_table_computed)
              make_crc_table();
          for (n = 0; n < len; n++) {
              c = crc_table[(c ^ buf[n]) & 0xff] ^ (c >> 8);
          }
          return c ^ 0xffffffffL;
      }

      /* Return the CRC of the bytes buf[0..len-1]. */
      unsigned long crc(unsigned char *buf, int len)
      {
          return update_crc(0L, buf, len);
      }
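   In Zig's standard library the same polynomial (0xedb88320) is exposed as
   std.hash.Crc32; the following test is only an illustrative cross-check
   against two well-known CRC-32 check values, not part of the RFC:

      const std = @import("std");

      test "crc32 check values" {
          try std.testing.expectEqual(@as(u32, 0), std.hash.Crc32.hash(""));
          try std.testing.expectEqual(@as(u32, 0xcbf43926), std.hash.Crc32.hash("123456789"));
      }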
BIN
lib/std/compress/testdata/rfc1952.txt.gz
vendored
Binary file not shown.
@ -1,282 +0,0 @@
//
// Compressor/Decompressor for ZLIB data streams (RFC1950)

const std = @import("std");
const io = std.io;
const fs = std.fs;
const testing = std.testing;
const mem = std.mem;
const deflate = @import("deflate.zig");

// Zlib header format as specified in RFC1950
const ZLibHeader = packed struct {
    checksum: u5,
    preset_dict: u1,
    compression_level: u2,
    compression_method: u4,
    compression_info: u4,

    const DEFLATE = 8;
    const WINDOW_32K = 7;
};

pub fn DecompressStream(comptime ReaderType: type) type {
    return struct {
        const Self = @This();

        pub const Error = ReaderType.Error ||
            deflate.Decompressor(ReaderType).Error ||
            error{ WrongChecksum, Unsupported };
        pub const Reader = io.Reader(*Self, Error, read);

        allocator: mem.Allocator,
        inflater: deflate.Decompressor(ReaderType),
        in_reader: ReaderType,
        hasher: std.hash.Adler32,

        fn init(allocator: mem.Allocator, source: ReaderType) !Self {
            // Zlib header format is specified in RFC1950
            const header_u16 = try source.readInt(u16, .big);

            // verify the header checksum
            if (header_u16 % 31 != 0)
                return error.BadHeader;
            const header = @as(ZLibHeader, @bitCast(header_u16));

            // The CM field must be 8 to indicate the use of DEFLATE
            if (header.compression_method != ZLibHeader.DEFLATE)
                return error.InvalidCompression;
            // CINFO is the base-2 logarithm of the LZ77 window size, minus 8.
            // Values above 7 are unspecified and therefore rejected.
            if (header.compression_info > ZLibHeader.WINDOW_32K)
                return error.InvalidWindowSize;

            const dictionary = null;
            // TODO: Support this case
            if (header.preset_dict != 0)
                return error.Unsupported;

            return Self{
                .allocator = allocator,
                .inflater = try deflate.decompressor(allocator, source, dictionary),
                .in_reader = source,
                .hasher = std.hash.Adler32.init(),
            };
        }

        pub fn deinit(self: *Self) void {
            self.inflater.deinit();
        }

        // Implements the io.Reader interface
        pub fn read(self: *Self, buffer: []u8) Error!usize {
            if (buffer.len == 0)
                return 0;

            // Read from the compressed stream and update the computed checksum
            const r = try self.inflater.read(buffer);
            if (r != 0) {
                self.hasher.update(buffer[0..r]);
                return r;
            }

            // We've reached the end of stream, check if the checksum matches
            const hash = try self.in_reader.readInt(u32, .big);
            if (hash != self.hasher.final())
                return error.WrongChecksum;

            return 0;
        }

        pub fn reader(self: *Self) Reader {
            return .{ .context = self };
        }
    };
}

pub fn decompressStream(allocator: mem.Allocator, reader: anytype) !DecompressStream(@TypeOf(reader)) {
    return DecompressStream(@TypeOf(reader)).init(allocator, reader);
}

pub const CompressionLevel = enum(u2) {
    no_compression = 0,
    fastest = 1,
    default = 2,
    maximum = 3,
};

pub const CompressStreamOptions = struct {
    level: CompressionLevel = .default,
};

pub fn CompressStream(comptime WriterType: type) type {
    return struct {
        const Self = @This();

        const Error = WriterType.Error ||
            deflate.Compressor(WriterType).Error;
        pub const Writer = io.Writer(*Self, Error, write);

        allocator: mem.Allocator,
        deflator: deflate.Compressor(WriterType),
        in_writer: WriterType,
        hasher: std.hash.Adler32,

        fn init(allocator: mem.Allocator, dest: WriterType, options: CompressStreamOptions) !Self {
            var header = ZLibHeader{
                .compression_info = ZLibHeader.WINDOW_32K,
                .compression_method = ZLibHeader.DEFLATE,
                .compression_level = @intFromEnum(options.level),
                .preset_dict = 0,
                .checksum = 0,
            };
            header.checksum = @as(u5, @truncate(31 - @as(u16, @bitCast(header)) % 31));

            try dest.writeInt(u16, @as(u16, @bitCast(header)), .big);

            const compression_level: deflate.Compression = switch (options.level) {
                .no_compression => .no_compression,
                .fastest => .best_speed,
                .default => .default_compression,
                .maximum => .best_compression,
            };

            return Self{
                .allocator = allocator,
                .deflator = try deflate.compressor(allocator, dest, .{ .level = compression_level }),
                .in_writer = dest,
                .hasher = std.hash.Adler32.init(),
            };
        }

        pub fn write(self: *Self, bytes: []const u8) Error!usize {
            if (bytes.len == 0) {
                return 0;
            }

            const w = try self.deflator.write(bytes);

            self.hasher.update(bytes[0..w]);
            return w;
        }

        pub fn writer(self: *Self) Writer {
            return .{ .context = self };
        }

        pub fn deinit(self: *Self) void {
            self.deflator.deinit();
        }

        pub fn finish(self: *Self) !void {
            const hash = self.hasher.final();
            try self.deflator.close();
            try self.in_writer.writeInt(u32, hash, .big);
        }
    };
}

pub fn compressStream(allocator: mem.Allocator, writer: anytype, options: CompressStreamOptions) !CompressStream(@TypeOf(writer)) {
    return CompressStream(@TypeOf(writer)).init(allocator, writer, options);
}

fn testDecompress(data: []const u8, expected: []const u8) !void {
    var in_stream = io.fixedBufferStream(data);

    var zlib_stream = try decompressStream(testing.allocator, in_stream.reader());
    defer zlib_stream.deinit();

    // Read and decompress the whole file
    const buf = try zlib_stream.reader().readAllAlloc(testing.allocator, std.math.maxInt(usize));
    defer testing.allocator.free(buf);

    // Check against the reference
    try testing.expectEqualSlices(u8, expected, buf);
}

// All the test cases are obtained by compressing the RFC1951 text
//
// https://tools.ietf.org/rfc/rfc1951.txt length=36944 bytes
// SHA256=5ebf4b5b7fe1c3a0c0ab9aa3ac8c0f3853a7dc484905e76e03b0b0f301350009
test "compressed data" {
    const rfc1951_txt = @embedFile("testdata/rfc1951.txt");

    // Compressed with compression level = 0
    try testDecompress(
        @embedFile("testdata/rfc1951.txt.z.0"),
        rfc1951_txt,
    );
    // Compressed with compression level = 9
    try testDecompress(
        @embedFile("testdata/rfc1951.txt.z.9"),
        rfc1951_txt,
    );
    // Compressed with compression level = 9 and fixed Huffman codes
    try testDecompress(
        @embedFile("testdata/rfc1951.txt.fixed.z.9"),
        rfc1951_txt,
    );
}

test "don't read past deflate stream's end" {
    try testDecompress(&[_]u8{
        0x08, 0xd7, 0x63, 0xf8, 0xcf, 0xc0, 0xc0, 0x00, 0xc1, 0xff,
        0xff, 0x43, 0x30, 0x03, 0x03, 0xc3, 0xff, 0xff, 0xff, 0x01,
        0x83, 0x95, 0x0b, 0xf5,
    }, &[_]u8{
        0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
        0x00, 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00, 0x00,
        0x00, 0x00, 0xff, 0xff, 0xff,
    });
}

test "sanity checks" {
    // Truncated header
    try testing.expectError(
        error.EndOfStream,
        testDecompress(&[_]u8{0x78}, ""),
    );
    // Failed FCHECK check
    try testing.expectError(
        error.BadHeader,
        testDecompress(&[_]u8{ 0x78, 0x9D }, ""),
    );
    // Wrong CM
    try testing.expectError(
        error.InvalidCompression,
        testDecompress(&[_]u8{ 0x79, 0x94 }, ""),
    );
    // Wrong CINFO
    try testing.expectError(
        error.InvalidWindowSize,
        testDecompress(&[_]u8{ 0x88, 0x98 }, ""),
    );
    // Wrong checksum
    try testing.expectError(
        error.WrongChecksum,
        testDecompress(&[_]u8{ 0x78, 0xda, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00 }, ""),
    );
    // Truncated checksum
    try testing.expectError(
        error.EndOfStream,
        testDecompress(&[_]u8{ 0x78, 0xda, 0x03, 0x00, 0x00 }, ""),
    );
}

test "compress data" {
    const allocator = testing.allocator;
    const rfc1951_txt = @embedFile("testdata/rfc1951.txt");

    for (std.meta.tags(CompressionLevel)) |level| {
        var compressed_data = std.ArrayList(u8).init(allocator);
        defer compressed_data.deinit();

        var compressor = try compressStream(allocator, compressed_data.writer(), .{ .level = level });
        defer compressor.deinit();

        try compressor.writer().writeAll(rfc1951_txt);
        try compressor.finish();

        try testDecompress(compressed_data.items, rfc1951_txt);
    }
}