const std = @import("std"); const io = std.io; const assert = std.debug.assert; const hc = @import("huffman_encoder.zig"); const consts = @import("consts.zig").huffman; const Token = @import("Token.zig"); const BitWriter = @import("bit_writer.zig").BitWriter; pub fn blockWriter(writer: anytype) BlockWriter(@TypeOf(writer)) { return BlockWriter(@TypeOf(writer)).init(writer); } /// Accepts list of tokens, decides what is best block type to write. What block /// type will provide best compression. Writes header and body of the block. /// pub fn BlockWriter(comptime WriterType: type) type { const BitWriterType = BitWriter(WriterType); return struct { const codegen_order = consts.codegen_order; const end_code_mark = 255; const Self = @This(); pub const Error = BitWriterType.Error; bit_writer: BitWriterType, codegen_freq: [consts.codegen_code_count]u16 = undefined, literal_freq: [consts.max_num_lit]u16 = undefined, distance_freq: [consts.distance_code_count]u16 = undefined, codegen: [consts.max_num_lit + consts.distance_code_count + 1]u8 = undefined, literal_encoding: hc.LiteralEncoder = .{}, distance_encoding: hc.DistanceEncoder = .{}, codegen_encoding: hc.CodegenEncoder = .{}, fixed_literal_encoding: hc.LiteralEncoder, fixed_distance_encoding: hc.DistanceEncoder, huff_distance: hc.DistanceEncoder, pub fn init(writer: WriterType) Self { return .{ .bit_writer = BitWriterType.init(writer), .fixed_literal_encoding = hc.fixedLiteralEncoder(), .fixed_distance_encoding = hc.fixedDistanceEncoder(), .huff_distance = hc.huffmanDistanceEncoder(), }; } /// Flush intrenal bit buffer to the writer. /// Should be called only when bit stream is at byte boundary. /// /// That is after final block; when last byte could be incomplete or /// after stored block; which is aligned to the byte boundary (it has x /// padding bits after first 3 bits). pub fn flush(self: *Self) Error!void { try self.bit_writer.flush(); } pub fn setWriter(self: *Self, new_writer: WriterType) void { self.bit_writer.setWriter(new_writer); } fn writeCode(self: *Self, c: hc.HuffCode) Error!void { try self.bit_writer.writeBits(c.code, c.len); } // RFC 1951 3.2.7 specifies a special run-length encoding for specifying // the literal and distance lengths arrays (which are concatenated into a single // array). This method generates that run-length encoding. // // The result is written into the codegen array, and the frequencies // of each code is written into the codegen_freq array. // Codes 0-15 are single byte codes. Codes 16-18 are followed by additional // information. Code bad_code is an end marker // // num_literals: The number of literals in literal_encoding // num_distances: The number of distances in distance_encoding // lit_enc: The literal encoder to use // dist_enc: The distance encoder to use fn generateCodegen( self: *Self, num_literals: u32, num_distances: u32, lit_enc: *hc.LiteralEncoder, dist_enc: *hc.DistanceEncoder, ) void { for (self.codegen_freq, 0..) |_, i| { self.codegen_freq[i] = 0; } // Note that we are using codegen both as a temporary variable for holding // a copy of the frequencies, and as the place where we put the result. // This is fine because the output is always shorter than the input used // so far. var codegen = &self.codegen; // cache // Copy the concatenated code sizes to codegen. Put a marker at the end. var cgnl = codegen[0..num_literals]; for (cgnl, 0..) |_, i| { cgnl[i] = @as(u8, @intCast(lit_enc.codes[i].len)); } cgnl = codegen[num_literals .. num_literals + num_distances]; for (cgnl, 0..) |_, i| { cgnl[i] = @as(u8, @intCast(dist_enc.codes[i].len)); } codegen[num_literals + num_distances] = end_code_mark; var size = codegen[0]; var count: i32 = 1; var out_index: u32 = 0; var in_index: u32 = 1; while (size != end_code_mark) : (in_index += 1) { // INVARIANT: We have seen "count" copies of size that have not yet // had output generated for them. const next_size = codegen[in_index]; if (next_size == size) { count += 1; continue; } // We need to generate codegen indicating "count" of size. if (size != 0) { codegen[out_index] = size; out_index += 1; self.codegen_freq[size] += 1; count -= 1; while (count >= 3) { var n: i32 = 6; if (n > count) { n = count; } codegen[out_index] = 16; out_index += 1; codegen[out_index] = @as(u8, @intCast(n - 3)); out_index += 1; self.codegen_freq[16] += 1; count -= n; } } else { while (count >= 11) { var n: i32 = 138; if (n > count) { n = count; } codegen[out_index] = 18; out_index += 1; codegen[out_index] = @as(u8, @intCast(n - 11)); out_index += 1; self.codegen_freq[18] += 1; count -= n; } if (count >= 3) { // 3 <= count <= 10 codegen[out_index] = 17; out_index += 1; codegen[out_index] = @as(u8, @intCast(count - 3)); out_index += 1; self.codegen_freq[17] += 1; count = 0; } } count -= 1; while (count >= 0) : (count -= 1) { codegen[out_index] = size; out_index += 1; self.codegen_freq[size] += 1; } // Set up invariant for next time through the loop. size = next_size; count = 1; } // Marker indicating the end of the codegen. codegen[out_index] = end_code_mark; } const DynamicSize = struct { size: u32, num_codegens: u32, }; // dynamicSize returns the size of dynamically encoded data in bits. fn dynamicSize( self: *Self, lit_enc: *hc.LiteralEncoder, // literal encoder dist_enc: *hc.DistanceEncoder, // distance encoder extra_bits: u32, ) DynamicSize { var num_codegens = self.codegen_freq.len; while (num_codegens > 4 and self.codegen_freq[codegen_order[num_codegens - 1]] == 0) { num_codegens -= 1; } const header = 3 + 5 + 5 + 4 + (3 * num_codegens) + self.codegen_encoding.bitLength(self.codegen_freq[0..]) + self.codegen_freq[16] * 2 + self.codegen_freq[17] * 3 + self.codegen_freq[18] * 7; const size = header + lit_enc.bitLength(&self.literal_freq) + dist_enc.bitLength(&self.distance_freq) + extra_bits; return DynamicSize{ .size = @as(u32, @intCast(size)), .num_codegens = @as(u32, @intCast(num_codegens)), }; } // fixedSize returns the size of dynamically encoded data in bits. fn fixedSize(self: *Self, extra_bits: u32) u32 { return 3 + self.fixed_literal_encoding.bitLength(&self.literal_freq) + self.fixed_distance_encoding.bitLength(&self.distance_freq) + extra_bits; } const StoredSize = struct { size: u32, storable: bool, }; // storedSizeFits calculates the stored size, including header. // The function returns the size in bits and whether the block // fits inside a single block. fn storedSizeFits(in: ?[]const u8) StoredSize { if (in == null) { return .{ .size = 0, .storable = false }; } if (in.?.len <= consts.max_store_block_size) { return .{ .size = @as(u32, @intCast((in.?.len + 5) * 8)), .storable = true }; } return .{ .size = 0, .storable = false }; } // Write the header of a dynamic Huffman block to the output stream. // // num_literals: The number of literals specified in codegen // num_distances: The number of distances specified in codegen // num_codegens: The number of codegens used in codegen // eof: Is it the end-of-file? (end of stream) fn dynamicHeader( self: *Self, num_literals: u32, num_distances: u32, num_codegens: u32, eof: bool, ) Error!void { const first_bits: u32 = if (eof) 5 else 4; try self.bit_writer.writeBits(first_bits, 3); try self.bit_writer.writeBits(num_literals - 257, 5); try self.bit_writer.writeBits(num_distances - 1, 5); try self.bit_writer.writeBits(num_codegens - 4, 4); var i: u32 = 0; while (i < num_codegens) : (i += 1) { const value = self.codegen_encoding.codes[codegen_order[i]].len; try self.bit_writer.writeBits(value, 3); } i = 0; while (true) { const code_word: u32 = @as(u32, @intCast(self.codegen[i])); i += 1; if (code_word == end_code_mark) { break; } try self.writeCode(self.codegen_encoding.codes[@as(u32, @intCast(code_word))]); switch (code_word) { 16 => { try self.bit_writer.writeBits(self.codegen[i], 2); i += 1; }, 17 => { try self.bit_writer.writeBits(self.codegen[i], 3); i += 1; }, 18 => { try self.bit_writer.writeBits(self.codegen[i], 7); i += 1; }, else => {}, } } } fn storedHeader(self: *Self, length: usize, eof: bool) Error!void { assert(length <= 65535); const flag: u32 = if (eof) 1 else 0; try self.bit_writer.writeBits(flag, 3); try self.flush(); const l: u16 = @intCast(length); try self.bit_writer.writeBits(l, 16); try self.bit_writer.writeBits(~l, 16); } fn fixedHeader(self: *Self, eof: bool) Error!void { // Indicate that we are a fixed Huffman block var value: u32 = 2; if (eof) { value = 3; } try self.bit_writer.writeBits(value, 3); } // Write a block of tokens with the smallest encoding. Will choose block type. // The original input can be supplied, and if the huffman encoded data // is larger than the original bytes, the data will be written as a // stored block. // If the input is null, the tokens will always be Huffman encoded. pub fn write(self: *Self, tokens: []const Token, eof: bool, input: ?[]const u8) Error!void { const lit_and_dist = self.indexTokens(tokens); const num_literals = lit_and_dist.num_literals; const num_distances = lit_and_dist.num_distances; var extra_bits: u32 = 0; const ret = storedSizeFits(input); const stored_size = ret.size; const storable = ret.storable; if (storable) { // We only bother calculating the costs of the extra bits required by // the length of distance fields (which will be the same for both fixed // and dynamic encoding), if we need to compare those two encodings // against stored encoding. var length_code: u16 = Token.length_codes_start + 8; while (length_code < num_literals) : (length_code += 1) { // First eight length codes have extra size = 0. extra_bits += @as(u32, @intCast(self.literal_freq[length_code])) * @as(u32, @intCast(Token.lengthExtraBits(length_code))); } var distance_code: u16 = 4; while (distance_code < num_distances) : (distance_code += 1) { // First four distance codes have extra size = 0. extra_bits += @as(u32, @intCast(self.distance_freq[distance_code])) * @as(u32, @intCast(Token.distanceExtraBits(distance_code))); } } // Figure out smallest code. // Fixed Huffman baseline. var literal_encoding = &self.fixed_literal_encoding; var distance_encoding = &self.fixed_distance_encoding; var size = self.fixedSize(extra_bits); // Dynamic Huffman? var num_codegens: u32 = 0; // Generate codegen and codegenFrequencies, which indicates how to encode // the literal_encoding and the distance_encoding. self.generateCodegen( num_literals, num_distances, &self.literal_encoding, &self.distance_encoding, ); self.codegen_encoding.generate(self.codegen_freq[0..], 7); const dynamic_size = self.dynamicSize( &self.literal_encoding, &self.distance_encoding, extra_bits, ); const dyn_size = dynamic_size.size; num_codegens = dynamic_size.num_codegens; if (dyn_size < size) { size = dyn_size; literal_encoding = &self.literal_encoding; distance_encoding = &self.distance_encoding; } // Stored bytes? if (storable and stored_size < size) { try self.storedBlock(input.?, eof); return; } // Huffman. if (@intFromPtr(literal_encoding) == @intFromPtr(&self.fixed_literal_encoding)) { try self.fixedHeader(eof); } else { try self.dynamicHeader(num_literals, num_distances, num_codegens, eof); } // Write the tokens. try self.writeTokens(tokens, &literal_encoding.codes, &distance_encoding.codes); } pub fn storedBlock(self: *Self, input: []const u8, eof: bool) Error!void { try self.storedHeader(input.len, eof); try self.bit_writer.writeBytes(input); } // writeBlockDynamic encodes a block using a dynamic Huffman table. // This should be used if the symbols used have a disproportionate // histogram distribution. // If input is supplied and the compression savings are below 1/16th of the // input size the block is stored. fn dynamicBlock( self: *Self, tokens: []const Token, eof: bool, input: ?[]const u8, ) Error!void { const total_tokens = self.indexTokens(tokens); const num_literals = total_tokens.num_literals; const num_distances = total_tokens.num_distances; // Generate codegen and codegenFrequencies, which indicates how to encode // the literal_encoding and the distance_encoding. self.generateCodegen( num_literals, num_distances, &self.literal_encoding, &self.distance_encoding, ); self.codegen_encoding.generate(self.codegen_freq[0..], 7); const dynamic_size = self.dynamicSize(&self.literal_encoding, &self.distance_encoding, 0); const size = dynamic_size.size; const num_codegens = dynamic_size.num_codegens; // Store bytes, if we don't get a reasonable improvement. const stored_size = storedSizeFits(input); const ssize = stored_size.size; const storable = stored_size.storable; if (storable and ssize < (size + (size >> 4))) { try self.storedBlock(input.?, eof); return; } // Write Huffman table. try self.dynamicHeader(num_literals, num_distances, num_codegens, eof); // Write the tokens. try self.writeTokens(tokens, &self.literal_encoding.codes, &self.distance_encoding.codes); } const TotalIndexedTokens = struct { num_literals: u32, num_distances: u32, }; // Indexes a slice of tokens followed by an end_block_marker, and updates // literal_freq and distance_freq, and generates literal_encoding // and distance_encoding. // The number of literal and distance tokens is returned. fn indexTokens(self: *Self, tokens: []const Token) TotalIndexedTokens { var num_literals: u32 = 0; var num_distances: u32 = 0; for (self.literal_freq, 0..) |_, i| { self.literal_freq[i] = 0; } for (self.distance_freq, 0..) |_, i| { self.distance_freq[i] = 0; } for (tokens) |t| { if (t.kind == Token.Kind.literal) { self.literal_freq[t.literal()] += 1; continue; } self.literal_freq[t.lengthCode()] += 1; self.distance_freq[t.distanceCode()] += 1; } // add end_block_marker token at the end self.literal_freq[consts.end_block_marker] += 1; // get the number of literals num_literals = @as(u32, @intCast(self.literal_freq.len)); while (self.literal_freq[num_literals - 1] == 0) { num_literals -= 1; } // get the number of distances num_distances = @as(u32, @intCast(self.distance_freq.len)); while (num_distances > 0 and self.distance_freq[num_distances - 1] == 0) { num_distances -= 1; } if (num_distances == 0) { // We haven't found a single match. If we want to go with the dynamic encoding, // we should count at least one distance to be sure that the distance huffman tree could be encoded. self.distance_freq[0] = 1; num_distances = 1; } self.literal_encoding.generate(&self.literal_freq, 15); self.distance_encoding.generate(&self.distance_freq, 15); return TotalIndexedTokens{ .num_literals = num_literals, .num_distances = num_distances, }; } // Writes a slice of tokens to the output followed by and end_block_marker. // codes for literal and distance encoding must be supplied. fn writeTokens( self: *Self, tokens: []const Token, le_codes: []hc.HuffCode, oe_codes: []hc.HuffCode, ) Error!void { for (tokens) |t| { if (t.kind == Token.Kind.literal) { try self.writeCode(le_codes[t.literal()]); continue; } // Write the length const le = t.lengthEncoding(); try self.writeCode(le_codes[le.code]); if (le.extra_bits > 0) { try self.bit_writer.writeBits(le.extra_length, le.extra_bits); } // Write the distance const oe = t.distanceEncoding(); try self.writeCode(oe_codes[oe.code]); if (oe.extra_bits > 0) { try self.bit_writer.writeBits(oe.extra_distance, oe.extra_bits); } } // add end_block_marker at the end try self.writeCode(le_codes[consts.end_block_marker]); } // Encodes a block of bytes as either Huffman encoded literals or uncompressed bytes // if the results only gains very little from compression. pub fn huffmanBlock(self: *Self, input: []const u8, eof: bool) Error!void { // Add everything as literals histogram(input, &self.literal_freq); self.literal_freq[consts.end_block_marker] = 1; const num_literals = consts.end_block_marker + 1; self.distance_freq[0] = 1; const num_distances = 1; self.literal_encoding.generate(&self.literal_freq, 15); // Figure out smallest code. // Always use dynamic Huffman or Store var num_codegens: u32 = 0; // Generate codegen and codegenFrequencies, which indicates how to encode // the literal_encoding and the distance_encoding. self.generateCodegen( num_literals, num_distances, &self.literal_encoding, &self.huff_distance, ); self.codegen_encoding.generate(self.codegen_freq[0..], 7); const dynamic_size = self.dynamicSize(&self.literal_encoding, &self.huff_distance, 0); const size = dynamic_size.size; num_codegens = dynamic_size.num_codegens; // Store bytes, if we don't get a reasonable improvement. const stored_size_ret = storedSizeFits(input); const ssize = stored_size_ret.size; const storable = stored_size_ret.storable; if (storable and ssize < (size + (size >> 4))) { try self.storedBlock(input, eof); return; } // Huffman. try self.dynamicHeader(num_literals, num_distances, num_codegens, eof); const encoding = self.literal_encoding.codes[0..257]; for (input) |t| { const c = encoding[t]; try self.bit_writer.writeBits(c.code, c.len); } try self.writeCode(encoding[consts.end_block_marker]); } // histogram accumulates a histogram of b in h. fn histogram(b: []const u8, h: *[286]u16) void { // Clear histogram for (h, 0..) |_, i| { h[i] = 0; } var lh = h.*[0..256]; for (b) |t| { lh[t] += 1; } } }; } // tests const expect = std.testing.expect; const fmt = std.fmt; const testing = std.testing; const ArrayList = std.ArrayList; const TestCase = @import("testdata/block_writer.zig").TestCase; const testCases = @import("testdata/block_writer.zig").testCases; // tests if the writeBlock encoding has changed. test "write" { inline for (0..testCases.len) |i| { try testBlock(testCases[i], .write_block); } } // tests if the writeBlockDynamic encoding has changed. test "dynamicBlock" { inline for (0..testCases.len) |i| { try testBlock(testCases[i], .write_dyn_block); } } test "huffmanBlock" { inline for (0..testCases.len) |i| { try testBlock(testCases[i], .write_huffman_block); } try testBlock(.{ .tokens = &[_]Token{}, .input = "huffman-rand-max.input", .want = "huffman-rand-max.{s}.expect", }, .write_huffman_block); } const TestFn = enum { write_block, write_dyn_block, // write dynamic block write_huffman_block, fn to_s(self: TestFn) []const u8 { return switch (self) { .write_block => "wb", .write_dyn_block => "dyn", .write_huffman_block => "huff", }; } fn write( comptime self: TestFn, bw: anytype, tok: []const Token, input: ?[]const u8, final: bool, ) !void { switch (self) { .write_block => try bw.write(tok, final, input), .write_dyn_block => try bw.dynamicBlock(tok, final, input), .write_huffman_block => try bw.huffmanBlock(input.?, final), } try bw.flush(); } }; // testBlock tests a block against its references // // size // 64K [file-name].input - input non compressed file // 8.1K [file-name].golden - // 78 [file-name].dyn.expect - output with writeBlockDynamic // 78 [file-name].wb.expect - output with writeBlock // 8.1K [file-name].huff.expect - output with writeBlockHuff // 78 [file-name].dyn.expect-noinput - output with writeBlockDynamic when input is null // 78 [file-name].wb.expect-noinput - output with writeBlock when input is null // // wb - writeBlock // dyn - writeBlockDynamic // huff - writeBlockHuff // fn testBlock(comptime tc: TestCase, comptime tfn: TestFn) !void { if (tc.input.len != 0 and tc.want.len != 0) { const want_name = comptime fmt.comptimePrint(tc.want, .{tfn.to_s()}); const input = @embedFile("testdata/block_writer/" ++ tc.input); const want = @embedFile("testdata/block_writer/" ++ want_name); try testWriteBlock(tfn, input, want, tc.tokens); } if (tfn == .write_huffman_block) { return; } const want_name_no_input = comptime fmt.comptimePrint(tc.want_no_input, .{tfn.to_s()}); const want = @embedFile("testdata/block_writer/" ++ want_name_no_input); try testWriteBlock(tfn, null, want, tc.tokens); } // Uses writer function `tfn` to write `tokens`, tests that we got `want` as output. fn testWriteBlock(comptime tfn: TestFn, input: ?[]const u8, want: []const u8, tokens: []const Token) !void { var buf = ArrayList(u8).init(testing.allocator); var bw = blockWriter(buf.writer()); try tfn.write(&bw, tokens, input, false); var got = buf.items; try testing.expectEqualSlices(u8, want, got); // expect writeBlock to yield expected result try expect(got[0] & 0b0000_0001 == 0); // bfinal is not set // // Test if the writer produces the same output after reset. buf.deinit(); buf = ArrayList(u8).init(testing.allocator); defer buf.deinit(); bw.setWriter(buf.writer()); try tfn.write(&bw, tokens, input, true); try bw.flush(); got = buf.items; try expect(got[0] & 1 == 1); // bfinal is set buf.items[0] &= 0b1111_1110; // remove bfinal bit, so we can run test slices try testing.expectEqualSlices(u8, want, got); // expect writeBlock to yield expected result }