// This encoding algorithm, which prioritizes speed over output size, is
// based on Snappy's LZ77-style encoder: github.com/golang/snappy

const std = @import("std");
const math = std.math;
const mem = std.mem;

const Allocator = std.mem.Allocator;

const deflate_const = @import("deflate_const.zig");
const deflate = @import("compressor.zig");
const token = @import("token.zig");

const base_match_length = deflate_const.base_match_length;
const base_match_offset = deflate_const.base_match_offset;
const max_match_length = deflate_const.max_match_length;
const max_match_offset = deflate_const.max_match_offset;
const max_store_block_size = deflate_const.max_store_block_size;

const table_bits = 14; // Bits used in the table.
const table_mask = table_size - 1; // Mask for table indices. Redundant, but can eliminate bounds checks.
const table_shift = 32 - table_bits; // Right-shift to get the table_bits most significant bits of a uint32.
const table_size = 1 << table_bits; // Size of the table.

// Reset the buffer offset when reaching this.
// Offsets are stored between blocks as i32 values.
// Since the offset we are checking against is at the beginning
// of the buffer, we need to subtract the current and input
// buffer to not risk overflowing the i32.
const buffer_reset = math.maxInt(i32) - max_store_block_size * 2;

fn load32(b: []u8, i: i32) u32 {
    const s = b[@as(usize, @intCast(i)) .. @as(usize, @intCast(i)) + 4];
    return @as(u32, @intCast(s[0])) |
        @as(u32, @intCast(s[1])) << 8 |
        @as(u32, @intCast(s[2])) << 16 |
        @as(u32, @intCast(s[3])) << 24;
}

fn load64(b: []u8, i: i32) u64 {
    const s = b[@as(usize, @intCast(i))..@as(usize, @intCast(i + 8))];
    return @as(u64, @intCast(s[0])) |
        @as(u64, @intCast(s[1])) << 8 |
        @as(u64, @intCast(s[2])) << 16 |
        @as(u64, @intCast(s[3])) << 24 |
        @as(u64, @intCast(s[4])) << 32 |
        @as(u64, @intCast(s[5])) << 40 |
        @as(u64, @intCast(s[6])) << 48 |
        @as(u64, @intCast(s[7])) << 56;
}

fn hash(u: u32) u32 {
    return (u *% 0x1e35a7bd) >> table_shift;
}
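
// Illustrative sanity check, an addition rather than part of the original
// port: `hash` must map any u32 into [0, table_size), since table_shift
// keeps only the top table_bits bits, and equal 4-byte windows must hash
// identically.
test "hash stays within table bounds" {
    var buf = [_]u8{ 1, 2, 3, 4, 1, 2, 3, 4 };
    const a = hash(load32(&buf, 0));
    const b = hash(load32(&buf, 4));
    try std.testing.expectEqual(a, b);
    try std.testing.expect(a < table_size);
}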

// These constants are defined by the Snappy implementation so that its
// assembly implementation can fast-path some 16-bytes-at-a-time copies.
// They aren't necessary in the pure Go implementation, and may not be
// necessary in Zig, but using the same thresholds doesn't really hurt.
const input_margin = 16 - 1;
const min_non_literal_block_size = 1 + 1 + input_margin;

const TableEntry = struct {
    val: u32, // Value at destination
    offset: i32,
};

pub fn deflateFast() DeflateFast {
    return DeflateFast{
        .table = [_]TableEntry{.{ .val = 0, .offset = 0 }} ** table_size,
        .prev = undefined,
        .prev_len = 0,
        .cur = max_store_block_size,
        .allocator = undefined,
    };
}

// DeflateFast maintains the table for matches,
// and the previous byte block for cross block matching.
pub const DeflateFast = struct {
    table: [table_size]TableEntry,
    prev: []u8, // Previous block, zero length if unknown.
    prev_len: u32, // Previous block length
    cur: i32, // Current match offset.
    allocator: Allocator,

    const Self = @This();

    pub fn init(self: *Self, allocator: Allocator) !void {
        self.allocator = allocator;
        self.prev = try allocator.alloc(u8, max_store_block_size);
        self.prev_len = 0;
    }

    pub fn deinit(self: *Self) void {
        self.allocator.free(self.prev);
        self.prev_len = 0;
    }

    // Encodes a block given in `src` and appends tokens to `dst`, advancing
    // `tokens_count` by the number of tokens written.
    pub fn encode(self: *Self, dst: []token.Token, tokens_count: *u16, src: []u8) void {
        // Ensure that self.cur doesn't wrap.
        if (self.cur >= buffer_reset) {
            self.shiftOffsets();
        }

        // This check isn't in the Snappy implementation, but there, the caller
        // instead of the callee handles this case.
        if (src.len < min_non_literal_block_size) {
            self.cur += max_store_block_size;
            self.prev_len = 0;
            emitLiteral(dst, tokens_count, src);
            return;
        }

        // s_limit is when to stop looking for offset/length copies. The input_margin
        // lets us use a fast path for emitLiteral in the main loop, while we are
        // looking for copies.
        const s_limit = @as(i32, @intCast(src.len - input_margin));

        // next_emit is where in src the next emitLiteral should start from.
        var next_emit: i32 = 0;
        var s: i32 = 0;
        var cv: u32 = load32(src, s);
        var next_hash: u32 = hash(cv);

        outer: while (true) {
            // Copied from the C++ snappy implementation:
            //
            // Heuristic match skipping: If 32 bytes are scanned with no matches
            // found, start looking only at every other byte. If 32 more bytes are
            // scanned (or skipped), look at every third byte, etc.. When a match
            // is found, immediately go back to looking at every byte. This is a
            // small loss (~5% performance, ~0.1% density) for compressible data
            // due to more bookkeeping, but for non-compressible data (such as
            // JPEG) it's a huge win since the compressor quickly "realizes" the
            // data is incompressible and doesn't bother looking for matches
            // everywhere.
            //
            // The "skip" variable keeps track of how many bytes there are since
            // the last match; dividing it by 32 (ie. right-shifting by five) gives
            // the number of bytes to move ahead for each iteration.
            var skip: i32 = 32;

            var next_s: i32 = s;
            var candidate: TableEntry = undefined;
            while (true) {
                s = next_s;
                const bytes_between_hash_lookups = skip >> 5;
                next_s = s + bytes_between_hash_lookups;
                skip += bytes_between_hash_lookups;
                if (next_s > s_limit) {
                    break :outer;
                }
                candidate = self.table[next_hash & table_mask];
                const now = load32(src, next_s);
                self.table[next_hash & table_mask] = .{ .offset = s + self.cur, .val = cv };
                next_hash = hash(now);

                const offset = s - (candidate.offset - self.cur);
                if (offset > max_match_offset or cv != candidate.val) {
                    // Out of range or not matched.
                    cv = now;
                    continue;
                }
                break;
            }

            // A 4-byte match has been found. We'll later see if more than 4 bytes
            // match. But, prior to the match, src[next_emit..s] are unmatched. Emit
            // them as literal bytes.
            emitLiteral(dst, tokens_count, src[@as(usize, @intCast(next_emit))..@as(usize, @intCast(s))]);

            // Call emitCopy, and then see if another emitCopy could be our next
            // move. Repeat until we find no match for the input immediately after
            // what was consumed by the last emitCopy call.
            //
            // If we exit this loop normally then we need to call emitLiteral next,
            // though we don't yet know how big the literal will be. We handle that
            // by proceeding to the next iteration of the main loop. We also can
            // exit this loop via goto if we get close to exhausting the input.
            while (true) {
                // Invariant: we have a 4-byte match at s, and no need to emit any
                // literal bytes prior to s.

                // Extend the 4-byte match as long as possible.
                s += 4;
                const t = candidate.offset - self.cur + 4;
                const l = self.matchLen(s, t, src);

                // matchToken is flate's equivalent of Snappy's emitCopy. (length, offset)
                dst[tokens_count.*] = token.matchToken(
                    @as(u32, @intCast(l + 4 - base_match_length)),
                    @as(u32, @intCast(s - t - base_match_offset)),
                );
                tokens_count.* += 1;
                s += l;
                next_emit = s;
                if (s >= s_limit) {
                    break :outer;
                }

                // We could immediately start working at s now, but to improve
                // compression we first update the hash table at s-1 and at s. If
                // another emitCopy is not our next move, also calculate next_hash
                // at s+1. At least on amd64 architecture, these three hash calculations
                // are faster as one load64 call (with some shifts) instead of
                // three load32 calls.
                var x = load64(src, s - 1);
                const prev_hash = hash(@as(u32, @truncate(x)));
                self.table[prev_hash & table_mask] = TableEntry{
                    .offset = self.cur + s - 1,
                    .val = @as(u32, @truncate(x)),
                };
                x >>= 8;
                const curr_hash = hash(@as(u32, @truncate(x)));
                candidate = self.table[curr_hash & table_mask];
                self.table[curr_hash & table_mask] = TableEntry{
                    .offset = self.cur + s,
                    .val = @as(u32, @truncate(x)),
                };

                const offset = s - (candidate.offset - self.cur);
                if (offset > max_match_offset or @as(u32, @truncate(x)) != candidate.val) {
                    cv = @as(u32, @truncate(x >> 8));
                    next_hash = hash(cv);
                    s += 1;
                    break;
                }
            }
        }

        if (@as(u32, @intCast(next_emit)) < src.len) {
            emitLiteral(dst, tokens_count, src[@as(usize, @intCast(next_emit))..]);
        }
        self.cur += @as(i32, @intCast(src.len));
        self.prev_len = @as(u32, @intCast(src.len));
        @memcpy(self.prev[0..self.prev_len], src);
        return;
    }
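
    // Emits every byte of `lit` as a literal token, advancing tokens_count.
    // (Added note: callers must size `dst` so that, in the worst case of zero
    // matches, one literal token per input byte fits.)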
    fn emitLiteral(dst: []token.Token, tokens_count: *u16, lit: []u8) void {
        for (lit) |v| {
            dst[tokens_count.*] = token.literalToken(@as(u32, @intCast(v)));
            tokens_count.* += 1;
        }
        return;
    }

    // matchLen returns the match length between src[s..] and src[t..].
    // t can be negative to indicate the match is starting in self.prev.
    // We assume that src[s-4 .. s] and src[t-4 .. t] already match.
    fn matchLen(self: *Self, s: i32, t: i32, src: []u8) i32 {
        var s1 = @as(u32, @intCast(s)) + max_match_length - 4;
        if (s1 > src.len) {
            s1 = @as(u32, @intCast(src.len));
        }

        // If we are inside the current block
        if (t >= 0) {
            var b = src[@as(usize, @intCast(t))..];
            const a = src[@as(usize, @intCast(s))..@as(usize, @intCast(s1))];
            b = b[0..a.len];
            // Extend the match to be as long as possible.
            for (a, 0..) |_, i| {
                if (a[i] != b[i]) {
                    return @as(i32, @intCast(i));
                }
            }
            return @as(i32, @intCast(a.len));
        }

        // We found a match in the previous block.
        const tp = @as(i32, @intCast(self.prev_len)) + t;
        if (tp < 0) {
            return 0;
        }

        // Extend the match to be as long as possible.
        var a = src[@as(usize, @intCast(s))..@as(usize, @intCast(s1))];
        var b = self.prev[@as(usize, @intCast(tp))..@as(usize, @intCast(self.prev_len))];
        if (b.len > a.len) {
            b = b[0..a.len];
        }
        a = a[0..b.len];
        for (b, 0..) |_, i| {
            if (a[i] != b[i]) {
                return @as(i32, @intCast(i));
            }
        }

        // If we reached our limit, we matched everything we are
        // allowed to in the previous block and we return.
        const n = @as(i32, @intCast(b.len));
        if (@as(u32, @intCast(s + n)) == s1) {
            return n;
        }

        // Continue looking for more matches in the current block.
        a = src[@as(usize, @intCast(s + n))..@as(usize, @intCast(s1))];
        b = src[0..a.len];
        for (a, 0..) |_, i| {
            if (a[i] != b[i]) {
                return @as(i32, @intCast(i)) + n;
            }
        }
        return @as(i32, @intCast(a.len)) + n;
    }

    // Reset resets the encoding history.
    // This ensures that no matches are made to the previous block.
    pub fn reset(self: *Self) void {
        self.prev_len = 0;
        // Bump the offset, so all matches will fail distance check.
        // Nothing should be >= self.cur in the table.
        self.cur += max_match_offset;

        // Protect against self.cur wraparound.
        if (self.cur >= buffer_reset) {
            self.shiftOffsets();
        }
    }

    // shiftOffsets will shift down all match offsets.
    // This is only called in rare situations to prevent integer overflow.
    //
    // See https://golang.org/issue/18636 and https://golang.org/issues/34121.
    fn shiftOffsets(self: *Self) void {
        if (self.prev_len == 0) {
            // We have no history; just clear the table.
            for (self.table, 0..) |_, i| {
                self.table[i] = TableEntry{ .val = 0, .offset = 0 };
            }
            self.cur = max_match_offset + 1;
            return;
        }

        // Shift down everything in the table that isn't already too far away.
        for (self.table, 0..) |_, i| {
            var v = self.table[i].offset - self.cur + max_match_offset + 1;
            if (v < 0) {
                // We want to reset self.cur to max_match_offset + 1, so we need to shift
                // all table entries down by (self.cur - (max_match_offset + 1)).
                // Because we ignore matches > max_match_offset, we can cap
                // any negative offsets at 0.
                v = 0;
            }
            self.table[i].offset = v;
        }
        self.cur = max_match_offset + 1;
    }
};
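
// Illustrative usage sketch, an addition rather than part of the original
// port: inputs shorter than min_non_literal_block_size take the literal-only
// path in encode, so `dst` must hold at least one token per input byte.
test "encode short input emits only literals" {
    var e = deflateFast();
    try e.init(std.testing.allocator);
    defer e.deinit();

    // Three bytes is below min_non_literal_block_size (17), so the match
    // search is skipped entirely.
    var src = [_]u8{ 1, 2, 3 };
    var tokens = [_]token.Token{0} ** 3;
    var tokens_count: u16 = 0;
    e.encode(&tokens, &tokens_count, &src);
    try std.testing.expectEqual(@as(u16, 3), tokens_count);
}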

test "best speed match 1/3" {
    const expectEqual = std.testing.expectEqual;
    {
        var previous = [_]u8{ 0, 0, 0, 1, 2 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 3, 4, 5, 0, 1, 2, 3, 4, 5 };
        const got: i32 = e.matchLen(3, -3, &current);
        try expectEqual(@as(i32, 6), got);
    }
    {
        var previous = [_]u8{ 0, 0, 0, 1, 2 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 2, 4, 5, 0, 1, 2, 3, 4, 5 };
        const got: i32 = e.matchLen(3, -3, &current);
        try expectEqual(@as(i32, 3), got);
    }
    {
        var previous = [_]u8{ 0, 0, 0, 1, 1 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 3, 4, 5, 0, 1, 2, 3, 4, 5 };
        const got: i32 = e.matchLen(3, -3, &current);
        try expectEqual(@as(i32, 2), got);
    }
    {
        var previous = [_]u8{ 0, 0, 0, 1, 2 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 2, 2, 2, 2, 1, 2, 3, 4, 5 };
        const got: i32 = e.matchLen(0, -1, &current);
        try expectEqual(@as(i32, 4), got);
    }
    {
        var previous = [_]u8{ 0, 0, 0, 1, 2, 3, 4, 5, 2, 2 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 2, 2, 2, 2, 1, 2, 3, 4, 5 };
        const got: i32 = e.matchLen(4, -7, &current);
        try expectEqual(@as(i32, 5), got);
    }
    {
        var previous = [_]u8{ 9, 9, 9, 9, 9 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 2, 2, 2, 2, 1, 2, 3, 4, 5 };
        const got: i32 = e.matchLen(0, -1, &current);
        try expectEqual(@as(i32, 0), got);
    }
    {
        var previous = [_]u8{ 9, 9, 9, 9, 9 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 9, 2, 2, 2, 1, 2, 3, 4, 5 };
        const got: i32 = e.matchLen(1, 0, &current);
        try expectEqual(@as(i32, 0), got);
    }
}

test "best speed match 2/3" {
    const expectEqual = std.testing.expectEqual;
    {
        var previous = [_]u8{};
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 9, 2, 2, 2, 1, 2, 3, 4, 5 };
        const got: i32 = e.matchLen(1, -5, &current);
        try expectEqual(@as(i32, 0), got);
    }
    {
        var previous = [_]u8{};
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 9, 2, 2, 2, 1, 2, 3, 4, 5 };
        const got: i32 = e.matchLen(1, -1, &current);
        try expectEqual(@as(i32, 0), got);
    }
    {
        var previous = [_]u8{};
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 2, 2, 2, 2, 1, 2, 3, 4, 5 };
        const got: i32 = e.matchLen(1, 0, &current);
        try expectEqual(@as(i32, 3), got);
    }
    {
        var previous = [_]u8{ 3, 4, 5 };
        var e = DeflateFast{
            .prev = &previous,
            .prev_len = previous.len,
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        var current = [_]u8{ 3, 4, 5 };
        const got: i32 = e.matchLen(0, -3, &current);
        try expectEqual(@as(i32, 3), got);
    }
}

test "best speed match 3/3" {
    const testing = std.testing;
    const expectEqual = testing.expectEqual;

    const Case = struct {
        previous: u32,
        current: u32,
        s: i32,
        t: i32,
        expected: i32,
    };

    const cases = [_]Case{
        .{
            .previous = 1000,
            .current = 1000,
            .s = 0,
            .t = -1000,
            .expected = max_match_length - 4,
        },
        .{
            .previous = 200,
            .current = 500,
            .s = 0,
            .t = -200,
            .expected = max_match_length - 4,
        },
        .{
            .previous = 200,
            .current = 500,
            .s = 1,
            .t = 0,
            .expected = max_match_length - 4,
        },
        .{
            .previous = max_match_length - 4,
            .current = 500,
            .s = 0,
            .t = -(max_match_length - 4),
            .expected = max_match_length - 4,
        },
        .{
            .previous = 200,
            .current = 500,
            .s = 400,
            .t = -200,
            .expected = 100,
        },
        .{
            .previous = 10,
            .current = 500,
            .s = 400,
            .t = 200,
            .expected = 100,
        },
    };

    for (cases) |c| {
        const previous = try testing.allocator.alloc(u8, c.previous);
        defer testing.allocator.free(previous);
        @memset(previous, 0);

        const current = try testing.allocator.alloc(u8, c.current);
        defer testing.allocator.free(current);
        @memset(current, 0);

        var e = DeflateFast{
            .prev = previous,
            .prev_len = @as(u32, @intCast(previous.len)),
            .table = undefined,
            .allocator = undefined,
            .cur = 0,
        };
        const got: i32 = e.matchLen(c.s, c.t, current);
        try expectEqual(@as(i32, c.expected), got);
    }
}
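
// Illustrative sketch, an addition rather than part of the original port:
// `reset` bumps `cur` by max_match_offset, so stale table entries fail the
// distance check and a second pass over the same bytes finds no cross-block
// matches, emitting exactly as many tokens as a clean first pass.
test "reset prevents cross-block matches" {
    const testing = std.testing;
    var enc = deflateFast();
    try enc.init(testing.allocator);
    defer enc.deinit();

    // Same non-self-matching bytes as in "best speed shift offsets" below.
    var data = [32]u8{
        0xf5, 0x25, 0xf2, 0x55, 0xf6, 0xc1, 0x1f, 0x0b,
        0x10, 0xa1, 0xd0, 0x77, 0x56, 0x38, 0xf1, 0x9c,
        0x7f, 0x85, 0xc5, 0xbd, 0x16, 0x28, 0xd4, 0xf9,
        0x03, 0xd4, 0xc0, 0xa1, 0x1e, 0x58, 0x5b, 0xc9,
    };
    var tokens = [_]token.Token{0} ** 32;
    var tokens_count: u16 = 0;

    enc.encode(&tokens, &tokens_count, &data);
    const first = tokens_count;

    // Without this reset, the second encode would match against enc.prev
    // and emit fewer tokens (see the test below).
    enc.reset();

    tokens_count = 0;
    enc.encode(&tokens, &tokens_count, &data);
    try testing.expectEqual(first, tokens_count);
}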

test "best speed shift offsets" {
    const testing = std.testing;
    const expect = std.testing.expect;

    // Test if shiftOffsets properly preserves matches and resets out-of-range matches
    // seen in https://github.com/golang/go/issues/4142
    var enc = deflateFast();
    try enc.init(testing.allocator);
    defer enc.deinit();

    // test_data must not generate internal matches.
    var test_data = [32]u8{
        0xf5, 0x25, 0xf2, 0x55, 0xf6, 0xc1, 0x1f, 0x0b,
        0x10, 0xa1, 0xd0, 0x77, 0x56, 0x38, 0xf1, 0x9c,
        0x7f, 0x85, 0xc5, 0xbd, 0x16, 0x28, 0xd4, 0xf9,
        0x03, 0xd4, 0xc0, 0xa1, 0x1e, 0x58, 0x5b, 0xc9,
    };

    var tokens = [_]token.Token{0} ** 32;
    var tokens_count: u16 = 0;

    // Encode the testdata with clean state.
    // Second part should pick up matches from the first block.
    tokens_count = 0;
    enc.encode(&tokens, &tokens_count, &test_data);
    const want_first_tokens = tokens_count;
    tokens_count = 0;
    enc.encode(&tokens, &tokens_count, &test_data);
    const want_second_tokens = tokens_count;

    try expect(want_first_tokens > want_second_tokens);

    // Forward the current indicator to before wraparound.
    enc.cur = buffer_reset - @as(i32, @intCast(test_data.len));

    // Part 1 before wrap, should match clean state.
    tokens_count = 0;
    enc.encode(&tokens, &tokens_count, &test_data);
    var got = tokens_count;
    try testing.expectEqual(want_first_tokens, got);

    // Verify we are about to wrap.
    try testing.expectEqual(@as(i32, buffer_reset), enc.cur);

    // Part 2 should match clean state as well even if wrapped.
    tokens_count = 0;
    enc.encode(&tokens, &tokens_count, &test_data);
    got = tokens_count;
    try testing.expectEqual(want_second_tokens, got);

    // Verify that we wrapped.
    try expect(enc.cur < buffer_reset);

    // Forward the current buffer, leaving the matches at the bottom.
    enc.cur = buffer_reset;
    enc.shiftOffsets();

    // Ensure that no matches were picked up.
    tokens_count = 0;
    enc.encode(&tokens, &tokens_count, &test_data);
    got = tokens_count;
    try testing.expectEqual(want_first_tokens, got);
}

test "best speed reset" {
    // Test that encoding is consistent across a wraparound of the table offset.
    // See https://github.com/golang/go/issues/34121
    const fmt = std.fmt;
    const testing = std.testing;
    const ArrayList = std.ArrayList;

    const input_size = 65536;
    const input = try testing.allocator.alloc(u8, input_size);
    defer testing.allocator.free(input);

    var i: usize = 0;
    while (i < input_size) : (i += 1) {
        _ = try fmt.bufPrint(input, "asdfasdfasdfasdf{d}{d}fghfgujyut{d}yutyu\n", .{ i, i, i });
    }

    // This is specific to level 1 (best_speed).
    const level = .best_speed;
    const offset: usize = 1;

    // We do an encode with a clean buffer to compare.
    var want = ArrayList(u8).init(testing.allocator);
    defer want.deinit();
    var clean_comp = try deflate.compressor(
        testing.allocator,
        want.writer(),
        .{ .level = level },
    );
    defer clean_comp.deinit();

    // Write 3 times, close.
    try clean_comp.writer().writeAll(input);
    try clean_comp.writer().writeAll(input);
    try clean_comp.writer().writeAll(input);
    try clean_comp.close();

    var o = offset;
    while (o <= 256) : (o *= 2) {
        var discard = ArrayList(u8).init(testing.allocator);
        defer discard.deinit();

        var comp = try deflate.compressor(
            testing.allocator,
            discard.writer(),
            .{ .level = level },
        );
        defer comp.deinit();

        // Reset until we are right before the wraparound.
        // Each reset adds max_match_offset to the offset.
        i = 0;
        const limit = (buffer_reset - input.len - o - max_match_offset) / max_match_offset;
        while (i < limit) : (i += 1) {
            // skip ahead to where we are close to wrap around...
            comp.reset(discard.writer());
        }

        var got = ArrayList(u8).init(testing.allocator);
        defer got.deinit();
        comp.reset(got.writer());

        // Write 3 times, close.
        try comp.writer().writeAll(input);
        try comp.writer().writeAll(input);
        try comp.writer().writeAll(input);
        try comp.close();

        // Output must match at wraparound.
        try testing.expectEqualSlices(u8, want.items, got.items);
    }
}
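
// Illustrative sketch, an addition rather than part of the original port:
// with no history, shiftOffsets clears the table outright and pins `cur`
// just past max_match_offset, so nothing can pass the distance check
// afterwards.
test "shiftOffsets without history clears the table" {
    var enc = deflateFast();
    try enc.init(std.testing.allocator);
    defer enc.deinit();

    enc.prev_len = 0;
    enc.cur = buffer_reset;
    enc.shiftOffsets();

    try std.testing.expectEqual(@as(i32, max_match_offset + 1), enc.cur);
    for (enc.table) |entry| {
        try std.testing.expectEqual(@as(i32, 0), entry.offset);
        try std.testing.expectEqual(@as(u32, 0), entry.val);
    }
}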