zig/lib/std/compress/deflate/dict_decoder.zig
2023-11-19 09:55:07 +00:00

424 lines
18 KiB
Zig

const std = @import("std");
const assert = std.debug.assert;
const mem = std.mem;
const Allocator = std.mem.Allocator;
// Implements the LZ77 sliding dictionary as used in decompression.
// LZ77 decompresses data through sequences of two forms of commands:
//
// * Literal insertions: Runs of one or more symbols are inserted into the data
// stream as is. This is accomplished through the writeByte method for a
// single symbol, or combinations of writeSlice/writeMark for multiple symbols.
// Any valid stream must start with a literal insertion if no preset dictionary
// is used.
//
// * Backward copies: Runs of one or more symbols are copied from previously
// emitted data. Backward copies come as the tuple (dist, length) where dist
// determines how far back in the stream to copy from and length determines how
// many bytes to copy. Note that it is valid for the length to be greater than
// the distance. Since LZ77 uses forward copies, that situation is used to
// perform a form of run-length encoding on repeated runs of symbols.
// The writeCopy and tryWriteCopy methods are used to implement this command.
//
// For performance reasons, this implementation performs little to no sanity
// checks about the arguments. As such, the invariants documented for each
// method call must be respected.
pub const DictDecoder = struct {
    const Self = @This();

    // Allocator that owns `hist`; stored by init and used again by deinit.
    allocator: Allocator = undefined,

    // Sliding window history.
    hist: []u8 = undefined,

    // Invariant: 0 <= rd_pos <= wr_pos <= hist.len
    wr_pos: u32 = 0, // Current output position in buffer
    rd_pos: u32 = 0, // Have emitted hist[0..rd_pos] already
    full: bool = false, // Has a full window length been written yet?

    // init initializes DictDecoder to have a sliding window dictionary of the given
    // size. If a preset dict is provided, it will initialize the dictionary with
    // the contents of dict. When dict is longer than the window, only its
    // trailing `size` bytes are kept.
    //
    // Returns an error if the window cannot be allocated. On success the caller
    // must eventually call deinit to release the window.
    pub fn init(self: *Self, allocator: Allocator, size: u32, dict: ?[]const u8) !void {
        self.allocator = allocator;
        self.hist = try allocator.alloc(u8, size);
        self.wr_pos = 0;
        self.full = false;

        if (dict) |preset| {
            // The saturating subtraction selects the last hist.len bytes of an
            // oversized dictionary, or the whole dictionary when it fits.
            const src = preset[preset.len -| self.hist.len..];
            @memcpy(self.hist[0..src.len], src);
            // Advance by the number of bytes actually stored, not by the
            // dictionary length, so that wr_pos never exceeds hist.len.
            self.wr_pos = @as(u32, @intCast(src.len));
        }

        // A dictionary that fills the window exactly leaves the write position
        // at the end of the buffer; wrap it and record that the window is full.
        if (self.wr_pos == self.hist.len) {
            self.wr_pos = 0;
            self.full = true;
        }

        // The preset dictionary is history only; none of it is pending output.
        self.rd_pos = self.wr_pos;
    }

    // Releases the window allocated by init.
    pub fn deinit(self: *Self) void {
        self.allocator.free(self.hist);
    }

    // Reports the total amount of historical data in the dictionary.
    pub fn histSize(self: *Self) u32 {
        if (self.full) {
            return @as(u32, @intCast(self.hist.len));
        }
        return self.wr_pos;
    }

    // Reports the number of bytes that can be flushed by readFlush.
    pub fn availRead(self: *Self) u32 {
        return self.wr_pos - self.rd_pos;
    }

    // Reports the available amount of output buffer space.
    pub fn availWrite(self: *Self) u32 {
        return @as(u32, @intCast(self.hist.len - self.wr_pos));
    }

    // Returns a slice of the available buffer to write data to.
    //
    // This invariant will be kept: s.len <= availWrite()
    pub fn writeSlice(self: *Self) []u8 {
        return self.hist[self.wr_pos..];
    }

    // Advances the writer pointer by `count`.
    //
    // This invariant must be kept: 0 <= count <= availWrite()
    pub fn writeMark(self: *Self, count: u32) void {
        // `count` is unsigned, so only the upper bound needs checking.
        assert(count <= self.availWrite());
        self.wr_pos += count;
    }

    // Writes a single byte to the dictionary.
    //
    // This invariant must be kept: 0 < availWrite()
    pub fn writeByte(self: *Self, byte: u8) void {
        self.hist[self.wr_pos] = byte;
        self.wr_pos += 1;
    }

    /// TODO: eliminate this function because the callsites should care about whether
    /// or not their arguments alias and then they should directly call `@memcpy` or
    /// `mem.copyForwards`.
    ///
    /// Copies as many leading bytes of `src` as fit into `dst`, front to back,
    /// and returns the number of bytes copied.
    fn copy(dst: []u8, src: []const u8) u32 {
        const n = @min(dst.len, src.len);
        mem.copyForwards(u8, dst[0..n], src[0..n]);
        return @as(u32, @intCast(n));
    }

    // Copies a string at a given (dist, length) to the output.
    // This returns the number of bytes copied and may be less than the requested
    // length if the available space in the output buffer is too small.
    //
    // This invariant must be kept: 0 < dist <= histSize()
    pub fn writeCopy(self: *Self, dist: u32, length: u32) u32 {
        assert(0 < dist and dist <= self.histSize());

        const dst_base = self.wr_pos;
        var dst_pos = dst_base;
        var src_pos: i32 = @as(i32, @intCast(dst_pos)) - @as(i32, @intCast(dist));

        // Clamp the copy to the end of the window. The saturating add keeps an
        // oversized `length` from overflowing before the clamp is applied.
        var end_pos = dst_pos +| length;
        if (end_pos > self.hist.len) {
            end_pos = @as(u32, @intCast(self.hist.len));
        }

        // Copy non-overlapping section after destination position.
        //
        // This section is non-overlapping in that the copy length for this section
        // is always less than or equal to the backwards distance. This can occur
        // if a distance refers to data that wraps around in the buffer.
        // Thus, a backwards copy is performed here; that is, the exact bytes in
        // the source prior to the copy are placed in the destination.
        if (src_pos < 0) {
            src_pos += @as(i32, @intCast(self.hist.len));
            dst_pos += copy(self.hist[dst_pos..end_pos], self.hist[@as(usize, @intCast(src_pos))..]);
            // The source ran off the end of the window; continue from its start.
            src_pos = 0;
        }

        // Copy possibly overlapping section before destination position.
        //
        // This section can overlap if the copy length for this section is larger
        // than the backwards distance. This is allowed by LZ77 so that repeated
        // strings can be succinctly represented using (dist, length) pairs.
        // Thus, a forwards copy is performed here; that is, the bytes copied are
        // possibly dependent on the resulting bytes in the destination as the copy
        // progresses along. This is functionally equivalent to the following:
        //
        //    var i = 0;
        //    while(i < end_pos - dst_pos) : (i+=1) {
        //        self.hist[dst_pos+i] = self.hist[src_pos+i];
        //    }
        //    dst_pos = end_pos;
        //
        while (dst_pos < end_pos) {
            dst_pos += copy(self.hist[dst_pos..end_pos], self.hist[@as(usize, @intCast(src_pos))..dst_pos]);
        }

        self.wr_pos = dst_pos;
        return dst_pos - dst_base;
    }

    // Tries to copy a string at a given (distance, length) to the
    // output. This specialized version is optimized for short distances.
    //
    // Returns 0 without writing anything when the source would start before the
    // beginning of the buffer or the copy would not fit in the remaining
    // space; the caller is then expected to fall back to writeCopy.
    //
    // This method is designed to be inlined for performance reasons.
    //
    // This invariant must be kept: 0 < dist <= histSize()
    pub fn tryWriteCopy(self: *Self, dist: u32, length: u32) u32 {
        const dst_base = self.wr_pos;

        // Comparing against the remaining space (rather than computing
        // dst_base + length first) keeps the check free of integer overflow.
        if (dst_base < dist or length > self.hist.len - dst_base) {
            return 0;
        }

        const end_pos = dst_base + length;
        const src_pos = dst_base - dist;
        var dst_pos = dst_base;

        // Copy possibly overlapping section before destination position.
        // Each pass copies at most the bytes already present between src_pos
        // and dst_pos, so a repeated pattern grows as the copy progresses.
        while (dst_pos < end_pos) {
            dst_pos += copy(self.hist[dst_pos..end_pos], self.hist[src_pos..dst_pos]);
        }

        self.wr_pos = dst_pos;
        return dst_pos - dst_base;
    }

    // Returns a slice of the historical buffer that is ready to be
    // emitted to the user. The data returned by readFlush must be fully consumed
    // before calling any other DictDecoder methods.
    pub fn readFlush(self: *Self) []u8 {
        const to_read = self.hist[self.rd_pos..self.wr_pos];
        self.rd_pos = self.wr_pos;

        // Once the window has been filled to the end, wrap both positions back
        // to the start and remember that a full window of history exists.
        if (self.wr_pos == self.hist.len) {
            self.wr_pos = 0;
            self.rd_pos = 0;
            self.full = true;
        }
        return to_read;
    }
};
// tests
// Drives a DictDecoder with a mix of literal insertions and backward copies,
// collecting everything it flushes in `got_list`, and compares that against
// the expected byte stream built independently in `want_list`.
test "dictionary decoder" {
    const ArrayList = std.ArrayList;
    const testing = std.testing;

    const abc = "ABC\n";
    const fox = "The quick brown fox jumped over the lazy dog!\n";
    const poem: []const u8 =
        \\The Road Not Taken
        \\Robert Frost
        \\
        \\Two roads diverged in a yellow wood,
        \\And sorry I could not travel both
        \\And be one traveler, long I stood
        \\And looked down one as far as I could
        \\To where it bent in the undergrowth;
        \\
        \\Then took the other, as just as fair,
        \\And having perhaps the better claim,
        \\Because it was grassy and wanted wear;
        \\Though as for that the passing there
        \\Had worn them really about the same,
        \\
        \\And both that morning equally lay
        \\In leaves no step had trodden black.
        \\Oh, I kept the first for another day!
        \\Yet knowing how way leads on to way,
        \\I doubted if I should ever come back.
        \\
        \\I shall be telling this with a sigh
        \\Somewhere ages and ages hence:
        \\Two roads diverged in a wood, and I-
        \\I took the one less traveled by,
        \\And that has made all the difference.
        \\
    ;

    const uppercase: []const u8 =
        \\THE ROAD NOT TAKEN
        \\ROBERT FROST
        \\
        \\TWO ROADS DIVERGED IN A YELLOW WOOD,
        \\AND SORRY I COULD NOT TRAVEL BOTH
        \\AND BE ONE TRAVELER, LONG I STOOD
        \\AND LOOKED DOWN ONE AS FAR AS I COULD
        \\TO WHERE IT BENT IN THE UNDERGROWTH;
        \\
        \\THEN TOOK THE OTHER, AS JUST AS FAIR,
        \\AND HAVING PERHAPS THE BETTER CLAIM,
        \\BECAUSE IT WAS GRASSY AND WANTED WEAR;
        \\THOUGH AS FOR THAT THE PASSING THERE
        \\HAD WORN THEM REALLY ABOUT THE SAME,
        \\
        \\AND BOTH THAT MORNING EQUALLY LAY
        \\IN LEAVES NO STEP HAD TRODDEN BLACK.
        \\OH, I KEPT THE FIRST FOR ANOTHER DAY!
        \\YET KNOWING HOW WAY LEADS ON TO WAY,
        \\I DOUBTED IF I SHOULD EVER COME BACK.
        \\
        \\I SHALL BE TELLING THIS WITH A SIGH
        \\SOMEWHERE AGES AND AGES HENCE:
        \\TWO ROADS DIVERGED IN A WOOD, AND I-
        \\I TOOK THE ONE LESS TRAVELED BY,
        \\AND THAT HAS MADE ALL THE DIFFERENCE.
        \\
    ;

    const PoemRefs = struct {
        dist: u32, // Backward distance (0 if this is an insertion)
        length: u32, // Length of copy or insertion
    };

    // The poem expressed as a sequence of literal insertions and backward
    // copies; replaying it through the decoder must reproduce `poem`.
    const poem_refs = [_]PoemRefs{
        .{ .dist = 0, .length = 38 },  .{ .dist = 33, .length = 3 },   .{ .dist = 0, .length = 48 },
        .{ .dist = 79, .length = 3 },  .{ .dist = 0, .length = 11 },   .{ .dist = 34, .length = 5 },
        .{ .dist = 0, .length = 6 },   .{ .dist = 23, .length = 7 },   .{ .dist = 0, .length = 8 },
        .{ .dist = 50, .length = 3 },  .{ .dist = 0, .length = 2 },    .{ .dist = 69, .length = 3 },
        .{ .dist = 34, .length = 5 },  .{ .dist = 0, .length = 4 },    .{ .dist = 97, .length = 3 },
        .{ .dist = 0, .length = 4 },   .{ .dist = 43, .length = 5 },   .{ .dist = 0, .length = 6 },
        .{ .dist = 7, .length = 4 },   .{ .dist = 88, .length = 7 },   .{ .dist = 0, .length = 12 },
        .{ .dist = 80, .length = 3 },  .{ .dist = 0, .length = 2 },    .{ .dist = 141, .length = 4 },
        .{ .dist = 0, .length = 1 },   .{ .dist = 196, .length = 3 },  .{ .dist = 0, .length = 3 },
        .{ .dist = 157, .length = 3 }, .{ .dist = 0, .length = 6 },    .{ .dist = 181, .length = 3 },
        .{ .dist = 0, .length = 2 },   .{ .dist = 23, .length = 3 },   .{ .dist = 77, .length = 3 },
        .{ .dist = 28, .length = 5 },  .{ .dist = 128, .length = 3 },  .{ .dist = 110, .length = 4 },
        .{ .dist = 70, .length = 3 },  .{ .dist = 0, .length = 4 },    .{ .dist = 85, .length = 6 },
        .{ .dist = 0, .length = 2 },   .{ .dist = 182, .length = 6 },  .{ .dist = 0, .length = 4 },
        .{ .dist = 133, .length = 3 }, .{ .dist = 0, .length = 7 },    .{ .dist = 47, .length = 5 },
        .{ .dist = 0, .length = 20 },  .{ .dist = 112, .length = 5 },  .{ .dist = 0, .length = 1 },
        .{ .dist = 58, .length = 3 },  .{ .dist = 0, .length = 8 },    .{ .dist = 59, .length = 3 },
        .{ .dist = 0, .length = 4 },   .{ .dist = 173, .length = 3 },  .{ .dist = 0, .length = 5 },
        .{ .dist = 114, .length = 3 }, .{ .dist = 0, .length = 4 },    .{ .dist = 92, .length = 5 },
        .{ .dist = 0, .length = 2 },   .{ .dist = 71, .length = 3 },   .{ .dist = 0, .length = 2 },
        .{ .dist = 76, .length = 5 },  .{ .dist = 0, .length = 1 },    .{ .dist = 46, .length = 3 },
        .{ .dist = 96, .length = 4 },  .{ .dist = 130, .length = 4 },  .{ .dist = 0, .length = 3 },
        .{ .dist = 360, .length = 3 }, .{ .dist = 0, .length = 3 },    .{ .dist = 178, .length = 5 },
        .{ .dist = 0, .length = 7 },   .{ .dist = 75, .length = 3 },   .{ .dist = 0, .length = 3 },
        .{ .dist = 45, .length = 6 },  .{ .dist = 0, .length = 6 },    .{ .dist = 299, .length = 6 },
        .{ .dist = 180, .length = 3 }, .{ .dist = 70, .length = 6 },   .{ .dist = 0, .length = 1 },
        .{ .dist = 48, .length = 3 },  .{ .dist = 66, .length = 4 },   .{ .dist = 0, .length = 3 },
        .{ .dist = 47, .length = 5 },  .{ .dist = 0, .length = 9 },    .{ .dist = 325, .length = 3 },
        .{ .dist = 0, .length = 1 },   .{ .dist = 359, .length = 3 },  .{ .dist = 318, .length = 3 },
        .{ .dist = 0, .length = 2 },   .{ .dist = 199, .length = 3 },  .{ .dist = 0, .length = 1 },
        .{ .dist = 344, .length = 3 }, .{ .dist = 0, .length = 3 },    .{ .dist = 248, .length = 3 },
        .{ .dist = 0, .length = 10 },  .{ .dist = 310, .length = 3 },  .{ .dist = 0, .length = 3 },
        .{ .dist = 93, .length = 6 },  .{ .dist = 0, .length = 3 },    .{ .dist = 252, .length = 3 },
        .{ .dist = 157, .length = 4 }, .{ .dist = 0, .length = 2 },    .{ .dist = 273, .length = 5 },
        .{ .dist = 0, .length = 14 },  .{ .dist = 99, .length = 4 },   .{ .dist = 0, .length = 1 },
        .{ .dist = 464, .length = 4 }, .{ .dist = 0, .length = 2 },    .{ .dist = 92, .length = 4 },
        .{ .dist = 495, .length = 3 }, .{ .dist = 0, .length = 1 },    .{ .dist = 322, .length = 4 },
        .{ .dist = 16, .length = 4 },  .{ .dist = 0, .length = 3 },    .{ .dist = 402, .length = 3 },
        .{ .dist = 0, .length = 2 },   .{ .dist = 237, .length = 4 },  .{ .dist = 0, .length = 2 },
        .{ .dist = 432, .length = 4 }, .{ .dist = 0, .length = 1 },    .{ .dist = 483, .length = 5 },
        .{ .dist = 0, .length = 2 },   .{ .dist = 294, .length = 4 },  .{ .dist = 0, .length = 2 },
        .{ .dist = 306, .length = 3 }, .{ .dist = 113, .length = 5 },  .{ .dist = 0, .length = 1 },
        .{ .dist = 26, .length = 4 },  .{ .dist = 164, .length = 3 },  .{ .dist = 488, .length = 4 },
        .{ .dist = 0, .length = 1 },   .{ .dist = 542, .length = 3 },  .{ .dist = 248, .length = 6 },
        .{ .dist = 0, .length = 5 },   .{ .dist = 205, .length = 3 },  .{ .dist = 0, .length = 8 },
        .{ .dist = 48, .length = 3 },  .{ .dist = 449, .length = 6 },  .{ .dist = 0, .length = 2 },
        .{ .dist = 192, .length = 3 }, .{ .dist = 328, .length = 4 },  .{ .dist = 9, .length = 5 },
        .{ .dist = 433, .length = 3 }, .{ .dist = 0, .length = 3 },    .{ .dist = 622, .length = 25 },
        .{ .dist = 615, .length = 5 }, .{ .dist = 46, .length = 5 },   .{ .dist = 0, .length = 2 },
        .{ .dist = 104, .length = 3 }, .{ .dist = 475, .length = 10 }, .{ .dist = 549, .length = 3 },
        .{ .dist = 0, .length = 4 },   .{ .dist = 597, .length = 8 },  .{ .dist = 314, .length = 3 },
        .{ .dist = 0, .length = 1 },   .{ .dist = 473, .length = 6 },  .{ .dist = 317, .length = 5 },
        .{ .dist = 0, .length = 1 },   .{ .dist = 400, .length = 3 },  .{ .dist = 0, .length = 3 },
        .{ .dist = 109, .length = 3 }, .{ .dist = 151, .length = 3 },  .{ .dist = 48, .length = 4 },
        .{ .dist = 0, .length = 4 },   .{ .dist = 125, .length = 3 },  .{ .dist = 108, .length = 3 },
        .{ .dist = 0, .length = 2 },
    };

    // `got` receives whatever the decoder flushes; `want` receives the
    // expected output, written directly.
    var got_list = ArrayList(u8).init(testing.allocator);
    defer got_list.deinit();
    const got = got_list.writer();

    var want_list = ArrayList(u8).init(testing.allocator);
    defer want_list.deinit();
    const want = want_list.writer();

    var dd = DictDecoder{};
    try dd.init(testing.allocator, 1 << 11, null);
    defer dd.deinit();

    const util = struct {
        // Emits a backward copy of `length` bytes at distance `dist`, trying the
        // fast path first and flushing the window to `dst` whenever it fills up.
        fn writeCopy(dst_dd: *DictDecoder, dst: anytype, dist: u32, length: u32) !void {
            var len = length;
            while (len > 0) {
                var n = dst_dd.tryWriteCopy(dist, len);
                if (n == 0) {
                    n = dst_dd.writeCopy(dist, len);
                }

                len -= n;
                if (dst_dd.availWrite() == 0) {
                    // writeAll, so a short write on `dst` cannot drop bytes.
                    try dst.writeAll(dst_dd.readFlush());
                }
            }
        }

        // Inserts `str` as literal bytes, flushing the window to `dst`
        // whenever it fills up.
        fn writeString(dst_dd: *DictDecoder, dst: anytype, str: []const u8) !void {
            var string = str;
            while (string.len > 0) {
                const cnt = DictDecoder.copy(dst_dd.writeSlice(), string);
                dst_dd.writeMark(cnt);
                string = string[cnt..];
                if (dst_dd.availWrite() == 0) {
                    try dst.writeAll(dst_dd.readFlush());
                }
            }
        }
    };

    try util.writeString(&dd, got, ".");
    try want.writeAll(".");

    // Replay the poem. `str` advances past every reference, literal or copy,
    // so that it always points at the text the next literal should insert.
    var str = poem;
    for (poem_refs) |ref| {
        if (ref.dist == 0) {
            try util.writeString(&dd, got, str[0..ref.length]);
        } else {
            try util.writeCopy(&dd, got, ref.dist, ref.length);
        }
        str = str[ref.length..];
    }
    try want.writeAll(poem);

    // Copy from the very start of the history.
    try util.writeCopy(&dd, got, dd.histSize(), 33);
    {
        // Stage the bytes on the stack first: appending a slice of
        // want_list.items to want_list itself would read freed memory if the
        // list had to reallocate while growing.
        var head: [33]u8 = undefined;
        @memcpy(&head, want_list.items[0..33]);
        try want.writeAll(&head);
    }

    try util.writeString(&dd, got, abc);
    try util.writeCopy(&dd, got, abc.len, 59 * abc.len);
    try want.writeAll(abc ** 60);

    try util.writeString(&dd, got, fox);
    try util.writeCopy(&dd, got, fox.len, 9 * fox.len);
    try want.writeAll(fox ** 10);

    try util.writeString(&dd, got, ".");
    try util.writeCopy(&dd, got, 1, 9);
    try want.writeAll("." ** 10);

    try util.writeString(&dd, got, uppercase);
    try util.writeCopy(&dd, got, uppercase.len, 7 * uppercase.len);
    var i: u8 = 0;
    while (i < 8) : (i += 1) {
        try want.writeAll(uppercase);
    }

    // Copy from the oldest byte still held in the history.
    try util.writeCopy(&dd, got, dd.histSize(), 10);
    {
        // Same aliasing hazard as above, so stage the bytes before appending.
        var tail: [10]u8 = undefined;
        @memcpy(&tail, want_list.items[want_list.items.len - dd.histSize() ..][0..10]);
        try want.writeAll(&tail);
    }

    // Drain whatever is still pending in the window.
    try got.writeAll(dd.readFlush());
    try testing.expectEqualSlices(u8, want_list.items, got_list.items);
}