From ea0ce7afb59d7c7ed33f707f3fea3e0babd785b6 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 22 Aug 2025 14:03:44 -0700 Subject: [PATCH 01/12] std.compress: flatten lzma and lzma2 namespaces --- lib/std/compress/lzma.zig | 726 +++++++++++++++++++- lib/std/compress/lzma/decode.zig | 379 ---------- lib/std/compress/lzma/decode/lzbuffer.zig | 228 ------ lib/std/compress/lzma/decode/rangecoder.zig | 181 ----- lib/std/compress/lzma/vec2d.zig | 128 ---- lib/std/compress/lzma2.zig | 268 +++++++- lib/std/compress/lzma2/decode.zig | 169 ----- 7 files changed, 981 insertions(+), 1098 deletions(-) delete mode 100644 lib/std/compress/lzma/decode.zig delete mode 100644 lib/std/compress/lzma/decode/lzbuffer.zig delete mode 100644 lib/std/compress/lzma/decode/rangecoder.zig delete mode 100644 lib/std/compress/lzma/vec2d.zig delete mode 100644 lib/std/compress/lzma2/decode.zig diff --git a/lib/std/compress/lzma.zig b/lib/std/compress/lzma.zig index 5d072f5381..c40eeeb56e 100644 --- a/lib/std/compress/lzma.zig +++ b/lib/std/compress/lzma.zig @@ -2,8 +2,713 @@ const std = @import("../std.zig"); const math = std.math; const mem = std.mem; const Allocator = std.mem.Allocator; +const assert = std.debug.assert; +const ArrayList = std.ArrayList; -pub const decode = @import("lzma/decode.zig"); +pub const RangeDecoder = struct { + range: u32, + code: u32, + + pub fn init(reader: anytype) !RangeDecoder { + const reserved = try reader.readByte(); + if (reserved != 0) { + return error.CorruptInput; + } + return RangeDecoder{ + .range = 0xFFFF_FFFF, + .code = try reader.readInt(u32, .big), + }; + } + + pub fn fromParts( + range: u32, + code: u32, + ) RangeDecoder { + return .{ + .range = range, + .code = code, + }; + } + + pub fn set(self: *RangeDecoder, range: u32, code: u32) void { + self.range = range; + self.code = code; + } + + pub inline fn isFinished(self: RangeDecoder) bool { + return self.code == 0; + } + + inline fn normalize(self: *RangeDecoder, reader: anytype) !void { + 
if (self.range < 0x0100_0000) { + self.range <<= 8; + self.code = (self.code << 8) ^ @as(u32, try reader.readByte()); + } + } + + inline fn getBit(self: *RangeDecoder, reader: anytype) !bool { + self.range >>= 1; + + const bit = self.code >= self.range; + if (bit) + self.code -= self.range; + + try self.normalize(reader); + return bit; + } + + pub fn get(self: *RangeDecoder, reader: anytype, count: usize) !u32 { + var result: u32 = 0; + var i: usize = 0; + while (i < count) : (i += 1) + result = (result << 1) ^ @intFromBool(try self.getBit(reader)); + return result; + } + + pub inline fn decodeBit(self: *RangeDecoder, reader: anytype, prob: *u16, update: bool) !bool { + const bound = (self.range >> 11) * prob.*; + + if (self.code < bound) { + if (update) + prob.* += (0x800 - prob.*) >> 5; + self.range = bound; + + try self.normalize(reader); + return false; + } else { + if (update) + prob.* -= prob.* >> 5; + self.code -= bound; + self.range -= bound; + + try self.normalize(reader); + return true; + } + } + + fn parseBitTree( + self: *RangeDecoder, + reader: anytype, + num_bits: u5, + probs: []u16, + update: bool, + ) !u32 { + var tmp: u32 = 1; + var i: @TypeOf(num_bits) = 0; + while (i < num_bits) : (i += 1) { + const bit = try self.decodeBit(reader, &probs[tmp], update); + tmp = (tmp << 1) ^ @intFromBool(bit); + } + return tmp - (@as(u32, 1) << num_bits); + } + + pub fn parseReverseBitTree( + self: *RangeDecoder, + reader: anytype, + num_bits: u5, + probs: []u16, + offset: usize, + update: bool, + ) !u32 { + var result: u32 = 0; + var tmp: usize = 1; + var i: @TypeOf(num_bits) = 0; + while (i < num_bits) : (i += 1) { + const bit = @intFromBool(try self.decodeBit(reader, &probs[offset + tmp], update)); + tmp = (tmp << 1) ^ bit; + result ^= @as(u32, bit) << i; + } + return result; + } +}; + +pub const Decode = struct { + lzma_props: Properties, + unpacked_size: ?u64, + literal_probs: Vec2d, + pos_slot_decoder: [4]BitTree(6), + align_decoder: BitTree(4), + 
pos_decoders: [115]u16, + is_match: [192]u16, + is_rep: [12]u16, + is_rep_g0: [12]u16, + is_rep_g1: [12]u16, + is_rep_g2: [12]u16, + is_rep_0long: [192]u16, + state: usize, + rep: [4]usize, + len_decoder: LenDecoder, + rep_len_decoder: LenDecoder, + + pub fn init( + allocator: Allocator, + lzma_props: Properties, + unpacked_size: ?u64, + ) !Decode { + return .{ + .lzma_props = lzma_props, + .unpacked_size = unpacked_size, + .literal_probs = try Vec2d.init(allocator, 0x400, .{ @as(usize, 1) << (lzma_props.lc + lzma_props.lp), 0x300 }), + .pos_slot_decoder = @splat(.{}), + .align_decoder = .{}, + .pos_decoders = @splat(0x400), + .is_match = @splat(0x400), + .is_rep = @splat(0x400), + .is_rep_g0 = @splat(0x400), + .is_rep_g1 = @splat(0x400), + .is_rep_g2 = @splat(0x400), + .is_rep_0long = @splat(0x400), + .state = 0, + .rep = @splat(0), + .len_decoder = .{}, + .rep_len_decoder = .{}, + }; + } + + pub fn deinit(self: *Decode, allocator: Allocator) void { + self.literal_probs.deinit(allocator); + self.* = undefined; + } + + pub fn resetState(self: *Decode, allocator: Allocator, new_props: Properties) !void { + new_props.validate(); + if (self.lzma_props.lc + self.lzma_props.lp == new_props.lc + new_props.lp) { + self.literal_probs.fill(0x400); + } else { + self.literal_probs.deinit(allocator); + self.literal_probs = try Vec2d.init(allocator, 0x400, .{ @as(usize, 1) << (new_props.lc + new_props.lp), 0x300 }); + } + + self.lzma_props = new_props; + for (&self.pos_slot_decoder) |*t| t.reset(); + self.align_decoder.reset(); + self.pos_decoders = @splat(0x400); + self.is_match = @splat(0x400); + self.is_rep = @splat(0x400); + self.is_rep_g0 = @splat(0x400); + self.is_rep_g1 = @splat(0x400); + self.is_rep_g2 = @splat(0x400); + self.is_rep_0long = @splat(0x400); + self.state = 0; + self.rep = @splat(0); + self.len_decoder.reset(); + self.rep_len_decoder.reset(); + } + + fn processNextInner( + self: *Decode, + allocator: Allocator, + reader: anytype, + writer: anytype, + 
buffer: anytype, + decoder: *RangeDecoder, + update: bool, + ) !ProcessingStatus { + const pos_state = buffer.len & ((@as(usize, 1) << self.lzma_props.pb) - 1); + + if (!try decoder.decodeBit( + reader, + &self.is_match[(self.state << 4) + pos_state], + update, + )) { + const byte: u8 = try self.decodeLiteral(reader, buffer, decoder, update); + + if (update) { + try buffer.appendLiteral(allocator, byte, writer); + + self.state = if (self.state < 4) + 0 + else if (self.state < 10) + self.state - 3 + else + self.state - 6; + } + return .continue_; + } + + var len: usize = undefined; + if (try decoder.decodeBit(reader, &self.is_rep[self.state], update)) { + if (!try decoder.decodeBit(reader, &self.is_rep_g0[self.state], update)) { + if (!try decoder.decodeBit( + reader, + &self.is_rep_0long[(self.state << 4) + pos_state], + update, + )) { + if (update) { + self.state = if (self.state < 7) 9 else 11; + const dist = self.rep[0] + 1; + try buffer.appendLz(allocator, 1, dist, writer); + } + return .continue_; + } + } else { + const idx: usize = if (!try decoder.decodeBit(reader, &self.is_rep_g1[self.state], update)) + 1 + else if (!try decoder.decodeBit(reader, &self.is_rep_g2[self.state], update)) + 2 + else + 3; + if (update) { + const dist = self.rep[idx]; + var i = idx; + while (i > 0) : (i -= 1) { + self.rep[i] = self.rep[i - 1]; + } + self.rep[0] = dist; + } + } + + len = try self.rep_len_decoder.decode(reader, decoder, pos_state, update); + + if (update) { + self.state = if (self.state < 7) 8 else 11; + } + } else { + if (update) { + self.rep[3] = self.rep[2]; + self.rep[2] = self.rep[1]; + self.rep[1] = self.rep[0]; + } + + len = try self.len_decoder.decode(reader, decoder, pos_state, update); + + if (update) { + self.state = if (self.state < 7) 7 else 10; + } + + const rep_0 = try self.decodeDistance(reader, decoder, len, update); + + if (update) { + self.rep[0] = rep_0; + if (self.rep[0] == 0xFFFF_FFFF) { + if (decoder.isFinished()) { + return .finished; + } + 
return error.CorruptInput; + } + } + } + + if (update) { + len += 2; + + const dist = self.rep[0] + 1; + try buffer.appendLz(allocator, len, dist, writer); + } + + return .continue_; + } + + fn processNext( + self: *Decode, + allocator: Allocator, + reader: anytype, + writer: anytype, + buffer: anytype, + decoder: *RangeDecoder, + ) !ProcessingStatus { + return self.processNextInner(allocator, reader, writer, buffer, decoder, true); + } + + pub fn process( + self: *Decode, + allocator: Allocator, + reader: anytype, + writer: anytype, + buffer: anytype, + decoder: *RangeDecoder, + ) !ProcessingStatus { + process_next: { + if (self.unpacked_size) |unpacked_size| { + if (buffer.len >= unpacked_size) { + break :process_next; + } + } else if (decoder.isFinished()) { + break :process_next; + } + + switch (try self.processNext(allocator, reader, writer, buffer, decoder)) { + .continue_ => return .continue_, + .finished => break :process_next, + } + } + + if (self.unpacked_size) |unpacked_size| { + if (buffer.len != unpacked_size) { + return error.CorruptInput; + } + } + + return .finished; + } + + fn decodeLiteral( + self: *Decode, + reader: anytype, + buffer: anytype, + decoder: *RangeDecoder, + update: bool, + ) !u8 { + const def_prev_byte = 0; + const prev_byte = @as(usize, buffer.lastOr(def_prev_byte)); + + var result: usize = 1; + const lit_state = ((buffer.len & ((@as(usize, 1) << self.lzma_props.lp) - 1)) << self.lzma_props.lc) + + (prev_byte >> (8 - self.lzma_props.lc)); + const probs = try self.literal_probs.getMut(lit_state); + + if (self.state >= 7) { + var match_byte = @as(usize, try buffer.lastN(self.rep[0] + 1)); + + while (result < 0x100) { + const match_bit = (match_byte >> 7) & 1; + match_byte <<= 1; + const bit = @intFromBool(try decoder.decodeBit( + reader, + &probs[((@as(usize, 1) + match_bit) << 8) + result], + update, + )); + result = (result << 1) ^ bit; + if (match_bit != bit) { + break; + } + } + } + + while (result < 0x100) { + result = (result 
<< 1) ^ @intFromBool(try decoder.decodeBit(reader, &probs[result], update)); + } + + return @as(u8, @truncate(result - 0x100)); + } + + fn decodeDistance( + self: *Decode, + reader: anytype, + decoder: *RangeDecoder, + length: usize, + update: bool, + ) !usize { + const len_state = if (length > 3) 3 else length; + + const pos_slot = @as(usize, try self.pos_slot_decoder[len_state].parse(reader, decoder, update)); + if (pos_slot < 4) + return pos_slot; + + const num_direct_bits = @as(u5, @intCast((pos_slot >> 1) - 1)); + var result = (2 ^ (pos_slot & 1)) << num_direct_bits; + + if (pos_slot < 14) { + result += try decoder.parseReverseBitTree( + reader, + num_direct_bits, + &self.pos_decoders, + result - pos_slot, + update, + ); + } else { + result += @as(usize, try decoder.get(reader, num_direct_bits - 4)) << 4; + result += try self.align_decoder.parseReverse(reader, decoder, update); + } + + return result; + } + + /// A circular buffer for LZ sequences + pub const LzCircularBuffer = struct { + /// Circular buffer + buf: ArrayList(u8), + + /// Length of the buffer + dict_size: usize, + + /// Buffer memory limit + memlimit: usize, + + /// Current position + cursor: usize, + + /// Total number of bytes sent through the buffer + len: usize, + + const Self = @This(); + + pub fn init(dict_size: usize, memlimit: usize) Self { + return Self{ + .buf = .{}, + .dict_size = dict_size, + .memlimit = memlimit, + .cursor = 0, + .len = 0, + }; + } + + pub fn get(self: Self, index: usize) u8 { + return if (0 <= index and index < self.buf.items.len) + self.buf.items[index] + else + 0; + } + + pub fn set(self: *Self, allocator: Allocator, index: usize, value: u8) !void { + if (index >= self.memlimit) { + return error.CorruptInput; + } + try self.buf.ensureTotalCapacity(allocator, index + 1); + while (self.buf.items.len < index) { + self.buf.appendAssumeCapacity(0); + } + self.buf.appendAssumeCapacity(value); + } + + /// Retrieve the last byte or return a default + pub fn lastOr(self: 
Self, lit: u8) u8 { + return if (self.len == 0) + lit + else + self.get((self.dict_size + self.cursor - 1) % self.dict_size); + } + + /// Retrieve the n-th last byte + pub fn lastN(self: Self, dist: usize) !u8 { + if (dist > self.dict_size or dist > self.len) { + return error.CorruptInput; + } + + const offset = (self.dict_size + self.cursor - dist) % self.dict_size; + return self.get(offset); + } + + /// Append a literal + pub fn appendLiteral( + self: *Self, + allocator: Allocator, + lit: u8, + writer: anytype, + ) !void { + try self.set(allocator, self.cursor, lit); + self.cursor += 1; + self.len += 1; + + // Flush the circular buffer to the output + if (self.cursor == self.dict_size) { + try writer.writeAll(self.buf.items); + self.cursor = 0; + } + } + + /// Fetch an LZ sequence (length, distance) from inside the buffer + pub fn appendLz( + self: *Self, + allocator: Allocator, + len: usize, + dist: usize, + writer: anytype, + ) !void { + if (dist > self.dict_size or dist > self.len) { + return error.CorruptInput; + } + + var offset = (self.dict_size + self.cursor - dist) % self.dict_size; + var i: usize = 0; + while (i < len) : (i += 1) { + const x = self.get(offset); + try self.appendLiteral(allocator, x, writer); + offset += 1; + if (offset == self.dict_size) { + offset = 0; + } + } + } + + pub fn finish(self: *Self, writer: anytype) !void { + if (self.cursor > 0) { + try writer.writeAll(self.buf.items[0..self.cursor]); + self.cursor = 0; + } + } + + pub fn deinit(self: *Self, allocator: Allocator) void { + self.buf.deinit(allocator); + self.* = undefined; + } + }; + + pub fn BitTree(comptime num_bits: usize) type { + return struct { + probs: [1 << num_bits]u16 = @splat(0x400), + + const Self = @This(); + + pub fn parse( + self: *Self, + reader: anytype, + decoder: *RangeDecoder, + update: bool, + ) !u32 { + return decoder.parseBitTree(reader, num_bits, &self.probs, update); + } + + pub fn parseReverse( + self: *Self, + reader: anytype, + decoder: 
*RangeDecoder, + update: bool, + ) !u32 { + return decoder.parseReverseBitTree(reader, num_bits, &self.probs, 0, update); + } + + pub fn reset(self: *Self) void { + @memset(&self.probs, 0x400); + } + }; + } + + pub const LenDecoder = struct { + choice: u16 = 0x400, + choice2: u16 = 0x400, + low_coder: [16]BitTree(3) = @splat(.{}), + mid_coder: [16]BitTree(3) = @splat(.{}), + high_coder: BitTree(8) = .{}, + + pub fn decode( + self: *LenDecoder, + reader: anytype, + decoder: *RangeDecoder, + pos_state: usize, + update: bool, + ) !usize { + if (!try decoder.decodeBit(reader, &self.choice, update)) { + return @as(usize, try self.low_coder[pos_state].parse(reader, decoder, update)); + } else if (!try decoder.decodeBit(reader, &self.choice2, update)) { + return @as(usize, try self.mid_coder[pos_state].parse(reader, decoder, update)) + 8; + } else { + return @as(usize, try self.high_coder.parse(reader, decoder, update)) + 16; + } + } + + pub fn reset(self: *LenDecoder) void { + self.choice = 0x400; + self.choice2 = 0x400; + for (&self.low_coder) |*t| t.reset(); + for (&self.mid_coder) |*t| t.reset(); + self.high_coder.reset(); + } + }; + + pub const Vec2d = struct { + data: []u16, + cols: usize, + + const Self = @This(); + + pub fn init(allocator: Allocator, value: u16, size: struct { usize, usize }) !Self { + const len = try math.mul(usize, size[0], size[1]); + const data = try allocator.alloc(u16, len); + @memset(data, value); + return Self{ + .data = data, + .cols = size[1], + }; + } + + pub fn deinit(self: *Self, allocator: Allocator) void { + allocator.free(self.data); + self.* = undefined; + } + + pub fn fill(self: *Self, value: u16) void { + @memset(self.data, value); + } + + inline fn _get(self: Self, row: usize) ![]u16 { + const start_row = try math.mul(usize, row, self.cols); + const end_row = try math.add(usize, start_row, self.cols); + return self.data[start_row..end_row]; + } + + pub fn get(self: Self, row: usize) ![]const u16 { + return self._get(row); + } + 
+ pub fn getMut(self: *Self, row: usize) ![]u16 { + return self._get(row); + } + }; + + pub const Options = struct { + unpacked_size: UnpackedSize = .read_from_header, + memlimit: ?usize = null, + allow_incomplete: bool = false, + }; + + pub const UnpackedSize = union(enum) { + read_from_header, + read_header_but_use_provided: ?u64, + use_provided: ?u64, + }; + + const ProcessingStatus = enum { + continue_, + finished, + }; + + pub const Properties = struct { + lc: u4, + lp: u3, + pb: u3, + + fn validate(self: Properties) void { + assert(self.lc <= 8); + assert(self.lp <= 4); + assert(self.pb <= 4); + } + }; + + pub const Params = struct { + properties: Properties, + dict_size: u32, + unpacked_size: ?u64, + + pub fn readHeader(reader: anytype, options: Options) !Params { + var props = try reader.readByte(); + if (props >= 225) { + return error.CorruptInput; + } + + const lc = @as(u4, @intCast(props % 9)); + props /= 9; + const lp = @as(u3, @intCast(props % 5)); + props /= 5; + const pb = @as(u3, @intCast(props)); + + const dict_size_provided = try reader.readInt(u32, .little); + const dict_size = @max(0x1000, dict_size_provided); + + const unpacked_size = switch (options.unpacked_size) { + .read_from_header => blk: { + const unpacked_size_provided = try reader.readInt(u64, .little); + const marker_mandatory = unpacked_size_provided == 0xFFFF_FFFF_FFFF_FFFF; + break :blk if (marker_mandatory) + null + else + unpacked_size_provided; + }, + .read_header_but_use_provided => |x| blk: { + _ = try reader.readInt(u64, .little); + break :blk x; + }, + .use_provided => |x| x, + }; + + return Params{ + .properties = Properties{ .lc = lc, .lp = lp, .pb = pb }, + .dict_size = dict_size, + .unpacked_size = unpacked_size, + }; + } + }; +}; pub fn decompress( allocator: Allocator, @@ -15,9 +720,9 @@ pub fn decompress( pub fn decompressWithOptions( allocator: Allocator, reader: anytype, - options: decode.Options, + options: Decode.Options, ) !Decompress(@TypeOf(reader)) { - const 
params = try decode.Params.readHeader(reader, options); + const params = try Decode.Params.readHeader(reader, options); return Decompress(@TypeOf(reader)).init(allocator, reader, params, options.memlimit); } @@ -36,19 +741,19 @@ pub fn Decompress(comptime ReaderType: type) type { in_reader: ReaderType, to_read: std.ArrayListUnmanaged(u8), - buffer: decode.lzbuffer.LzCircularBuffer, - decoder: decode.rangecoder.RangeDecoder, - state: decode.DecoderState, + buffer: Decode.LzCircularBuffer, + decoder: RangeDecoder, + state: Decode, - pub fn init(allocator: Allocator, source: ReaderType, params: decode.Params, memlimit: ?usize) !Self { + pub fn init(allocator: Allocator, source: ReaderType, params: Decode.Params, memlimit: ?usize) !Self { return Self{ .allocator = allocator, .in_reader = source, .to_read = .{}, - .buffer = decode.lzbuffer.LzCircularBuffer.init(params.dict_size, memlimit orelse math.maxInt(usize)), - .decoder = try decode.rangecoder.RangeDecoder.init(source), - .state = try decode.DecoderState.init(allocator, params.properties, params.unpacked_size), + .buffer = Decode.LzCircularBuffer.init(params.dict_size, memlimit orelse math.maxInt(usize)), + .decoder = try RangeDecoder.init(source), + .state = try Decode.init(allocator, params.properties, params.unpacked_size), }; } @@ -86,5 +791,4 @@ pub fn Decompress(comptime ReaderType: type) type { test { _ = @import("lzma/test.zig"); - _ = @import("lzma/vec2d.zig"); } diff --git a/lib/std/compress/lzma/decode.zig b/lib/std/compress/lzma/decode.zig deleted file mode 100644 index ac45eb52b3..0000000000 --- a/lib/std/compress/lzma/decode.zig +++ /dev/null @@ -1,379 +0,0 @@ -const std = @import("../../std.zig"); -const assert = std.debug.assert; -const math = std.math; -const Allocator = std.mem.Allocator; - -pub const lzbuffer = @import("decode/lzbuffer.zig"); -pub const rangecoder = @import("decode/rangecoder.zig"); - -const LzCircularBuffer = lzbuffer.LzCircularBuffer; -const BitTree = rangecoder.BitTree; 
-const LenDecoder = rangecoder.LenDecoder; -const RangeDecoder = rangecoder.RangeDecoder; -const Vec2D = @import("vec2d.zig").Vec2D; - -pub const Options = struct { - unpacked_size: UnpackedSize = .read_from_header, - memlimit: ?usize = null, - allow_incomplete: bool = false, -}; - -pub const UnpackedSize = union(enum) { - read_from_header, - read_header_but_use_provided: ?u64, - use_provided: ?u64, -}; - -const ProcessingStatus = enum { - continue_, - finished, -}; - -pub const Properties = struct { - lc: u4, - lp: u3, - pb: u3, - - fn validate(self: Properties) void { - assert(self.lc <= 8); - assert(self.lp <= 4); - assert(self.pb <= 4); - } -}; - -pub const Params = struct { - properties: Properties, - dict_size: u32, - unpacked_size: ?u64, - - pub fn readHeader(reader: anytype, options: Options) !Params { - var props = try reader.readByte(); - if (props >= 225) { - return error.CorruptInput; - } - - const lc = @as(u4, @intCast(props % 9)); - props /= 9; - const lp = @as(u3, @intCast(props % 5)); - props /= 5; - const pb = @as(u3, @intCast(props)); - - const dict_size_provided = try reader.readInt(u32, .little); - const dict_size = @max(0x1000, dict_size_provided); - - const unpacked_size = switch (options.unpacked_size) { - .read_from_header => blk: { - const unpacked_size_provided = try reader.readInt(u64, .little); - const marker_mandatory = unpacked_size_provided == 0xFFFF_FFFF_FFFF_FFFF; - break :blk if (marker_mandatory) - null - else - unpacked_size_provided; - }, - .read_header_but_use_provided => |x| blk: { - _ = try reader.readInt(u64, .little); - break :blk x; - }, - .use_provided => |x| x, - }; - - return Params{ - .properties = Properties{ .lc = lc, .lp = lp, .pb = pb }, - .dict_size = dict_size, - .unpacked_size = unpacked_size, - }; - } -}; - -pub const DecoderState = struct { - lzma_props: Properties, - unpacked_size: ?u64, - literal_probs: Vec2D(u16), - pos_slot_decoder: [4]BitTree(6), - align_decoder: BitTree(4), - pos_decoders: [115]u16, - 
is_match: [192]u16, - is_rep: [12]u16, - is_rep_g0: [12]u16, - is_rep_g1: [12]u16, - is_rep_g2: [12]u16, - is_rep_0long: [192]u16, - state: usize, - rep: [4]usize, - len_decoder: LenDecoder, - rep_len_decoder: LenDecoder, - - pub fn init( - allocator: Allocator, - lzma_props: Properties, - unpacked_size: ?u64, - ) !DecoderState { - return .{ - .lzma_props = lzma_props, - .unpacked_size = unpacked_size, - .literal_probs = try Vec2D(u16).init(allocator, 0x400, .{ @as(usize, 1) << (lzma_props.lc + lzma_props.lp), 0x300 }), - .pos_slot_decoder = @splat(.{}), - .align_decoder = .{}, - .pos_decoders = @splat(0x400), - .is_match = @splat(0x400), - .is_rep = @splat(0x400), - .is_rep_g0 = @splat(0x400), - .is_rep_g1 = @splat(0x400), - .is_rep_g2 = @splat(0x400), - .is_rep_0long = @splat(0x400), - .state = 0, - .rep = @splat(0), - .len_decoder = .{}, - .rep_len_decoder = .{}, - }; - } - - pub fn deinit(self: *DecoderState, allocator: Allocator) void { - self.literal_probs.deinit(allocator); - self.* = undefined; - } - - pub fn resetState(self: *DecoderState, allocator: Allocator, new_props: Properties) !void { - new_props.validate(); - if (self.lzma_props.lc + self.lzma_props.lp == new_props.lc + new_props.lp) { - self.literal_probs.fill(0x400); - } else { - self.literal_probs.deinit(allocator); - self.literal_probs = try Vec2D(u16).init(allocator, 0x400, .{ @as(usize, 1) << (new_props.lc + new_props.lp), 0x300 }); - } - - self.lzma_props = new_props; - for (&self.pos_slot_decoder) |*t| t.reset(); - self.align_decoder.reset(); - self.pos_decoders = @splat(0x400); - self.is_match = @splat(0x400); - self.is_rep = @splat(0x400); - self.is_rep_g0 = @splat(0x400); - self.is_rep_g1 = @splat(0x400); - self.is_rep_g2 = @splat(0x400); - self.is_rep_0long = @splat(0x400); - self.state = 0; - self.rep = @splat(0); - self.len_decoder.reset(); - self.rep_len_decoder.reset(); - } - - fn processNextInner( - self: *DecoderState, - allocator: Allocator, - reader: anytype, - writer: anytype, 
- buffer: anytype, - decoder: *RangeDecoder, - update: bool, - ) !ProcessingStatus { - const pos_state = buffer.len & ((@as(usize, 1) << self.lzma_props.pb) - 1); - - if (!try decoder.decodeBit( - reader, - &self.is_match[(self.state << 4) + pos_state], - update, - )) { - const byte: u8 = try self.decodeLiteral(reader, buffer, decoder, update); - - if (update) { - try buffer.appendLiteral(allocator, byte, writer); - - self.state = if (self.state < 4) - 0 - else if (self.state < 10) - self.state - 3 - else - self.state - 6; - } - return .continue_; - } - - var len: usize = undefined; - if (try decoder.decodeBit(reader, &self.is_rep[self.state], update)) { - if (!try decoder.decodeBit(reader, &self.is_rep_g0[self.state], update)) { - if (!try decoder.decodeBit( - reader, - &self.is_rep_0long[(self.state << 4) + pos_state], - update, - )) { - if (update) { - self.state = if (self.state < 7) 9 else 11; - const dist = self.rep[0] + 1; - try buffer.appendLz(allocator, 1, dist, writer); - } - return .continue_; - } - } else { - const idx: usize = if (!try decoder.decodeBit(reader, &self.is_rep_g1[self.state], update)) - 1 - else if (!try decoder.decodeBit(reader, &self.is_rep_g2[self.state], update)) - 2 - else - 3; - if (update) { - const dist = self.rep[idx]; - var i = idx; - while (i > 0) : (i -= 1) { - self.rep[i] = self.rep[i - 1]; - } - self.rep[0] = dist; - } - } - - len = try self.rep_len_decoder.decode(reader, decoder, pos_state, update); - - if (update) { - self.state = if (self.state < 7) 8 else 11; - } - } else { - if (update) { - self.rep[3] = self.rep[2]; - self.rep[2] = self.rep[1]; - self.rep[1] = self.rep[0]; - } - - len = try self.len_decoder.decode(reader, decoder, pos_state, update); - - if (update) { - self.state = if (self.state < 7) 7 else 10; - } - - const rep_0 = try self.decodeDistance(reader, decoder, len, update); - - if (update) { - self.rep[0] = rep_0; - if (self.rep[0] == 0xFFFF_FFFF) { - if (decoder.isFinished()) { - return .finished; - } - 
return error.CorruptInput; - } - } - } - - if (update) { - len += 2; - - const dist = self.rep[0] + 1; - try buffer.appendLz(allocator, len, dist, writer); - } - - return .continue_; - } - - fn processNext( - self: *DecoderState, - allocator: Allocator, - reader: anytype, - writer: anytype, - buffer: anytype, - decoder: *RangeDecoder, - ) !ProcessingStatus { - return self.processNextInner(allocator, reader, writer, buffer, decoder, true); - } - - pub fn process( - self: *DecoderState, - allocator: Allocator, - reader: anytype, - writer: anytype, - buffer: anytype, - decoder: *RangeDecoder, - ) !ProcessingStatus { - process_next: { - if (self.unpacked_size) |unpacked_size| { - if (buffer.len >= unpacked_size) { - break :process_next; - } - } else if (decoder.isFinished()) { - break :process_next; - } - - switch (try self.processNext(allocator, reader, writer, buffer, decoder)) { - .continue_ => return .continue_, - .finished => break :process_next, - } - } - - if (self.unpacked_size) |unpacked_size| { - if (buffer.len != unpacked_size) { - return error.CorruptInput; - } - } - - return .finished; - } - - fn decodeLiteral( - self: *DecoderState, - reader: anytype, - buffer: anytype, - decoder: *RangeDecoder, - update: bool, - ) !u8 { - const def_prev_byte = 0; - const prev_byte = @as(usize, buffer.lastOr(def_prev_byte)); - - var result: usize = 1; - const lit_state = ((buffer.len & ((@as(usize, 1) << self.lzma_props.lp) - 1)) << self.lzma_props.lc) + - (prev_byte >> (8 - self.lzma_props.lc)); - const probs = try self.literal_probs.getMut(lit_state); - - if (self.state >= 7) { - var match_byte = @as(usize, try buffer.lastN(self.rep[0] + 1)); - - while (result < 0x100) { - const match_bit = (match_byte >> 7) & 1; - match_byte <<= 1; - const bit = @intFromBool(try decoder.decodeBit( - reader, - &probs[((@as(usize, 1) + match_bit) << 8) + result], - update, - )); - result = (result << 1) ^ bit; - if (match_bit != bit) { - break; - } - } - } - - while (result < 0x100) { - 
result = (result << 1) ^ @intFromBool(try decoder.decodeBit(reader, &probs[result], update)); - } - - return @as(u8, @truncate(result - 0x100)); - } - - fn decodeDistance( - self: *DecoderState, - reader: anytype, - decoder: *RangeDecoder, - length: usize, - update: bool, - ) !usize { - const len_state = if (length > 3) 3 else length; - - const pos_slot = @as(usize, try self.pos_slot_decoder[len_state].parse(reader, decoder, update)); - if (pos_slot < 4) - return pos_slot; - - const num_direct_bits = @as(u5, @intCast((pos_slot >> 1) - 1)); - var result = (2 ^ (pos_slot & 1)) << num_direct_bits; - - if (pos_slot < 14) { - result += try decoder.parseReverseBitTree( - reader, - num_direct_bits, - &self.pos_decoders, - result - pos_slot, - update, - ); - } else { - result += @as(usize, try decoder.get(reader, num_direct_bits - 4)) << 4; - result += try self.align_decoder.parseReverse(reader, decoder, update); - } - - return result; - } -}; diff --git a/lib/std/compress/lzma/decode/lzbuffer.zig b/lib/std/compress/lzma/decode/lzbuffer.zig deleted file mode 100644 index 80c470c5f9..0000000000 --- a/lib/std/compress/lzma/decode/lzbuffer.zig +++ /dev/null @@ -1,228 +0,0 @@ -const std = @import("../../../std.zig"); -const math = std.math; -const mem = std.mem; -const Allocator = std.mem.Allocator; -const ArrayListUnmanaged = std.ArrayListUnmanaged; - -/// An accumulating buffer for LZ sequences -pub const LzAccumBuffer = struct { - /// Buffer - buf: ArrayListUnmanaged(u8), - - /// Buffer memory limit - memlimit: usize, - - /// Total number of bytes sent through the buffer - len: usize, - - const Self = @This(); - - pub fn init(memlimit: usize) Self { - return Self{ - .buf = .{}, - .memlimit = memlimit, - .len = 0, - }; - } - - pub fn appendByte(self: *Self, allocator: Allocator, byte: u8) !void { - try self.buf.append(allocator, byte); - self.len += 1; - } - - /// Reset the internal dictionary - pub fn reset(self: *Self, writer: anytype) !void { - try 
writer.writeAll(self.buf.items); - self.buf.clearRetainingCapacity(); - self.len = 0; - } - - /// Retrieve the last byte or return a default - pub fn lastOr(self: Self, lit: u8) u8 { - const buf_len = self.buf.items.len; - return if (buf_len == 0) - lit - else - self.buf.items[buf_len - 1]; - } - - /// Retrieve the n-th last byte - pub fn lastN(self: Self, dist: usize) !u8 { - const buf_len = self.buf.items.len; - if (dist > buf_len) { - return error.CorruptInput; - } - - return self.buf.items[buf_len - dist]; - } - - /// Append a literal - pub fn appendLiteral( - self: *Self, - allocator: Allocator, - lit: u8, - writer: anytype, - ) !void { - _ = writer; - if (self.len >= self.memlimit) { - return error.CorruptInput; - } - try self.buf.append(allocator, lit); - self.len += 1; - } - - /// Fetch an LZ sequence (length, distance) from inside the buffer - pub fn appendLz( - self: *Self, - allocator: Allocator, - len: usize, - dist: usize, - writer: anytype, - ) !void { - _ = writer; - - const buf_len = self.buf.items.len; - if (dist > buf_len) { - return error.CorruptInput; - } - - var offset = buf_len - dist; - var i: usize = 0; - while (i < len) : (i += 1) { - const x = self.buf.items[offset]; - try self.buf.append(allocator, x); - offset += 1; - } - self.len += len; - } - - pub fn finish(self: *Self, writer: anytype) !void { - try writer.writeAll(self.buf.items); - self.buf.clearRetainingCapacity(); - } - - pub fn deinit(self: *Self, allocator: Allocator) void { - self.buf.deinit(allocator); - self.* = undefined; - } -}; - -/// A circular buffer for LZ sequences -pub const LzCircularBuffer = struct { - /// Circular buffer - buf: ArrayListUnmanaged(u8), - - /// Length of the buffer - dict_size: usize, - - /// Buffer memory limit - memlimit: usize, - - /// Current position - cursor: usize, - - /// Total number of bytes sent through the buffer - len: usize, - - const Self = @This(); - - pub fn init(dict_size: usize, memlimit: usize) Self { - return Self{ - .buf = .{}, 
- .dict_size = dict_size, - .memlimit = memlimit, - .cursor = 0, - .len = 0, - }; - } - - pub fn get(self: Self, index: usize) u8 { - return if (0 <= index and index < self.buf.items.len) - self.buf.items[index] - else - 0; - } - - pub fn set(self: *Self, allocator: Allocator, index: usize, value: u8) !void { - if (index >= self.memlimit) { - return error.CorruptInput; - } - try self.buf.ensureTotalCapacity(allocator, index + 1); - while (self.buf.items.len < index) { - self.buf.appendAssumeCapacity(0); - } - self.buf.appendAssumeCapacity(value); - } - - /// Retrieve the last byte or return a default - pub fn lastOr(self: Self, lit: u8) u8 { - return if (self.len == 0) - lit - else - self.get((self.dict_size + self.cursor - 1) % self.dict_size); - } - - /// Retrieve the n-th last byte - pub fn lastN(self: Self, dist: usize) !u8 { - if (dist > self.dict_size or dist > self.len) { - return error.CorruptInput; - } - - const offset = (self.dict_size + self.cursor - dist) % self.dict_size; - return self.get(offset); - } - - /// Append a literal - pub fn appendLiteral( - self: *Self, - allocator: Allocator, - lit: u8, - writer: anytype, - ) !void { - try self.set(allocator, self.cursor, lit); - self.cursor += 1; - self.len += 1; - - // Flush the circular buffer to the output - if (self.cursor == self.dict_size) { - try writer.writeAll(self.buf.items); - self.cursor = 0; - } - } - - /// Fetch an LZ sequence (length, distance) from inside the buffer - pub fn appendLz( - self: *Self, - allocator: Allocator, - len: usize, - dist: usize, - writer: anytype, - ) !void { - if (dist > self.dict_size or dist > self.len) { - return error.CorruptInput; - } - - var offset = (self.dict_size + self.cursor - dist) % self.dict_size; - var i: usize = 0; - while (i < len) : (i += 1) { - const x = self.get(offset); - try self.appendLiteral(allocator, x, writer); - offset += 1; - if (offset == self.dict_size) { - offset = 0; - } - } - } - - pub fn finish(self: *Self, writer: anytype) !void { 
- if (self.cursor > 0) { - try writer.writeAll(self.buf.items[0..self.cursor]); - self.cursor = 0; - } - } - - pub fn deinit(self: *Self, allocator: Allocator) void { - self.buf.deinit(allocator); - self.* = undefined; - } -}; diff --git a/lib/std/compress/lzma/decode/rangecoder.zig b/lib/std/compress/lzma/decode/rangecoder.zig deleted file mode 100644 index 01930884d7..0000000000 --- a/lib/std/compress/lzma/decode/rangecoder.zig +++ /dev/null @@ -1,181 +0,0 @@ -const std = @import("../../../std.zig"); -const mem = std.mem; - -pub const RangeDecoder = struct { - range: u32, - code: u32, - - pub fn init(reader: anytype) !RangeDecoder { - const reserved = try reader.readByte(); - if (reserved != 0) { - return error.CorruptInput; - } - return RangeDecoder{ - .range = 0xFFFF_FFFF, - .code = try reader.readInt(u32, .big), - }; - } - - pub fn fromParts( - range: u32, - code: u32, - ) RangeDecoder { - return .{ - .range = range, - .code = code, - }; - } - - pub fn set(self: *RangeDecoder, range: u32, code: u32) void { - self.range = range; - self.code = code; - } - - pub inline fn isFinished(self: RangeDecoder) bool { - return self.code == 0; - } - - inline fn normalize(self: *RangeDecoder, reader: anytype) !void { - if (self.range < 0x0100_0000) { - self.range <<= 8; - self.code = (self.code << 8) ^ @as(u32, try reader.readByte()); - } - } - - inline fn getBit(self: *RangeDecoder, reader: anytype) !bool { - self.range >>= 1; - - const bit = self.code >= self.range; - if (bit) - self.code -= self.range; - - try self.normalize(reader); - return bit; - } - - pub fn get(self: *RangeDecoder, reader: anytype, count: usize) !u32 { - var result: u32 = 0; - var i: usize = 0; - while (i < count) : (i += 1) - result = (result << 1) ^ @intFromBool(try self.getBit(reader)); - return result; - } - - pub inline fn decodeBit(self: *RangeDecoder, reader: anytype, prob: *u16, update: bool) !bool { - const bound = (self.range >> 11) * prob.*; - - if (self.code < bound) { - if (update) - 
prob.* += (0x800 - prob.*) >> 5; - self.range = bound; - - try self.normalize(reader); - return false; - } else { - if (update) - prob.* -= prob.* >> 5; - self.code -= bound; - self.range -= bound; - - try self.normalize(reader); - return true; - } - } - - fn parseBitTree( - self: *RangeDecoder, - reader: anytype, - num_bits: u5, - probs: []u16, - update: bool, - ) !u32 { - var tmp: u32 = 1; - var i: @TypeOf(num_bits) = 0; - while (i < num_bits) : (i += 1) { - const bit = try self.decodeBit(reader, &probs[tmp], update); - tmp = (tmp << 1) ^ @intFromBool(bit); - } - return tmp - (@as(u32, 1) << num_bits); - } - - pub fn parseReverseBitTree( - self: *RangeDecoder, - reader: anytype, - num_bits: u5, - probs: []u16, - offset: usize, - update: bool, - ) !u32 { - var result: u32 = 0; - var tmp: usize = 1; - var i: @TypeOf(num_bits) = 0; - while (i < num_bits) : (i += 1) { - const bit = @intFromBool(try self.decodeBit(reader, &probs[offset + tmp], update)); - tmp = (tmp << 1) ^ bit; - result ^= @as(u32, bit) << i; - } - return result; - } -}; - -pub fn BitTree(comptime num_bits: usize) type { - return struct { - probs: [1 << num_bits]u16 = @splat(0x400), - - const Self = @This(); - - pub fn parse( - self: *Self, - reader: anytype, - decoder: *RangeDecoder, - update: bool, - ) !u32 { - return decoder.parseBitTree(reader, num_bits, &self.probs, update); - } - - pub fn parseReverse( - self: *Self, - reader: anytype, - decoder: *RangeDecoder, - update: bool, - ) !u32 { - return decoder.parseReverseBitTree(reader, num_bits, &self.probs, 0, update); - } - - pub fn reset(self: *Self) void { - @memset(&self.probs, 0x400); - } - }; -} - -pub const LenDecoder = struct { - choice: u16 = 0x400, - choice2: u16 = 0x400, - low_coder: [16]BitTree(3) = @splat(.{}), - mid_coder: [16]BitTree(3) = @splat(.{}), - high_coder: BitTree(8) = .{}, - - pub fn decode( - self: *LenDecoder, - reader: anytype, - decoder: *RangeDecoder, - pos_state: usize, - update: bool, - ) !usize { - if (!try 
decoder.decodeBit(reader, &self.choice, update)) { - return @as(usize, try self.low_coder[pos_state].parse(reader, decoder, update)); - } else if (!try decoder.decodeBit(reader, &self.choice2, update)) { - return @as(usize, try self.mid_coder[pos_state].parse(reader, decoder, update)) + 8; - } else { - return @as(usize, try self.high_coder.parse(reader, decoder, update)) + 16; - } - } - - pub fn reset(self: *LenDecoder) void { - self.choice = 0x400; - self.choice2 = 0x400; - for (&self.low_coder) |*t| t.reset(); - for (&self.mid_coder) |*t| t.reset(); - self.high_coder.reset(); - } -}; diff --git a/lib/std/compress/lzma/vec2d.zig b/lib/std/compress/lzma/vec2d.zig deleted file mode 100644 index df61093b85..0000000000 --- a/lib/std/compress/lzma/vec2d.zig +++ /dev/null @@ -1,128 +0,0 @@ -const std = @import("../../std.zig"); -const math = std.math; -const mem = std.mem; -const Allocator = std.mem.Allocator; - -pub fn Vec2D(comptime T: type) type { - return struct { - data: []T, - cols: usize, - - const Self = @This(); - - pub fn init(allocator: Allocator, value: T, size: struct { usize, usize }) !Self { - const len = try math.mul(usize, size[0], size[1]); - const data = try allocator.alloc(T, len); - @memset(data, value); - return Self{ - .data = data, - .cols = size[1], - }; - } - - pub fn deinit(self: *Self, allocator: Allocator) void { - allocator.free(self.data); - self.* = undefined; - } - - pub fn fill(self: *Self, value: T) void { - @memset(self.data, value); - } - - inline fn _get(self: Self, row: usize) ![]T { - const start_row = try math.mul(usize, row, self.cols); - const end_row = try math.add(usize, start_row, self.cols); - return self.data[start_row..end_row]; - } - - pub fn get(self: Self, row: usize) ![]const T { - return self._get(row); - } - - pub fn getMut(self: *Self, row: usize) ![]T { - return self._get(row); - } - }; -} - -const testing = std.testing; -const expectEqualSlices = std.testing.expectEqualSlices; -const expectError = 
std.testing.expectError; - -test "init" { - const allocator = testing.allocator; - var vec2d = try Vec2D(i32).init(allocator, 1, .{ 2, 3 }); - defer vec2d.deinit(allocator); - - try expectEqualSlices(i32, &.{ 1, 1, 1 }, try vec2d.get(0)); - try expectEqualSlices(i32, &.{ 1, 1, 1 }, try vec2d.get(1)); -} - -test "init overflow" { - const allocator = testing.allocator; - try expectError( - error.Overflow, - Vec2D(i32).init(allocator, 1, .{ math.maxInt(usize), math.maxInt(usize) }), - ); -} - -test "fill" { - const allocator = testing.allocator; - var vec2d = try Vec2D(i32).init(allocator, 0, .{ 2, 3 }); - defer vec2d.deinit(allocator); - - vec2d.fill(7); - - try expectEqualSlices(i32, &.{ 7, 7, 7 }, try vec2d.get(0)); - try expectEqualSlices(i32, &.{ 7, 7, 7 }, try vec2d.get(1)); -} - -test "get" { - var data = [_]i32{ 0, 1, 2, 3, 4, 5, 6, 7 }; - const vec2d = Vec2D(i32){ - .data = &data, - .cols = 2, - }; - - try expectEqualSlices(i32, &.{ 0, 1 }, try vec2d.get(0)); - try expectEqualSlices(i32, &.{ 2, 3 }, try vec2d.get(1)); - try expectEqualSlices(i32, &.{ 4, 5 }, try vec2d.get(2)); - try expectEqualSlices(i32, &.{ 6, 7 }, try vec2d.get(3)); -} - -test "getMut" { - var data = [_]i32{ 0, 1, 2, 3, 4, 5, 6, 7 }; - var vec2d = Vec2D(i32){ - .data = &data, - .cols = 2, - }; - - const row = try vec2d.getMut(1); - row[1] = 9; - - try expectEqualSlices(i32, &.{ 0, 1 }, try vec2d.get(0)); - // (1, 1) should be 9. 
- try expectEqualSlices(i32, &.{ 2, 9 }, try vec2d.get(1)); - try expectEqualSlices(i32, &.{ 4, 5 }, try vec2d.get(2)); - try expectEqualSlices(i32, &.{ 6, 7 }, try vec2d.get(3)); -} - -test "get multiplication overflow" { - const allocator = testing.allocator; - var matrix = try Vec2D(i32).init(allocator, 0, .{ 3, 4 }); - defer matrix.deinit(allocator); - - const row = (math.maxInt(usize) / 4) + 1; - try expectError(error.Overflow, matrix.get(row)); - try expectError(error.Overflow, matrix.getMut(row)); -} - -test "get addition overflow" { - const allocator = testing.allocator; - var matrix = try Vec2D(i32).init(allocator, 0, .{ 3, 5 }); - defer matrix.deinit(allocator); - - const row = math.maxInt(usize) / 5; - try expectError(error.Overflow, matrix.get(row)); - try expectError(error.Overflow, matrix.getMut(row)); -} diff --git a/lib/std/compress/lzma2.zig b/lib/std/compress/lzma2.zig index 1ccfe36584..e454e6382e 100644 --- a/lib/std/compress/lzma2.zig +++ b/lib/std/compress/lzma2.zig @@ -1,14 +1,278 @@ const std = @import("../std.zig"); const Allocator = std.mem.Allocator; +const ArrayList = std.ArrayList; +const lzma = std.compress.lzma; -pub const decode = @import("lzma2/decode.zig"); +/// An accumulating buffer for LZ sequences +pub const LzAccumBuffer = struct { + /// Buffer + buf: ArrayList(u8), + + /// Buffer memory limit + memlimit: usize, + + /// Total number of bytes sent through the buffer + len: usize, + + const Self = @This(); + + pub fn init(memlimit: usize) Self { + return Self{ + .buf = .{}, + .memlimit = memlimit, + .len = 0, + }; + } + + pub fn appendByte(self: *Self, allocator: Allocator, byte: u8) !void { + try self.buf.append(allocator, byte); + self.len += 1; + } + + /// Reset the internal dictionary + pub fn reset(self: *Self, writer: anytype) !void { + try writer.writeAll(self.buf.items); + self.buf.clearRetainingCapacity(); + self.len = 0; + } + + /// Retrieve the last byte or return a default + pub fn lastOr(self: Self, lit: u8) u8 { + 
const buf_len = self.buf.items.len; + return if (buf_len == 0) + lit + else + self.buf.items[buf_len - 1]; + } + + /// Retrieve the n-th last byte + pub fn lastN(self: Self, dist: usize) !u8 { + const buf_len = self.buf.items.len; + if (dist > buf_len) { + return error.CorruptInput; + } + + return self.buf.items[buf_len - dist]; + } + + /// Append a literal + pub fn appendLiteral( + self: *Self, + allocator: Allocator, + lit: u8, + writer: anytype, + ) !void { + _ = writer; + if (self.len >= self.memlimit) { + return error.CorruptInput; + } + try self.buf.append(allocator, lit); + self.len += 1; + } + + /// Fetch an LZ sequence (length, distance) from inside the buffer + pub fn appendLz( + self: *Self, + allocator: Allocator, + len: usize, + dist: usize, + writer: anytype, + ) !void { + _ = writer; + + const buf_len = self.buf.items.len; + if (dist > buf_len) { + return error.CorruptInput; + } + + var offset = buf_len - dist; + var i: usize = 0; + while (i < len) : (i += 1) { + const x = self.buf.items[offset]; + try self.buf.append(allocator, x); + offset += 1; + } + self.len += len; + } + + pub fn finish(self: *Self, writer: anytype) !void { + try writer.writeAll(self.buf.items); + self.buf.clearRetainingCapacity(); + } + + pub fn deinit(self: *Self, allocator: Allocator) void { + self.buf.deinit(allocator); + self.* = undefined; + } +}; + +pub const Decode = struct { + lzma_state: lzma.Decode, + + pub fn init(allocator: Allocator) !Decode { + return Decode{ + .lzma_state = try lzma.Decode.init( + allocator, + .{ + .lc = 0, + .lp = 0, + .pb = 0, + }, + null, + ), + }; + } + + pub fn deinit(self: *Decode, allocator: Allocator) void { + self.lzma_state.deinit(allocator); + self.* = undefined; + } + + pub fn decompress( + self: *Decode, + allocator: Allocator, + reader: anytype, + writer: anytype, + ) !void { + var accum = LzAccumBuffer.init(std.math.maxInt(usize)); + defer accum.deinit(allocator); + + while (true) { + const status = try reader.readByte(); + + 
switch (status) { + 0 => break, + 1 => try parseUncompressed(allocator, reader, writer, &accum, true), + 2 => try parseUncompressed(allocator, reader, writer, &accum, false), + else => try self.parseLzma(allocator, reader, writer, &accum, status), + } + } + + try accum.finish(writer); + } + + fn parseLzma( + self: *Decode, + allocator: Allocator, + reader: anytype, + writer: anytype, + accum: *LzAccumBuffer, + status: u8, + ) !void { + if (status & 0x80 == 0) { + return error.CorruptInput; + } + + const Reset = struct { + dict: bool, + state: bool, + props: bool, + }; + + const reset = switch ((status >> 5) & 0x3) { + 0 => Reset{ + .dict = false, + .state = false, + .props = false, + }, + 1 => Reset{ + .dict = false, + .state = true, + .props = false, + }, + 2 => Reset{ + .dict = false, + .state = true, + .props = true, + }, + 3 => Reset{ + .dict = true, + .state = true, + .props = true, + }, + else => unreachable, + }; + + const unpacked_size = blk: { + var tmp: u64 = status & 0x1F; + tmp <<= 16; + tmp |= try reader.readInt(u16, .big); + break :blk tmp + 1; + }; + + const packed_size = blk: { + const tmp: u17 = try reader.readInt(u16, .big); + break :blk tmp + 1; + }; + + if (reset.dict) { + try accum.reset(writer); + } + + if (reset.state) { + var new_props = self.lzma_state.lzma_props; + + if (reset.props) { + var props = try reader.readByte(); + if (props >= 225) { + return error.CorruptInput; + } + + const lc = @as(u4, @intCast(props % 9)); + props /= 9; + const lp = @as(u3, @intCast(props % 5)); + props /= 5; + const pb = @as(u3, @intCast(props)); + + if (lc + lp > 4) { + return error.CorruptInput; + } + + new_props = .{ .lc = lc, .lp = lp, .pb = pb }; + } + + try self.lzma_state.resetState(allocator, new_props); + } + + self.lzma_state.unpacked_size = unpacked_size + accum.len; + + var counter = std.io.countingReader(reader); + const counter_reader = counter.reader(); + + var rangecoder = try lzma.RangeDecoder.init(counter_reader); + while (try 
self.lzma_state.process(allocator, counter_reader, writer, accum, &rangecoder) == .continue_) {} + + if (counter.bytes_read != packed_size) { + return error.CorruptInput; + } + } + + fn parseUncompressed( + allocator: Allocator, + reader: anytype, + writer: anytype, + accum: *LzAccumBuffer, + reset_dict: bool, + ) !void { + const unpacked_size = @as(u17, try reader.readInt(u16, .big)) + 1; + + if (reset_dict) { + try accum.reset(writer); + } + + var i: @TypeOf(unpacked_size) = 0; + while (i < unpacked_size) : (i += 1) { + try accum.appendByte(allocator, try reader.readByte()); + } + } +}; pub fn decompress( allocator: Allocator, reader: anytype, writer: anytype, ) !void { - var decoder = try decode.Decoder.init(allocator); + var decoder = try Decode.init(allocator); defer decoder.deinit(allocator); return decoder.decompress(allocator, reader, writer); } diff --git a/lib/std/compress/lzma2/decode.zig b/lib/std/compress/lzma2/decode.zig deleted file mode 100644 index 938c2d437b..0000000000 --- a/lib/std/compress/lzma2/decode.zig +++ /dev/null @@ -1,169 +0,0 @@ -const std = @import("../../std.zig"); -const Allocator = std.mem.Allocator; - -const lzma = @import("../lzma.zig"); -const DecoderState = lzma.decode.DecoderState; -const LzAccumBuffer = lzma.decode.lzbuffer.LzAccumBuffer; -const Properties = lzma.decode.Properties; -const RangeDecoder = lzma.decode.rangecoder.RangeDecoder; - -pub const Decoder = struct { - lzma_state: DecoderState, - - pub fn init(allocator: Allocator) !Decoder { - return Decoder{ - .lzma_state = try DecoderState.init( - allocator, - Properties{ - .lc = 0, - .lp = 0, - .pb = 0, - }, - null, - ), - }; - } - - pub fn deinit(self: *Decoder, allocator: Allocator) void { - self.lzma_state.deinit(allocator); - self.* = undefined; - } - - pub fn decompress( - self: *Decoder, - allocator: Allocator, - reader: anytype, - writer: anytype, - ) !void { - var accum = LzAccumBuffer.init(std.math.maxInt(usize)); - defer accum.deinit(allocator); - - while 
(true) { - const status = try reader.readByte(); - - switch (status) { - 0 => break, - 1 => try parseUncompressed(allocator, reader, writer, &accum, true), - 2 => try parseUncompressed(allocator, reader, writer, &accum, false), - else => try self.parseLzma(allocator, reader, writer, &accum, status), - } - } - - try accum.finish(writer); - } - - fn parseLzma( - self: *Decoder, - allocator: Allocator, - reader: anytype, - writer: anytype, - accum: *LzAccumBuffer, - status: u8, - ) !void { - if (status & 0x80 == 0) { - return error.CorruptInput; - } - - const Reset = struct { - dict: bool, - state: bool, - props: bool, - }; - - const reset = switch ((status >> 5) & 0x3) { - 0 => Reset{ - .dict = false, - .state = false, - .props = false, - }, - 1 => Reset{ - .dict = false, - .state = true, - .props = false, - }, - 2 => Reset{ - .dict = false, - .state = true, - .props = true, - }, - 3 => Reset{ - .dict = true, - .state = true, - .props = true, - }, - else => unreachable, - }; - - const unpacked_size = blk: { - var tmp: u64 = status & 0x1F; - tmp <<= 16; - tmp |= try reader.readInt(u16, .big); - break :blk tmp + 1; - }; - - const packed_size = blk: { - const tmp: u17 = try reader.readInt(u16, .big); - break :blk tmp + 1; - }; - - if (reset.dict) { - try accum.reset(writer); - } - - if (reset.state) { - var new_props = self.lzma_state.lzma_props; - - if (reset.props) { - var props = try reader.readByte(); - if (props >= 225) { - return error.CorruptInput; - } - - const lc = @as(u4, @intCast(props % 9)); - props /= 9; - const lp = @as(u3, @intCast(props % 5)); - props /= 5; - const pb = @as(u3, @intCast(props)); - - if (lc + lp > 4) { - return error.CorruptInput; - } - - new_props = Properties{ .lc = lc, .lp = lp, .pb = pb }; - } - - try self.lzma_state.resetState(allocator, new_props); - } - - self.lzma_state.unpacked_size = unpacked_size + accum.len; - - var counter = std.io.countingReader(reader); - const counter_reader = counter.reader(); - - var rangecoder = try 
RangeDecoder.init(counter_reader); - while (try self.lzma_state.process(allocator, counter_reader, writer, accum, &rangecoder) == .continue_) {} - - if (counter.bytes_read != packed_size) { - return error.CorruptInput; - } - } - - fn parseUncompressed( - allocator: Allocator, - reader: anytype, - writer: anytype, - accum: *LzAccumBuffer, - reset_dict: bool, - ) !void { - const unpacked_size = @as(u17, try reader.readInt(u16, .big)) + 1; - - if (reset_dict) { - try accum.reset(writer); - } - - var i: @TypeOf(unpacked_size) = 0; - while (i < unpacked_size) : (i += 1) { - try accum.appendByte(allocator, try reader.readByte()); - } - } -}; From 6464e0d4fc9937e154c34567891bae84c63732b9 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 22 Aug 2025 14:14:26 -0700 Subject: [PATCH 02/12] std.compress.xz: flatten namespaces --- lib/std/compress/xz.zig | 211 +++++++++++++++++++++++++++++++++- lib/std/compress/xz/block.zig | 208 --------------------------------- 2 files changed, 207 insertions(+), 212 deletions(-) delete mode 100644 lib/std/compress/xz/block.zig diff --git a/lib/std/compress/xz.zig b/lib/std/compress/xz.zig index 6c99e9f427..54a6a38506 100644 --- a/lib/std/compress/xz.zig +++ b/lib/std/compress/xz.zig @@ -1,7 +1,10 @@ const std = @import("std"); -const block = @import("xz/block.zig"); const Allocator = std.mem.Allocator; +const ArrayList = std.ArrayList; const Crc32 = std.hash.Crc32; +const Crc64 = std.hash.crc.Crc64Xz; +const Sha256 = std.crypto.hash.sha2.Sha256; +const lzma2 = std.compress.lzma2; pub const Check = enum(u4) { none = 0x00, @@ -27,11 +30,11 @@ pub fn Decompress(comptime ReaderType: type) type { return struct { const Self = @This(); - pub const Error = ReaderType.Error || block.Decoder(ReaderType).Error; + pub const Error = ReaderType.Error || Decoder(ReaderType).Error; pub const Reader = std.io.GenericReader(*Self, Error, read); allocator: Allocator, - block_decoder: block.Decoder(ReaderType), + block_decoder: Decoder(ReaderType), 
in_reader: ReaderType, fn init(allocator: Allocator, source: ReaderType) !Self { @@ -52,7 +55,7 @@ pub fn Decompress(comptime ReaderType: type) type { return Self{ .allocator = allocator, - .block_decoder = try block.decoder(allocator, source, check), + .block_decoder = try decoder(allocator, source, check), .in_reader = source, }; } @@ -161,6 +164,206 @@ pub fn hashedReader( return .{ .child_reader = reader, .hasher = hasher }; } +const DecodeError = error{ + CorruptInput, + EndOfStream, + EndOfStreamWithNoError, + WrongChecksum, + Unsupported, + Overflow, +}; + +pub fn decoder(allocator: Allocator, reader: anytype, check: Check) !Decoder(@TypeOf(reader)) { + return Decoder(@TypeOf(reader)).init(allocator, reader, check); +} + +pub fn Decoder(comptime ReaderType: type) type { + return struct { + const Self = @This(); + pub const Error = + ReaderType.Error || + DecodeError || + Allocator.Error; + pub const Reader = std.io.GenericReader(*Self, Error, read); + + allocator: Allocator, + inner_reader: ReaderType, + check: Check, + err: ?Error, + to_read: ArrayList(u8), + read_pos: usize, + block_count: usize, + + fn init(allocator: Allocator, in_reader: ReaderType, check: Check) !Self { + return Self{ + .allocator = allocator, + .inner_reader = in_reader, + .check = check, + .err = null, + .to_read = .{}, + .read_pos = 0, + .block_count = 0, + }; + } + + pub fn deinit(self: *Self) void { + self.to_read.deinit(self.allocator); + } + + pub fn reader(self: *Self) Reader { + return .{ .context = self }; + } + + pub fn read(self: *Self, output: []u8) Error!usize { + while (true) { + const unread_len = self.to_read.items.len - self.read_pos; + if (unread_len > 0) { + const n = @min(unread_len, output.len); + @memcpy(output[0..n], self.to_read.items[self.read_pos..][0..n]); + self.read_pos += n; + return n; + } + if (self.err) |e| { + if (e == DecodeError.EndOfStreamWithNoError) { + return 0; + } + return e; + } + if (self.read_pos > 0) { + 
self.to_read.shrinkRetainingCapacity(0); + self.read_pos = 0; + } + self.readBlock() catch |e| { + self.err = e; + }; + } + } + + fn readBlock(self: *Self) Error!void { + var block_counter = std.io.countingReader(self.inner_reader); + const block_reader = block_counter.reader(); + + var packed_size: ?u64 = null; + var unpacked_size: ?u64 = null; + + // Block Header + { + var header_hasher = hashedReader(block_reader, Crc32.init()); + const header_reader = header_hasher.reader(); + + const header_size = @as(u64, try header_reader.readByte()) * 4; + if (header_size == 0) + return error.EndOfStreamWithNoError; + + const Flags = packed struct(u8) { + last_filter_index: u2, + reserved: u4, + has_packed_size: bool, + has_unpacked_size: bool, + }; + + const flags = @as(Flags, @bitCast(try header_reader.readByte())); + const filter_count = @as(u3, flags.last_filter_index) + 1; + if (filter_count > 1) + return error.Unsupported; + + if (flags.has_packed_size) + packed_size = try std.leb.readUleb128(u64, header_reader); + + if (flags.has_unpacked_size) + unpacked_size = try std.leb.readUleb128(u64, header_reader); + + const FilterId = enum(u64) { + lzma2 = 0x21, + _, + }; + + const filter_id = @as( + FilterId, + @enumFromInt(try std.leb.readUleb128(u64, header_reader)), + ); + + if (@intFromEnum(filter_id) >= 0x4000_0000_0000_0000) + return error.CorruptInput; + + if (filter_id != .lzma2) + return error.Unsupported; + + const properties_size = try std.leb.readUleb128(u64, header_reader); + if (properties_size != 1) + return error.CorruptInput; + + // TODO: use filter properties + _ = try header_reader.readByte(); + + while (block_counter.bytes_read != header_size) { + if (try header_reader.readByte() != 0) + return error.CorruptInput; + } + + const hash_a = header_hasher.hasher.final(); + const hash_b = try header_reader.readInt(u32, .little); + if (hash_a != hash_b) + return error.WrongChecksum; + } + + // Compressed Data + var packed_counter = 
std.io.countingReader(block_reader); + try lzma2.decompress( + self.allocator, + packed_counter.reader(), + self.to_read.writer(self.allocator), + ); + + if (packed_size) |s| { + if (s != packed_counter.bytes_read) + return error.CorruptInput; + } + + const unpacked_bytes = self.to_read.items; + if (unpacked_size) |s| { + if (s != unpacked_bytes.len) + return error.CorruptInput; + } + + // Block Padding + while (block_counter.bytes_read % 4 != 0) { + if (try block_reader.readByte() != 0) + return error.CorruptInput; + } + + switch (self.check) { + .none => {}, + .crc32 => { + const hash_a = Crc32.hash(unpacked_bytes); + const hash_b = try self.inner_reader.readInt(u32, .little); + if (hash_a != hash_b) + return error.WrongChecksum; + }, + .crc64 => { + const hash_a = Crc64.hash(unpacked_bytes); + const hash_b = try self.inner_reader.readInt(u64, .little); + if (hash_a != hash_b) + return error.WrongChecksum; + }, + .sha256 => { + var hash_a: [Sha256.digest_length]u8 = undefined; + Sha256.hash(unpacked_bytes, &hash_a, .{}); + + var hash_b: [Sha256.digest_length]u8 = undefined; + try self.inner_reader.readNoEof(&hash_b); + + if (!std.mem.eql(u8, &hash_a, &hash_b)) + return error.WrongChecksum; + }, + else => return error.Unsupported, + } + + self.block_count += 1; + } + }; +} + test { _ = @import("xz/test.zig"); } diff --git a/lib/std/compress/xz/block.zig b/lib/std/compress/xz/block.zig deleted file mode 100644 index 505dc543a8..0000000000 --- a/lib/std/compress/xz/block.zig +++ /dev/null @@ -1,208 +0,0 @@ -const std = @import("../../std.zig"); -const lzma2 = std.compress.lzma2; -const Allocator = std.mem.Allocator; -const ArrayListUnmanaged = std.ArrayListUnmanaged; -const Crc32 = std.hash.Crc32; -const Crc64 = std.hash.crc.Crc64Xz; -const Sha256 = std.crypto.hash.sha2.Sha256; -const xz = std.compress.xz; - -const DecodeError = error{ - CorruptInput, - EndOfStream, - EndOfStreamWithNoError, - WrongChecksum, - Unsupported, - Overflow, -}; - -pub fn 
decoder(allocator: Allocator, reader: anytype, check: xz.Check) !Decoder(@TypeOf(reader)) { - return Decoder(@TypeOf(reader)).init(allocator, reader, check); -} - -pub fn Decoder(comptime ReaderType: type) type { - return struct { - const Self = @This(); - pub const Error = - ReaderType.Error || - DecodeError || - Allocator.Error; - pub const Reader = std.io.GenericReader(*Self, Error, read); - - allocator: Allocator, - inner_reader: ReaderType, - check: xz.Check, - err: ?Error, - to_read: ArrayListUnmanaged(u8), - read_pos: usize, - block_count: usize, - - fn init(allocator: Allocator, in_reader: ReaderType, check: xz.Check) !Self { - return Self{ - .allocator = allocator, - .inner_reader = in_reader, - .check = check, - .err = null, - .to_read = .{}, - .read_pos = 0, - .block_count = 0, - }; - } - - pub fn deinit(self: *Self) void { - self.to_read.deinit(self.allocator); - } - - pub fn reader(self: *Self) Reader { - return .{ .context = self }; - } - - pub fn read(self: *Self, output: []u8) Error!usize { - while (true) { - const unread_len = self.to_read.items.len - self.read_pos; - if (unread_len > 0) { - const n = @min(unread_len, output.len); - @memcpy(output[0..n], self.to_read.items[self.read_pos..][0..n]); - self.read_pos += n; - return n; - } - if (self.err) |e| { - if (e == DecodeError.EndOfStreamWithNoError) { - return 0; - } - return e; - } - if (self.read_pos > 0) { - self.to_read.shrinkRetainingCapacity(0); - self.read_pos = 0; - } - self.readBlock() catch |e| { - self.err = e; - }; - } - } - - fn readBlock(self: *Self) Error!void { - var block_counter = std.io.countingReader(self.inner_reader); - const block_reader = block_counter.reader(); - - var packed_size: ?u64 = null; - var unpacked_size: ?u64 = null; - - // Block Header - { - var header_hasher = xz.hashedReader(block_reader, Crc32.init()); - const header_reader = header_hasher.reader(); - - const header_size = @as(u64, try header_reader.readByte()) * 4; - if (header_size == 0) - return 
error.EndOfStreamWithNoError; - - const Flags = packed struct(u8) { - last_filter_index: u2, - reserved: u4, - has_packed_size: bool, - has_unpacked_size: bool, - }; - - const flags = @as(Flags, @bitCast(try header_reader.readByte())); - const filter_count = @as(u3, flags.last_filter_index) + 1; - if (filter_count > 1) - return error.Unsupported; - - if (flags.has_packed_size) - packed_size = try std.leb.readUleb128(u64, header_reader); - - if (flags.has_unpacked_size) - unpacked_size = try std.leb.readUleb128(u64, header_reader); - - const FilterId = enum(u64) { - lzma2 = 0x21, - _, - }; - - const filter_id = @as( - FilterId, - @enumFromInt(try std.leb.readUleb128(u64, header_reader)), - ); - - if (@intFromEnum(filter_id) >= 0x4000_0000_0000_0000) - return error.CorruptInput; - - if (filter_id != .lzma2) - return error.Unsupported; - - const properties_size = try std.leb.readUleb128(u64, header_reader); - if (properties_size != 1) - return error.CorruptInput; - - // TODO: use filter properties - _ = try header_reader.readByte(); - - while (block_counter.bytes_read != header_size) { - if (try header_reader.readByte() != 0) - return error.CorruptInput; - } - - const hash_a = header_hasher.hasher.final(); - const hash_b = try header_reader.readInt(u32, .little); - if (hash_a != hash_b) - return error.WrongChecksum; - } - - // Compressed Data - var packed_counter = std.io.countingReader(block_reader); - try lzma2.decompress( - self.allocator, - packed_counter.reader(), - self.to_read.writer(self.allocator), - ); - - if (packed_size) |s| { - if (s != packed_counter.bytes_read) - return error.CorruptInput; - } - - const unpacked_bytes = self.to_read.items; - if (unpacked_size) |s| { - if (s != unpacked_bytes.len) - return error.CorruptInput; - } - - // Block Padding - while (block_counter.bytes_read % 4 != 0) { - if (try block_reader.readByte() != 0) - return error.CorruptInput; - } - - switch (self.check) { - .none => {}, - .crc32 => { - const hash_a = 
Crc32.hash(unpacked_bytes); - const hash_b = try self.inner_reader.readInt(u32, .little); - if (hash_a != hash_b) - return error.WrongChecksum; - }, - .crc64 => { - const hash_a = Crc64.hash(unpacked_bytes); - const hash_b = try self.inner_reader.readInt(u64, .little); - if (hash_a != hash_b) - return error.WrongChecksum; - }, - .sha256 => { - var hash_a: [Sha256.digest_length]u8 = undefined; - Sha256.hash(unpacked_bytes, &hash_a, .{}); - - var hash_b: [Sha256.digest_length]u8 = undefined; - try self.inner_reader.readNoEof(&hash_b); - - if (!std.mem.eql(u8, &hash_a, &hash_b)) - return error.WrongChecksum; - }, - else => return error.Unsupported, - } - - self.block_count += 1; - } - }; -} From 58e60697e2930f4311ae9e744ae1c2877e0b69ed Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Mon, 25 Aug 2025 18:03:48 -0700 Subject: [PATCH 03/12] std.compress.lzma: update for new I/O API --- lib/std/compress/lzma.zig | 456 ++++++++++++++--------------- lib/std/compress/lzma/test.zig | 26 +- lib/std/compress/lzma2.zig | 83 +++--- lib/std/compress/xz.zig | 366 +---------------------- lib/std/compress/xz/Decompress.zig | 288 ++++++++++++++++++ 5 files changed, 561 insertions(+), 658 deletions(-) create mode 100644 lib/std/compress/xz/Decompress.zig diff --git a/lib/std/compress/lzma.zig b/lib/std/compress/lzma.zig index c40eeeb56e..597e97a94f 100644 --- a/lib/std/compress/lzma.zig +++ b/lib/std/compress/lzma.zig @@ -4,49 +4,34 @@ const mem = std.mem; const Allocator = std.mem.Allocator; const assert = std.debug.assert; const ArrayList = std.ArrayList; +const Writer = std.Io.Writer; +const Reader = std.Io.Reader; pub const RangeDecoder = struct { range: u32, code: u32, - pub fn init(reader: anytype) !RangeDecoder { - const reserved = try reader.readByte(); - if (reserved != 0) { - return error.CorruptInput; - } - return RangeDecoder{ - .range = 0xFFFF_FFFF, - .code = try reader.readInt(u32, .big), - }; - } - - pub fn fromParts( - range: u32, - code: u32, - ) RangeDecoder { + pub 
fn init(reader: *Reader) !RangeDecoder { + const reserved = try reader.takeByte(); + if (reserved != 0) return error.InvalidRangeCode; return .{ - .range = range, - .code = code, + .range = 0xFFFF_FFFF, + .code = try reader.takeInt(u32, .big), }; } - pub fn set(self: *RangeDecoder, range: u32, code: u32) void { - self.range = range; - self.code = code; - } - - pub inline fn isFinished(self: RangeDecoder) bool { + pub fn isFinished(self: RangeDecoder) bool { return self.code == 0; } - inline fn normalize(self: *RangeDecoder, reader: anytype) !void { + fn normalize(self: *RangeDecoder, reader: *Reader) !void { if (self.range < 0x0100_0000) { self.range <<= 8; - self.code = (self.code << 8) ^ @as(u32, try reader.readByte()); + self.code = (self.code << 8) ^ @as(u32, try reader.takeByte()); } } - inline fn getBit(self: *RangeDecoder, reader: anytype) !bool { + fn getBit(self: *RangeDecoder, reader: *Reader) !bool { self.range >>= 1; const bit = self.code >= self.range; @@ -57,7 +42,7 @@ pub const RangeDecoder = struct { return bit; } - pub fn get(self: *RangeDecoder, reader: anytype, count: usize) !u32 { + pub fn get(self: *RangeDecoder, reader: *Reader, count: usize) !u32 { var result: u32 = 0; var i: usize = 0; while (i < count) : (i += 1) @@ -65,7 +50,7 @@ pub const RangeDecoder = struct { return result; } - pub inline fn decodeBit(self: *RangeDecoder, reader: anytype, prob: *u16, update: bool) !bool { + pub fn decodeBit(self: *RangeDecoder, reader: *Reader, prob: *u16, update: bool) !bool { const bound = (self.range >> 11) * prob.*; if (self.code < bound) { @@ -88,7 +73,7 @@ pub const RangeDecoder = struct { fn parseBitTree( self: *RangeDecoder, - reader: anytype, + reader: *Reader, num_bits: u5, probs: []u16, update: bool, @@ -104,7 +89,7 @@ pub const RangeDecoder = struct { pub fn parseReverseBitTree( self: *RangeDecoder, - reader: anytype, + reader: *Reader, num_bits: u5, probs: []u16, offset: usize, @@ -123,7 +108,7 @@ pub const RangeDecoder = struct { }; pub 
const Decode = struct { - lzma_props: Properties, + properties: Properties, unpacked_size: ?u64, literal_probs: Vec2d, pos_slot_decoder: [4]BitTree(6), @@ -141,14 +126,14 @@ pub const Decode = struct { rep_len_decoder: LenDecoder, pub fn init( - allocator: Allocator, - lzma_props: Properties, + gpa: Allocator, + properties: Properties, unpacked_size: ?u64, ) !Decode { return .{ - .lzma_props = lzma_props, + .properties = properties, .unpacked_size = unpacked_size, - .literal_probs = try Vec2d.init(allocator, 0x400, .{ @as(usize, 1) << (lzma_props.lc + lzma_props.lp), 0x300 }), + .literal_probs = try Vec2d.init(gpa, 0x400, .{ @as(usize, 1) << (properties.lc + properties.lp), 0x300 }), .pos_slot_decoder = @splat(.{}), .align_decoder = .{}, .pos_decoders = @splat(0x400), @@ -165,21 +150,21 @@ pub const Decode = struct { }; } - pub fn deinit(self: *Decode, allocator: Allocator) void { - self.literal_probs.deinit(allocator); + pub fn deinit(self: *Decode, gpa: Allocator) void { + self.literal_probs.deinit(gpa); self.* = undefined; } - pub fn resetState(self: *Decode, allocator: Allocator, new_props: Properties) !void { + pub fn resetState(self: *Decode, gpa: Allocator, new_props: Properties) !void { new_props.validate(); - if (self.lzma_props.lc + self.lzma_props.lp == new_props.lc + new_props.lp) { + if (self.properties.lc + self.properties.lp == new_props.lc + new_props.lp) { self.literal_probs.fill(0x400); } else { - self.literal_probs.deinit(allocator); - self.literal_probs = try Vec2d.init(allocator, 0x400, .{ @as(usize, 1) << (new_props.lc + new_props.lp), 0x300 }); + self.literal_probs.deinit(gpa); + self.literal_probs = try Vec2d.init(gpa, 0x400, .{ @as(usize, 1) << (new_props.lc + new_props.lp), 0x300 }); } - self.lzma_props = new_props; + self.properties = new_props; for (&self.pos_slot_decoder) |*t| t.reset(); self.align_decoder.reset(); self.pos_decoders = @splat(0x400); @@ -195,26 +180,23 @@ pub const Decode = struct { self.rep_len_decoder.reset(); } - fn 
processNextInner( + fn processNext( self: *Decode, - allocator: Allocator, - reader: anytype, - writer: anytype, - buffer: anytype, + reader: *Reader, + allocating: *Writer.Allocating, + buffer: *CircularBuffer, decoder: *RangeDecoder, update: bool, ) !ProcessingStatus { - const pos_state = buffer.len & ((@as(usize, 1) << self.lzma_props.pb) - 1); + const gpa = allocating.allocator; + const writer = &allocating.writer; + const pos_state = buffer.len & ((@as(usize, 1) << self.properties.pb) - 1); - if (!try decoder.decodeBit( - reader, - &self.is_match[(self.state << 4) + pos_state], - update, - )) { + if (!try decoder.decodeBit(reader, &self.is_match[(self.state << 4) + pos_state], update)) { const byte: u8 = try self.decodeLiteral(reader, buffer, decoder, update); if (update) { - try buffer.appendLiteral(allocator, byte, writer); + try buffer.appendLiteral(gpa, byte, writer); self.state = if (self.state < 4) 0 @@ -223,7 +205,7 @@ pub const Decode = struct { else self.state - 6; } - return .continue_; + return .more; } var len: usize = undefined; @@ -237,9 +219,9 @@ pub const Decode = struct { if (update) { self.state = if (self.state < 7) 9 else 11; const dist = self.rep[0] + 1; - try buffer.appendLz(allocator, 1, dist, writer); + try buffer.appendLz(gpa, 1, dist, writer); } - return .continue_; + return .more; } } else { const idx: usize = if (!try decoder.decodeBit(reader, &self.is_rep_g1[self.state], update)) @@ -293,31 +275,19 @@ pub const Decode = struct { len += 2; const dist = self.rep[0] + 1; - try buffer.appendLz(allocator, len, dist, writer); + try buffer.appendLz(gpa, len, dist, writer); } - return .continue_; - } - - fn processNext( - self: *Decode, - allocator: Allocator, - reader: anytype, - writer: anytype, - buffer: anytype, - decoder: *RangeDecoder, - ) !ProcessingStatus { - return self.processNextInner(allocator, reader, writer, buffer, decoder, true); + return .more; } pub fn process( self: *Decode, - allocator: Allocator, - reader: anytype, - 
writer: anytype, - buffer: anytype, + reader: *Reader, + allocating: *Writer.Allocating, + buffer: *CircularBuffer, decoder: *RangeDecoder, - ) !ProcessingStatus { + ) !void { process_next: { if (self.unpacked_size) |unpacked_size| { if (buffer.len >= unpacked_size) { @@ -326,26 +296,24 @@ pub const Decode = struct { } else if (decoder.isFinished()) { break :process_next; } - - switch (try self.processNext(allocator, reader, writer, buffer, decoder)) { - .continue_ => return .continue_, - .finished => break :process_next, + switch (try self.processNext(reader, allocating, buffer, decoder, true)) { + .more => return, + .finished => {}, } } if (self.unpacked_size) |unpacked_size| { - if (buffer.len != unpacked_size) { - return error.CorruptInput; - } + if (buffer.len != unpacked_size) return error.DecompressedSizeMismatch; } - return .finished; + try buffer.finish(&allocating.writer); + self.state = math.maxInt(usize); } fn decodeLiteral( self: *Decode, - reader: anytype, - buffer: anytype, + reader: *Reader, + buffer: *CircularBuffer, decoder: *RangeDecoder, update: bool, ) !u8 { @@ -353,9 +321,9 @@ pub const Decode = struct { const prev_byte = @as(usize, buffer.lastOr(def_prev_byte)); var result: usize = 1; - const lit_state = ((buffer.len & ((@as(usize, 1) << self.lzma_props.lp) - 1)) << self.lzma_props.lc) + - (prev_byte >> (8 - self.lzma_props.lc)); - const probs = try self.literal_probs.getMut(lit_state); + const lit_state = ((buffer.len & ((@as(usize, 1) << self.properties.lp) - 1)) << self.properties.lc) + + (prev_byte >> (8 - self.properties.lc)); + const probs = try self.literal_probs.get(lit_state); if (self.state >= 7) { var match_byte = @as(usize, try buffer.lastN(self.rep[0] + 1)); @@ -384,7 +352,7 @@ pub const Decode = struct { fn decodeDistance( self: *Decode, - reader: anytype, + reader: *Reader, decoder: *RangeDecoder, length: usize, update: bool, @@ -415,46 +383,40 @@ pub const Decode = struct { } /// A circular buffer for LZ sequences - pub const 
LzCircularBuffer = struct { + pub const CircularBuffer = struct { /// Circular buffer buf: ArrayList(u8), - /// Length of the buffer dict_size: usize, - /// Buffer memory limit - memlimit: usize, - + mem_limit: usize, /// Current position cursor: usize, - /// Total number of bytes sent through the buffer len: usize, - const Self = @This(); - - pub fn init(dict_size: usize, memlimit: usize) Self { - return Self{ + pub fn init(dict_size: usize, mem_limit: usize) CircularBuffer { + return .{ .buf = .{}, .dict_size = dict_size, - .memlimit = memlimit, + .mem_limit = mem_limit, .cursor = 0, .len = 0, }; } - pub fn get(self: Self, index: usize) u8 { + pub fn get(self: CircularBuffer, index: usize) u8 { return if (0 <= index and index < self.buf.items.len) self.buf.items[index] else 0; } - pub fn set(self: *Self, allocator: Allocator, index: usize, value: u8) !void { - if (index >= self.memlimit) { + pub fn set(self: *CircularBuffer, gpa: Allocator, index: usize, value: u8) !void { + if (index >= self.mem_limit) { return error.CorruptInput; } - try self.buf.ensureTotalCapacity(allocator, index + 1); + try self.buf.ensureTotalCapacity(gpa, index + 1); while (self.buf.items.len < index) { self.buf.appendAssumeCapacity(0); } @@ -462,7 +424,7 @@ pub const Decode = struct { } /// Retrieve the last byte or return a default - pub fn lastOr(self: Self, lit: u8) u8 { + pub fn lastOr(self: CircularBuffer, lit: u8) u8 { return if (self.len == 0) lit else @@ -470,7 +432,7 @@ pub const Decode = struct { } /// Retrieve the n-th last byte - pub fn lastN(self: Self, dist: usize) !u8 { + pub fn lastN(self: CircularBuffer, dist: usize) !u8 { if (dist > self.dict_size or dist > self.len) { return error.CorruptInput; } @@ -481,12 +443,12 @@ pub const Decode = struct { /// Append a literal pub fn appendLiteral( - self: *Self, - allocator: Allocator, + self: *CircularBuffer, + gpa: Allocator, lit: u8, - writer: anytype, + writer: *Writer, ) !void { - try self.set(allocator, self.cursor, lit); 
+ try self.set(gpa, self.cursor, lit); self.cursor += 1; self.len += 1; @@ -499,11 +461,11 @@ pub const Decode = struct { /// Fetch an LZ sequence (length, distance) from inside the buffer pub fn appendLz( - self: *Self, - allocator: Allocator, + self: *CircularBuffer, + gpa: Allocator, len: usize, dist: usize, - writer: anytype, + writer: *Writer, ) !void { if (dist > self.dict_size or dist > self.len) { return error.CorruptInput; @@ -513,7 +475,7 @@ pub const Decode = struct { var i: usize = 0; while (i < len) : (i += 1) { const x = self.get(offset); - try self.appendLiteral(allocator, x, writer); + try self.appendLiteral(gpa, x, writer); offset += 1; if (offset == self.dict_size) { offset = 0; @@ -521,15 +483,15 @@ pub const Decode = struct { } } - pub fn finish(self: *Self, writer: anytype) !void { + pub fn finish(self: *CircularBuffer, writer: *Writer) !void { if (self.cursor > 0) { try writer.writeAll(self.buf.items[0..self.cursor]); self.cursor = 0; } } - pub fn deinit(self: *Self, allocator: Allocator) void { - self.buf.deinit(allocator); + pub fn deinit(self: *CircularBuffer, gpa: Allocator) void { + self.buf.deinit(gpa); self.* = undefined; } }; @@ -538,11 +500,9 @@ pub const Decode = struct { return struct { probs: [1 << num_bits]u16 = @splat(0x400), - const Self = @This(); - pub fn parse( - self: *Self, - reader: anytype, + self: *@This(), + reader: *Reader, decoder: *RangeDecoder, update: bool, ) !u32 { @@ -550,15 +510,15 @@ pub const Decode = struct { } pub fn parseReverse( - self: *Self, - reader: anytype, + self: *@This(), + reader: *Reader, decoder: *RangeDecoder, update: bool, ) !u32 { return decoder.parseReverseBitTree(reader, num_bits, &self.probs, 0, update); } - pub fn reset(self: *Self) void { + pub fn reset(self: *@This()) void { @memset(&self.probs, 0x400); } }; @@ -573,7 +533,7 @@ pub const Decode = struct { pub fn decode( self: *LenDecoder, - reader: anytype, + reader: *Reader, decoder: *RangeDecoder, pos_state: usize, update: bool, @@ 
-600,45 +560,35 @@ pub const Decode = struct { data: []u16, cols: usize, - const Self = @This(); - - pub fn init(allocator: Allocator, value: u16, size: struct { usize, usize }) !Self { + pub fn init(gpa: Allocator, value: u16, size: struct { usize, usize }) !Vec2d { const len = try math.mul(usize, size[0], size[1]); - const data = try allocator.alloc(u16, len); + const data = try gpa.alloc(u16, len); @memset(data, value); - return Self{ + return .{ .data = data, .cols = size[1], }; } - pub fn deinit(self: *Self, allocator: Allocator) void { - allocator.free(self.data); + pub fn deinit(self: *Vec2d, gpa: Allocator) void { + gpa.free(self.data); self.* = undefined; } - pub fn fill(self: *Self, value: u16) void { + pub fn fill(self: *Vec2d, value: u16) void { @memset(self.data, value); } - inline fn _get(self: Self, row: usize) ![]u16 { + fn get(self: Vec2d, row: usize) ![]u16 { const start_row = try math.mul(usize, row, self.cols); const end_row = try math.add(usize, start_row, self.cols); return self.data[start_row..end_row]; } - - pub fn get(self: Self, row: usize) ![]const u16 { - return self._get(row); - } - - pub fn getMut(self: *Self, row: usize) ![]u16 { - return self._get(row); - } }; pub const Options = struct { unpacked_size: UnpackedSize = .read_from_header, - memlimit: ?usize = null, + mem_limit: ?usize = null, allow_incomplete: bool = false, }; @@ -649,7 +599,7 @@ pub const Decode = struct { }; const ProcessingStatus = enum { - continue_, + more, finished, }; @@ -670,39 +620,34 @@ pub const Decode = struct { dict_size: u32, unpacked_size: ?u64, - pub fn readHeader(reader: anytype, options: Options) !Params { - var props = try reader.readByte(); - if (props >= 225) { - return error.CorruptInput; - } + pub fn readHeader(reader: *Reader, options: Options) !Params { + var props = try reader.takeByte(); + if (props >= 225) return error.CorruptInput; - const lc = @as(u4, @intCast(props % 9)); + const lc: u4 = @intCast(props % 9); props /= 9; - const lp = 
@as(u3, @intCast(props % 5)); + const lp: u3 = @intCast(props % 5); props /= 5; - const pb = @as(u3, @intCast(props)); + const pb: u3 = @intCast(props); - const dict_size_provided = try reader.readInt(u32, .little); + const dict_size_provided = try reader.takeInt(u32, .little); const dict_size = @max(0x1000, dict_size_provided); const unpacked_size = switch (options.unpacked_size) { .read_from_header => blk: { - const unpacked_size_provided = try reader.readInt(u64, .little); + const unpacked_size_provided = try reader.takeInt(u64, .little); const marker_mandatory = unpacked_size_provided == 0xFFFF_FFFF_FFFF_FFFF; - break :blk if (marker_mandatory) - null - else - unpacked_size_provided; + break :blk if (marker_mandatory) null else unpacked_size_provided; }, .read_header_but_use_provided => |x| blk: { - _ = try reader.readInt(u64, .little); + _ = try reader.takeInt(u64, .little); break :blk x; }, .use_provided => |x| x, }; - return Params{ - .properties = Properties{ .lc = lc, .lp = lp, .pb = pb }, + return .{ + .properties = .{ .lc = lc, .lp = lp, .pb = pb }, .dict_size = dict_size, .unpacked_size = unpacked_size, }; @@ -710,84 +655,121 @@ pub const Decode = struct { }; }; -pub fn decompress( - allocator: Allocator, - reader: anytype, -) !Decompress(@TypeOf(reader)) { - return decompressWithOptions(allocator, reader, .{}); -} +pub const Decompress = struct { + gpa: Allocator, + input: *Reader, + reader: Reader, + buffer: Decode.CircularBuffer, + range_decoder: RangeDecoder, + decode: Decode, + err: ?Error, -pub fn decompressWithOptions( - allocator: Allocator, - reader: anytype, - options: Decode.Options, -) !Decompress(@TypeOf(reader)) { - const params = try Decode.Params.readHeader(reader, options); - return Decompress(@TypeOf(reader)).init(allocator, reader, params, options.memlimit); -} - -pub fn Decompress(comptime ReaderType: type) type { - return struct { - const Self = @This(); - - pub const Error = - ReaderType.Error || - Allocator.Error || - error{ 
CorruptInput, EndOfStream, Overflow }; - - pub const Reader = std.io.GenericReader(*Self, Error, read); - - allocator: Allocator, - in_reader: ReaderType, - to_read: std.ArrayListUnmanaged(u8), - - buffer: Decode.LzCircularBuffer, - decoder: RangeDecoder, - state: Decode, - - pub fn init(allocator: Allocator, source: ReaderType, params: Decode.Params, memlimit: ?usize) !Self { - return Self{ - .allocator = allocator, - .in_reader = source, - .to_read = .{}, - - .buffer = Decode.LzCircularBuffer.init(params.dict_size, memlimit orelse math.maxInt(usize)), - .decoder = try RangeDecoder.init(source), - .state = try Decode.init(allocator, params.properties, params.unpacked_size), - }; - } - - pub fn reader(self: *Self) Reader { - return .{ .context = self }; - } - - pub fn deinit(self: *Self) void { - self.to_read.deinit(self.allocator); - self.buffer.deinit(self.allocator); - self.state.deinit(self.allocator); - self.* = undefined; - } - - pub fn read(self: *Self, output: []u8) Error!usize { - const writer = self.to_read.writer(self.allocator); - while (self.to_read.items.len < output.len) { - switch (try self.state.process(self.allocator, self.in_reader, writer, &self.buffer, &self.decoder)) { - .continue_ => {}, - .finished => { - try self.buffer.finish(writer); - break; - }, - } - } - const input = self.to_read.items; - const n = @min(input.len, output.len); - @memcpy(output[0..n], input[0..n]); - std.mem.copyForwards(u8, input[0 .. input.len - n], input[n..]); - self.to_read.shrinkRetainingCapacity(input.len - n); - return n; - } + pub const Error = error{ + OutOfMemory, + ReadFailed, + CorruptInput, + DecompressedSizeMismatch, + EndOfStream, + Overflow, }; -} + + /// Takes ownership of `buffer` which may be resized with `gpa`. + /// + /// LZMA was explicitly designed to take advantage of large heap memory + /// being available, with a dictionary size anywhere from 4K to 4G. Thus, + /// this API dynamically allocates the dictionary as-needed. 
+ pub fn initParams( + input: *Reader, + gpa: Allocator, + buffer: []u8, + params: Decode.Params, + mem_limit: usize, + ) !Decompress { + return .{ + .gpa = gpa, + .input = input, + .buffer = Decode.CircularBuffer.init(params.dict_size, mem_limit), + .range_decoder = try RangeDecoder.init(input), + .decode = try Decode.init(gpa, params.properties, params.unpacked_size), + .reader = .{ + .buffer = buffer, + .vtable = &.{ + .readVec = readVec, + .stream = stream, + }, + .seek = 0, + .end = 0, + }, + .err = null, + }; + } + + /// Takes ownership of `buffer` which may be resized with `gpa`. + /// + /// LZMA was explicitly designed to take advantage of large heap memory + /// being available, with a dictionary size anywhere from 4K to 4G. Thus, + /// this API dynamically allocates the dictionary as-needed. + pub fn initOptions( + input: *Reader, + gpa: Allocator, + buffer: []u8, + options: Decode.Options, + mem_limit: usize, + ) !Decompress { + const params = try Decode.Params.readHeader(input, options); + return initParams(input, gpa, buffer, params, mem_limit); + } + + /// Reclaim ownership of the buffer passed to `init`. 
+ pub fn takeBuffer(d: *Decompress) []u8 { + const buffer = d.reader.buffer; + d.reader.buffer = &.{}; + return buffer; + } + + pub fn deinit(d: *Decompress) void { + const gpa = d.gpa; + gpa.free(d.reader.buffer); + d.buffer.deinit(gpa); + d.decode.deinit(gpa); + d.* = undefined; + } + + fn readVec(r: *Reader, data: [][]u8) Reader.Error!usize { + _ = data; + return readIndirect(r); + } + + fn stream(r: *Reader, w: *Writer, limit: std.Io.Limit) Reader.StreamError!usize { + _ = w; + _ = limit; + return readIndirect(r); + } + + fn readIndirect(r: *Reader) Reader.Error!usize { + const d: *Decompress = @alignCast(@fieldParentPtr("reader", r)); + const gpa = d.gpa; + var allocating = Writer.Allocating.initOwnedSlice(gpa, r.buffer); + allocating.writer.end = r.end; + defer r.end = allocating.writer.end; + if (d.decode.state == math.maxInt(usize)) return error.EndOfStream; + d.decode.process(d.input, &allocating, &d.buffer, &d.range_decoder) catch |err| switch (err) { + error.WriteFailed => { + d.err = error.OutOfMemory; + return error.ReadFailed; + }, + error.EndOfStream => { + d.err = error.EndOfStream; + return error.ReadFailed; + }, + else => |e| { + d.err = e; + return error.ReadFailed; + }, + }; + return 0; + } +}; test { _ = @import("lzma/test.zig"); diff --git a/lib/std/compress/lzma/test.zig b/lib/std/compress/lzma/test.zig index eafb91b6bb..2514ddb8c3 100644 --- a/lib/std/compress/lzma/test.zig +++ b/lib/std/compress/lzma/test.zig @@ -1,19 +1,19 @@ const std = @import("../../std.zig"); -const lzma = @import("../lzma.zig"); +const lzma = std.compress.lzma; fn testDecompress(compressed: []const u8) ![]u8 { - const allocator = std.testing.allocator; - var stream = std.io.fixedBufferStream(compressed); - var decompressor = try lzma.decompress(allocator, stream.reader()); + const gpa = std.testing.allocator; + var stream: std.Io.Reader = .fixed(compressed); + + var decompressor = try lzma.Decompress.initOptions(&stream, gpa, &.{}, .{}, std.math.maxInt(u32)); defer 
decompressor.deinit(); - const reader = decompressor.reader(); - return reader.readAllAlloc(allocator, std.math.maxInt(usize)); + return decompressor.reader.allocRemaining(gpa, .unlimited); } fn testDecompressEqual(expected: []const u8, compressed: []const u8) !void { - const allocator = std.testing.allocator; + const gpa = std.testing.allocator; const decomp = try testDecompress(compressed); - defer allocator.free(decomp); + defer gpa.free(decomp); try std.testing.expectEqualSlices(u8, expected, decomp); } @@ -89,11 +89,13 @@ test "too small uncompressed size in header" { } test "reading one byte" { + const gpa = std.testing.allocator; const compressed = @embedFile("testdata/good-known_size-with_eopm.lzma"); - var stream = std.io.fixedBufferStream(compressed); - var decompressor = try lzma.decompress(std.testing.allocator, stream.reader()); + var stream: std.Io.Reader = .fixed(compressed); + var decompressor = try lzma.Decompress.initOptions(&stream, gpa, &.{}, .{}, std.math.maxInt(u32)); defer decompressor.deinit(); - var buffer = [1]u8{0}; - _ = try decompressor.read(buffer[0..]); + var buffer: [1]u8 = undefined; + try decompressor.reader.readSliceAll(&buffer); + try std.testing.expectEqual(72, buffer[0]); } diff --git a/lib/std/compress/lzma2.zig b/lib/std/compress/lzma2.zig index e454e6382e..d880bad21d 100644 --- a/lib/std/compress/lzma2.zig +++ b/lib/std/compress/lzma2.zig @@ -2,6 +2,8 @@ const std = @import("../std.zig"); const Allocator = std.mem.Allocator; const ArrayList = std.ArrayList; const lzma = std.compress.lzma; +const Writer = std.Io.Writer; +const Reader = std.Io.Reader; /// An accumulating buffer for LZ sequences pub const LzAccumBuffer = struct { @@ -14,30 +16,28 @@ pub const LzAccumBuffer = struct { /// Total number of bytes sent through the buffer len: usize, - const Self = @This(); - - pub fn init(memlimit: usize) Self { - return Self{ + pub fn init(memlimit: usize) LzAccumBuffer { + return .{ .buf = .{}, .memlimit = memlimit, .len = 0, }; } 
- pub fn appendByte(self: *Self, allocator: Allocator, byte: u8) !void { + pub fn appendByte(self: *LzAccumBuffer, allocator: Allocator, byte: u8) !void { try self.buf.append(allocator, byte); self.len += 1; } /// Reset the internal dictionary - pub fn reset(self: *Self, writer: anytype) !void { + pub fn reset(self: *LzAccumBuffer, writer: *Writer) !void { try writer.writeAll(self.buf.items); self.buf.clearRetainingCapacity(); self.len = 0; } /// Retrieve the last byte or return a default - pub fn lastOr(self: Self, lit: u8) u8 { + pub fn lastOr(self: LzAccumBuffer, lit: u8) u8 { const buf_len = self.buf.items.len; return if (buf_len == 0) lit @@ -46,7 +46,7 @@ pub const LzAccumBuffer = struct { } /// Retrieve the n-th last byte - pub fn lastN(self: Self, dist: usize) !u8 { + pub fn lastN(self: LzAccumBuffer, dist: usize) !u8 { const buf_len = self.buf.items.len; if (dist > buf_len) { return error.CorruptInput; @@ -57,10 +57,10 @@ pub const LzAccumBuffer = struct { /// Append a literal pub fn appendLiteral( - self: *Self, + self: *LzAccumBuffer, allocator: Allocator, lit: u8, - writer: anytype, + writer: *Writer, ) !void { _ = writer; if (self.len >= self.memlimit) { @@ -72,11 +72,11 @@ pub const LzAccumBuffer = struct { /// Fetch an LZ sequence (length, distance) from inside the buffer pub fn appendLz( - self: *Self, + self: *LzAccumBuffer, allocator: Allocator, len: usize, dist: usize, - writer: anytype, + writer: *Writer, ) !void { _ = writer; @@ -95,23 +95,23 @@ pub const LzAccumBuffer = struct { self.len += len; } - pub fn finish(self: *Self, writer: anytype) !void { + pub fn finish(self: *LzAccumBuffer, writer: *Writer) !void { try writer.writeAll(self.buf.items); self.buf.clearRetainingCapacity(); } - pub fn deinit(self: *Self, allocator: Allocator) void { + pub fn deinit(self: *LzAccumBuffer, allocator: Allocator) void { self.buf.deinit(allocator); self.* = undefined; } }; pub const Decode = struct { - lzma_state: lzma.Decode, + lzma_decode: lzma.Decode, 
pub fn init(allocator: Allocator) !Decode { return Decode{ - .lzma_state = try lzma.Decode.init( + .lzma_decode = try lzma.Decode.init( allocator, .{ .lc = 0, @@ -124,15 +124,15 @@ pub const Decode = struct { } pub fn deinit(self: *Decode, allocator: Allocator) void { - self.lzma_state.deinit(allocator); + self.lzma_decode.deinit(allocator); self.* = undefined; } pub fn decompress( self: *Decode, allocator: Allocator, - reader: anytype, - writer: anytype, + reader: *Reader, + writer: *Writer, ) !void { var accum = LzAccumBuffer.init(std.math.maxInt(usize)); defer accum.deinit(allocator); @@ -154,8 +154,8 @@ pub const Decode = struct { fn parseLzma( self: *Decode, allocator: Allocator, - reader: anytype, - writer: anytype, + reader: *Reader, + writer: *Writer, accum: *LzAccumBuffer, status: u8, ) !void { @@ -210,7 +210,7 @@ pub const Decode = struct { } if (reset.state) { - var new_props = self.lzma_state.lzma_props; + var new_props = self.lzma_decode.properties; if (reset.props) { var props = try reader.readByte(); @@ -231,16 +231,16 @@ pub const Decode = struct { new_props = .{ .lc = lc, .lp = lp, .pb = pb }; } - try self.lzma_state.resetState(allocator, new_props); + try self.lzma_decode.resetState(allocator, new_props); } - self.lzma_state.unpacked_size = unpacked_size + accum.len; + self.lzma_decode.unpacked_size = unpacked_size + accum.len; var counter = std.io.countingReader(reader); const counter_reader = counter.reader(); var rangecoder = try lzma.RangeDecoder.init(counter_reader); - while (try self.lzma_state.process(allocator, counter_reader, writer, accum, &rangecoder) == .continue_) {} + while (try self.lzma_decode.process(allocator, counter_reader, writer, accum, &rangecoder) == .continue_) {} if (counter.bytes_read != packed_size) { return error.CorruptInput; @@ -249,8 +249,8 @@ pub const Decode = struct { fn parseUncompressed( allocator: Allocator, - reader: anytype, - writer: anytype, + reader: *Reader, + writer: *Writer, accum: *LzAccumBuffer, 
reset_dict: bool, ) !void { @@ -267,24 +267,19 @@ pub const Decode = struct { } }; -pub fn decompress( - allocator: Allocator, - reader: anytype, - writer: anytype, -) !void { - var decoder = try Decode.init(allocator); - defer decoder.deinit(allocator); - return decoder.decompress(allocator, reader, writer); -} - -test { +test "decompress hello world stream" { const expected = "Hello\nWorld!\n"; const compressed = &[_]u8{ 0x01, 0x00, 0x05, 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x0A, 0x02, 0x00, 0x06, 0x57, 0x6F, 0x72, 0x6C, 0x64, 0x21, 0x0A, 0x00 }; - const allocator = std.testing.allocator; - var decomp = std.array_list.Managed(u8).init(allocator); - defer decomp.deinit(); - var stream = std.io.fixedBufferStream(compressed); - try decompress(allocator, stream.reader(), decomp.writer()); - try std.testing.expectEqualSlices(u8, expected, decomp.items); + const gpa = std.testing.allocator; + + var stream: std.Io.Reader = .fixed(compressed); + + var decode = try Decode.init(gpa, &stream); + defer decode.deinit(gpa); + + const result = try decode.reader.allocRemaining(gpa, .unlimited); + defer gpa.free(result); + + try std.testing.expectEqualStrings(expected, result); } diff --git a/lib/std/compress/xz.zig b/lib/std/compress/xz.zig index 54a6a38506..eef47cd005 100644 --- a/lib/std/compress/xz.zig +++ b/lib/std/compress/xz.zig @@ -1,368 +1,4 @@ -const std = @import("std"); -const Allocator = std.mem.Allocator; -const ArrayList = std.ArrayList; -const Crc32 = std.hash.Crc32; -const Crc64 = std.hash.crc.Crc64Xz; -const Sha256 = std.crypto.hash.sha2.Sha256; -const lzma2 = std.compress.lzma2; - -pub const Check = enum(u4) { - none = 0x00, - crc32 = 0x01, - crc64 = 0x04, - sha256 = 0x0A, - _, -}; - -fn readStreamFlags(reader: anytype, check: *Check) !void { - const reserved1 = try reader.readByte(); - if (reserved1 != 0) return error.CorruptInput; - const byte = try reader.readByte(); - if ((byte >> 4) != 0) return error.CorruptInput; - check.* = @enumFromInt(@as(u4, 
@truncate(byte))); -} - -pub fn decompress(allocator: Allocator, reader: anytype) !Decompress(@TypeOf(reader)) { - return Decompress(@TypeOf(reader)).init(allocator, reader); -} - -pub fn Decompress(comptime ReaderType: type) type { - return struct { - const Self = @This(); - - pub const Error = ReaderType.Error || Decoder(ReaderType).Error; - pub const Reader = std.io.GenericReader(*Self, Error, read); - - allocator: Allocator, - block_decoder: Decoder(ReaderType), - in_reader: ReaderType, - - fn init(allocator: Allocator, source: ReaderType) !Self { - const magic = try source.readBytesNoEof(6); - if (!std.mem.eql(u8, &magic, &.{ 0xFD, '7', 'z', 'X', 'Z', 0x00 })) - return error.BadHeader; - - var check: Check = undefined; - const hash_a = blk: { - var hasher = hashedReader(source, Crc32.init()); - try readStreamFlags(hasher.reader(), &check); - break :blk hasher.hasher.final(); - }; - - const hash_b = try source.readInt(u32, .little); - if (hash_a != hash_b) - return error.WrongChecksum; - - return Self{ - .allocator = allocator, - .block_decoder = try decoder(allocator, source, check), - .in_reader = source, - }; - } - - pub fn deinit(self: *Self) void { - self.block_decoder.deinit(); - } - - pub fn reader(self: *Self) Reader { - return .{ .context = self }; - } - - pub fn read(self: *Self, buffer: []u8) Error!usize { - if (buffer.len == 0) - return 0; - - const r = try self.block_decoder.read(buffer); - if (r != 0) - return r; - - const index_size = blk: { - var hasher = hashedReader(self.in_reader, Crc32.init()); - hasher.hasher.update(&[1]u8{0x00}); - - var counter = std.io.countingReader(hasher.reader()); - counter.bytes_read += 1; - - const counting_reader = counter.reader(); - - const record_count = try std.leb.readUleb128(u64, counting_reader); - if (record_count != self.block_decoder.block_count) - return error.CorruptInput; - - var i: usize = 0; - while (i < record_count) : (i += 1) { - // TODO: validate records - _ = try std.leb.readUleb128(u64, 
counting_reader); - _ = try std.leb.readUleb128(u64, counting_reader); - } - - while (counter.bytes_read % 4 != 0) { - if (try counting_reader.readByte() != 0) - return error.CorruptInput; - } - - const hash_a = hasher.hasher.final(); - const hash_b = try counting_reader.readInt(u32, .little); - if (hash_a != hash_b) - return error.WrongChecksum; - - break :blk counter.bytes_read; - }; - - const hash_a = try self.in_reader.readInt(u32, .little); - - const hash_b = blk: { - var hasher = hashedReader(self.in_reader, Crc32.init()); - const hashed_reader = hasher.reader(); - - const backward_size = (@as(u64, try hashed_reader.readInt(u32, .little)) + 1) * 4; - if (backward_size != index_size) - return error.CorruptInput; - - var check: Check = undefined; - try readStreamFlags(hashed_reader, &check); - - break :blk hasher.hasher.final(); - }; - - if (hash_a != hash_b) - return error.WrongChecksum; - - const magic = try self.in_reader.readBytesNoEof(2); - if (!std.mem.eql(u8, &magic, &.{ 'Y', 'Z' })) - return error.CorruptInput; - - return 0; - } - }; -} - -pub fn HashedReader(ReaderType: type, HasherType: type) type { - return struct { - child_reader: ReaderType, - hasher: HasherType, - - pub const Error = ReaderType.Error; - pub const Reader = std.io.GenericReader(*@This(), Error, read); - - pub fn read(self: *@This(), buf: []u8) Error!usize { - const amt = try self.child_reader.read(buf); - self.hasher.update(buf[0..amt]); - return amt; - } - - pub fn reader(self: *@This()) Reader { - return .{ .context = self }; - } - }; -} - -pub fn hashedReader( - reader: anytype, - hasher: anytype, -) HashedReader(@TypeOf(reader), @TypeOf(hasher)) { - return .{ .child_reader = reader, .hasher = hasher }; -} - -const DecodeError = error{ - CorruptInput, - EndOfStream, - EndOfStreamWithNoError, - WrongChecksum, - Unsupported, - Overflow, -}; - -pub fn decoder(allocator: Allocator, reader: anytype, check: Check) !Decoder(@TypeOf(reader)) { - return 
Decoder(@TypeOf(reader)).init(allocator, reader, check); -} - -pub fn Decoder(comptime ReaderType: type) type { - return struct { - const Self = @This(); - pub const Error = - ReaderType.Error || - DecodeError || - Allocator.Error; - pub const Reader = std.io.GenericReader(*Self, Error, read); - - allocator: Allocator, - inner_reader: ReaderType, - check: Check, - err: ?Error, - to_read: ArrayList(u8), - read_pos: usize, - block_count: usize, - - fn init(allocator: Allocator, in_reader: ReaderType, check: Check) !Self { - return Self{ - .allocator = allocator, - .inner_reader = in_reader, - .check = check, - .err = null, - .to_read = .{}, - .read_pos = 0, - .block_count = 0, - }; - } - - pub fn deinit(self: *Self) void { - self.to_read.deinit(self.allocator); - } - - pub fn reader(self: *Self) Reader { - return .{ .context = self }; - } - - pub fn read(self: *Self, output: []u8) Error!usize { - while (true) { - const unread_len = self.to_read.items.len - self.read_pos; - if (unread_len > 0) { - const n = @min(unread_len, output.len); - @memcpy(output[0..n], self.to_read.items[self.read_pos..][0..n]); - self.read_pos += n; - return n; - } - if (self.err) |e| { - if (e == DecodeError.EndOfStreamWithNoError) { - return 0; - } - return e; - } - if (self.read_pos > 0) { - self.to_read.shrinkRetainingCapacity(0); - self.read_pos = 0; - } - self.readBlock() catch |e| { - self.err = e; - }; - } - } - - fn readBlock(self: *Self) Error!void { - var block_counter = std.io.countingReader(self.inner_reader); - const block_reader = block_counter.reader(); - - var packed_size: ?u64 = null; - var unpacked_size: ?u64 = null; - - // Block Header - { - var header_hasher = hashedReader(block_reader, Crc32.init()); - const header_reader = header_hasher.reader(); - - const header_size = @as(u64, try header_reader.readByte()) * 4; - if (header_size == 0) - return error.EndOfStreamWithNoError; - - const Flags = packed struct(u8) { - last_filter_index: u2, - reserved: u4, - 
has_packed_size: bool, - has_unpacked_size: bool, - }; - - const flags = @as(Flags, @bitCast(try header_reader.readByte())); - const filter_count = @as(u3, flags.last_filter_index) + 1; - if (filter_count > 1) - return error.Unsupported; - - if (flags.has_packed_size) - packed_size = try std.leb.readUleb128(u64, header_reader); - - if (flags.has_unpacked_size) - unpacked_size = try std.leb.readUleb128(u64, header_reader); - - const FilterId = enum(u64) { - lzma2 = 0x21, - _, - }; - - const filter_id = @as( - FilterId, - @enumFromInt(try std.leb.readUleb128(u64, header_reader)), - ); - - if (@intFromEnum(filter_id) >= 0x4000_0000_0000_0000) - return error.CorruptInput; - - if (filter_id != .lzma2) - return error.Unsupported; - - const properties_size = try std.leb.readUleb128(u64, header_reader); - if (properties_size != 1) - return error.CorruptInput; - - // TODO: use filter properties - _ = try header_reader.readByte(); - - while (block_counter.bytes_read != header_size) { - if (try header_reader.readByte() != 0) - return error.CorruptInput; - } - - const hash_a = header_hasher.hasher.final(); - const hash_b = try header_reader.readInt(u32, .little); - if (hash_a != hash_b) - return error.WrongChecksum; - } - - // Compressed Data - var packed_counter = std.io.countingReader(block_reader); - try lzma2.decompress( - self.allocator, - packed_counter.reader(), - self.to_read.writer(self.allocator), - ); - - if (packed_size) |s| { - if (s != packed_counter.bytes_read) - return error.CorruptInput; - } - - const unpacked_bytes = self.to_read.items; - if (unpacked_size) |s| { - if (s != unpacked_bytes.len) - return error.CorruptInput; - } - - // Block Padding - while (block_counter.bytes_read % 4 != 0) { - if (try block_reader.readByte() != 0) - return error.CorruptInput; - } - - switch (self.check) { - .none => {}, - .crc32 => { - const hash_a = Crc32.hash(unpacked_bytes); - const hash_b = try self.inner_reader.readInt(u32, .little); - if (hash_a != hash_b) - return 
error.WrongChecksum; - }, - .crc64 => { - const hash_a = Crc64.hash(unpacked_bytes); - const hash_b = try self.inner_reader.readInt(u64, .little); - if (hash_a != hash_b) - return error.WrongChecksum; - }, - .sha256 => { - var hash_a: [Sha256.digest_length]u8 = undefined; - Sha256.hash(unpacked_bytes, &hash_a, .{}); - - var hash_b: [Sha256.digest_length]u8 = undefined; - try self.inner_reader.readNoEof(&hash_b); - - if (!std.mem.eql(u8, &hash_a, &hash_b)) - return error.WrongChecksum; - }, - else => return error.Unsupported, - } - - self.block_count += 1; - } - }; -} +pub const Decompress = @import("xz/Decompress.zig"); test { _ = @import("xz/test.zig"); diff --git a/lib/std/compress/xz/Decompress.zig b/lib/std/compress/xz/Decompress.zig new file mode 100644 index 0000000000..6b925020d6 --- /dev/null +++ b/lib/std/compress/xz/Decompress.zig @@ -0,0 +1,288 @@ +const Decompress = @This(); +const std = @import("../../std.zig"); +const Allocator = std.mem.Allocator; +const ArrayList = std.ArrayList; +const Crc32 = std.hash.Crc32; +const Crc64 = std.hash.crc.Crc64Xz; +const Sha256 = std.crypto.hash.sha2.Sha256; +const lzma2 = std.compress.lzma2; +const Writer = std.Io.Writer; +const Reader = std.Io.Reader; + +/// Underlying compressed data stream to pull bytes from. +input: *Reader, +/// Uncompressed bytes output by this stream implementation. +reader: Reader, +gpa: Allocator, +check: Check, +block_count: usize, +err: ?Error, + +pub const Error = error{ + ReadFailed, + OutOfMemory, + CorruptInput, + EndOfStream, + WrongChecksum, + Unsupported, + Overflow, +}; + +pub const Check = enum(u4) { + none = 0x00, + crc32 = 0x01, + crc64 = 0x04, + sha256 = 0x0A, + _, +}; + +pub const StreamFlags = packed struct(u16) { + null: u8 = 0, + check: Check, + reserved: u4 = 0, +}; + +pub const InitError = error{ + NotXzStream, + WrongChecksum, +}; + +/// XZ uses a series of LZMA2 blocks which each specify a dictionary size +/// anywhere from 4K to 4G. 
Thus, this API dynamically allocates the dictionary +/// as-needed. +pub fn init( + input: *Reader, + gpa: Allocator, + /// Decompress takes ownership of this buffer and resizes it with `gpa`. + buffer: []u8, +) Decompress { + const magic = try input.takeBytes(6); + if (!std.mem.eql(u8, &magic, &.{ 0xFD, '7', 'z', 'X', 'Z', 0x00 })) + return error.NotXzStream; + + const actual_hash = Crc32.hash(try input.peek(@sizeOf(StreamFlags))); + const stream_flags = input.takeStruct(StreamFlags, .little) catch unreachable; + const stored_hash = try input.readInt(u32, .little); + if (actual_hash != stored_hash) return error.WrongChecksum; + + return .{ + .input = input, + .reader = .{ + .vtable = &.{ + .stream = stream, + .readVec = readVec, + }, + .buffer = buffer, + .seek = 0, + .end = 0, + }, + .gpa = gpa, + .check = stream_flags.check, + .block_count = 0, + .err = null, + }; +} + +fn stream(r: *Reader, w: *Writer, limit: std.Io.Limit) Reader.StreamError!usize { + _ = w; + _ = limit; + const d: *Decompress = @alignCast(@fieldParentPtr("reader", r)); + _ = d; + @panic("TODO"); +} + +fn readVec(r: *Reader, data: [][]u8) Reader.Error!usize { + _ = data; + const d: *Decompress = @alignCast(@fieldParentPtr("reader", r)); + _ = d; + @panic("TODO"); +} + +// if (buffer.len == 0) +// return 0; +// +// const r = try self.block_decode.read(buffer); +// if (r != 0) +// return r; +// +// const index_size = blk: { +// var hasher = hashedReader(self.in_reader, Crc32.init()); +// hasher.hasher.update(&[1]u8{0x00}); +// +// var counter = std.io.countingReader(hasher.reader()); +// counter.bytes_read += 1; +// +// const counting_reader = counter.reader(); +// +// const record_count = try std.leb.readUleb128(u64, counting_reader); +// if (record_count != self.block_decode.block_count) +// return error.CorruptInput; +// +// var i: usize = 0; +// while (i < record_count) : (i += 1) { +// // TODO: validate records +// _ = try std.leb.readUleb128(u64, counting_reader); +// _ = try 
std.leb.readUleb128(u64, counting_reader); +// } +// +// while (counter.bytes_read % 4 != 0) { +// if (try counting_reader.readByte() != 0) +// return error.CorruptInput; +// } +// +// const hash_a = hasher.hasher.final(); +// const hash_b = try counting_reader.readInt(u32, .little); +// if (hash_a != hash_b) +// return error.WrongChecksum; +// +// break :blk counter.bytes_read; +// }; +// +// const hash_a = try self.in_reader.readInt(u32, .little); +// +// const hash_b = blk: { +// var hasher = hashedReader(self.in_reader, Crc32.init()); +// const hashed_reader = hasher.reader(); +// +// const backward_size = (@as(u64, try hashed_reader.readInt(u32, .little)) + 1) * 4; +// if (backward_size != index_size) +// return error.CorruptInput; +// +// var check: Check = undefined; +// try readStreamFlags(hashed_reader, &check); +// +// break :blk hasher.hasher.final(); +// }; +// +// if (hash_a != hash_b) +// return error.WrongChecksum; +// +// const magic = try self.in_reader.readBytesNoEof(2); +// if (!std.mem.eql(u8, &magic, &.{ 'Y', 'Z' })) +// return error.CorruptInput; +// +// return 0; +//} + +//fn readBlock(self: *BlockDecode) Error!void { +// var block_counter = std.io.countingReader(self.inner_reader); +// const block_reader = block_counter.reader(); +// +// var packed_size: ?u64 = null; +// var unpacked_size: ?u64 = null; +// +// // Block Header +// { +// var header_hasher = hashedReader(block_reader, Crc32.init()); +// const header_reader = header_hasher.reader(); +// +// const header_size = @as(u64, try header_reader.readByte()) * 4; +// if (header_size == 0) +// return error.EndOfStreamWithNoError; +// +// const Flags = packed struct(u8) { +// last_filter_index: u2, +// reserved: u4, +// has_packed_size: bool, +// has_unpacked_size: bool, +// }; +// +// const flags = @as(Flags, @bitCast(try header_reader.readByte())); +// const filter_count = @as(u3, flags.last_filter_index) + 1; +// if (filter_count > 1) +// return error.Unsupported; +// +// if 
(flags.has_packed_size) +// packed_size = try std.leb.readUleb128(u64, header_reader); +// +// if (flags.has_unpacked_size) +// unpacked_size = try std.leb.readUleb128(u64, header_reader); +// +// const FilterId = enum(u64) { +// lzma2 = 0x21, +// _, +// }; +// +// const filter_id = @as( +// FilterId, +// @enumFromInt(try std.leb.readUleb128(u64, header_reader)), +// ); +// +// if (@intFromEnum(filter_id) >= 0x4000_0000_0000_0000) +// return error.CorruptInput; +// +// if (filter_id != .lzma2) +// return error.Unsupported; +// +// const properties_size = try std.leb.readUleb128(u64, header_reader); +// if (properties_size != 1) +// return error.CorruptInput; +// +// // TODO: use filter properties +// _ = try header_reader.readByte(); +// +// while (block_counter.bytes_read != header_size) { +// if (try header_reader.readByte() != 0) +// return error.CorruptInput; +// } +// +// const hash_a = header_hasher.hasher.final(); +// const hash_b = try header_reader.readInt(u32, .little); +// if (hash_a != hash_b) +// return error.WrongChecksum; +// } +// +// // Compressed Data +// var packed_counter = std.io.countingReader(block_reader); +// try lzma2.decompress( +// self.allocator, +// packed_counter.reader(), +// self.to_read.writer(self.allocator), +// ); +// +// if (packed_size) |s| { +// if (s != packed_counter.bytes_read) +// return error.CorruptInput; +// } +// +// const unpacked_bytes = self.to_read.items; +// if (unpacked_size) |s| { +// if (s != unpacked_bytes.len) +// return error.CorruptInput; +// } +// +// // Block Padding +// while (block_counter.bytes_read % 4 != 0) { +// if (try block_reader.readByte() != 0) +// return error.CorruptInput; +// } +// +// switch (self.check) { +// .none => {}, +// .crc32 => { +// const hash_a = Crc32.hash(unpacked_bytes); +// const hash_b = try self.inner_reader.readInt(u32, .little); +// if (hash_a != hash_b) +// return error.WrongChecksum; +// }, +// .crc64 => { +// const hash_a = Crc64.hash(unpacked_bytes); +// const hash_b 
= try self.inner_reader.readInt(u64, .little); +// if (hash_a != hash_b) +// return error.WrongChecksum; +// }, +// .sha256 => { +// var hash_a: [Sha256.digest_length]u8 = undefined; +// Sha256.hash(unpacked_bytes, &hash_a, .{}); +// +// var hash_b: [Sha256.digest_length]u8 = undefined; +// try self.inner_reader.readNoEof(&hash_b); +// +// if (!std.mem.eql(u8, &hash_a, &hash_b)) +// return error.WrongChecksum; +// }, +// else => return error.Unsupported, +// } +// +// self.block_count += 1; +//} From 8523cbef0e0145a1f4de84f49044b8c9a8657108 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Mon, 25 Aug 2025 18:47:33 -0700 Subject: [PATCH 04/12] std.compress.lzma: tests passing --- lib/std/compress/lzma.zig | 13 ++++++++++++- lib/std/compress/lzma/test.zig | 13 ++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/lib/std/compress/lzma.zig b/lib/std/compress/lzma.zig index 597e97a94f..0f1ad1a0b8 100644 --- a/lib/std/compress/lzma.zig +++ b/lib/std/compress/lzma.zig @@ -696,6 +696,7 @@ pub const Decompress = struct { .vtable = &.{ .readVec = readVec, .stream = stream, + .discard = discard, }, .seek = 0, .end = 0, @@ -746,12 +747,22 @@ pub const Decompress = struct { return readIndirect(r); } + fn discard(r: *Reader, limit: std.Io.Limit) Reader.Error!usize { + const d: *Decompress = @alignCast(@fieldParentPtr("reader", r)); + _ = d; + _ = limit; + @panic("TODO"); + } + fn readIndirect(r: *Reader) Reader.Error!usize { const d: *Decompress = @alignCast(@fieldParentPtr("reader", r)); const gpa = d.gpa; var allocating = Writer.Allocating.initOwnedSlice(gpa, r.buffer); allocating.writer.end = r.end; - defer r.end = allocating.writer.end; + defer { + r.buffer = allocating.writer.buffer; + r.end = allocating.writer.end; + } if (d.decode.state == math.maxInt(usize)) return error.EndOfStream; d.decode.process(d.input, &allocating, &d.buffer, &d.range_decoder) catch |err| switch (err) { error.WriteFailed => { diff --git a/lib/std/compress/lzma/test.zig 
b/lib/std/compress/lzma/test.zig index 2514ddb8c3..a1b4432e85 100644 --- a/lib/std/compress/lzma/test.zig +++ b/lib/std/compress/lzma/test.zig @@ -18,7 +18,14 @@ fn testDecompressEqual(expected: []const u8, compressed: []const u8) !void { } fn testDecompressError(expected: anyerror, compressed: []const u8) !void { - return std.testing.expectError(expected, testDecompress(compressed)); + const gpa = std.testing.allocator; + var stream: std.Io.Reader = .fixed(compressed); + + var decompressor = try lzma.Decompress.initOptions(&stream, gpa, &.{}, .{}, std.math.maxInt(u32)); + defer decompressor.deinit(); + + try std.testing.expectError(error.ReadFailed, decompressor.reader.allocRemaining(gpa, .unlimited)); + try std.testing.expectEqual(expected, decompressor.err orelse return error.TestFailed); } test "decompress empty world" { @@ -76,14 +83,14 @@ test "known size with end of payload marker" { test "too big uncompressed size in header" { try testDecompressError( - error.CorruptInput, + error.DecompressedSizeMismatch, @embedFile("testdata/bad-too_big_size-with_eopm.lzma"), ); } test "too small uncompressed size in header" { try testDecompressError( - error.CorruptInput, + error.DecompressedSizeMismatch, @embedFile("testdata/bad-too_small_size-without_eopm-3.lzma"), ); } From 3cb9baaf65abe6c658a5b8b4bd5c0a424ec923c1 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Mon, 25 Aug 2025 18:53:31 -0700 Subject: [PATCH 05/12] std.compress.lzma: delete dead parameter update is always passed as true --- lib/std/compress/lzma.zig | 146 ++++++++++++++------------------------ 1 file changed, 55 insertions(+), 91 deletions(-) diff --git a/lib/std/compress/lzma.zig b/lib/std/compress/lzma.zig index 0f1ad1a0b8..5f96579be5 100644 --- a/lib/std/compress/lzma.zig +++ b/lib/std/compress/lzma.zig @@ -50,19 +50,17 @@ pub const RangeDecoder = struct { return result; } - pub fn decodeBit(self: *RangeDecoder, reader: *Reader, prob: *u16, update: bool) !bool { + pub fn decodeBit(self: 
*RangeDecoder, reader: *Reader, prob: *u16) !bool { const bound = (self.range >> 11) * prob.*; if (self.code < bound) { - if (update) - prob.* += (0x800 - prob.*) >> 5; + prob.* += (0x800 - prob.*) >> 5; self.range = bound; try self.normalize(reader); return false; } else { - if (update) - prob.* -= prob.* >> 5; + prob.* -= prob.* >> 5; self.code -= bound; self.range -= bound; @@ -76,12 +74,11 @@ pub const RangeDecoder = struct { reader: *Reader, num_bits: u5, probs: []u16, - update: bool, ) !u32 { var tmp: u32 = 1; var i: @TypeOf(num_bits) = 0; while (i < num_bits) : (i += 1) { - const bit = try self.decodeBit(reader, &probs[tmp], update); + const bit = try self.decodeBit(reader, &probs[tmp]); tmp = (tmp << 1) ^ @intFromBool(bit); } return tmp - (@as(u32, 1) << num_bits); @@ -93,13 +90,12 @@ pub const RangeDecoder = struct { num_bits: u5, probs: []u16, offset: usize, - update: bool, ) !u32 { var result: u32 = 0; var tmp: usize = 1; var i: @TypeOf(num_bits) = 0; while (i < num_bits) : (i += 1) { - const bit = @intFromBool(try self.decodeBit(reader, &probs[offset + tmp], update)); + const bit = @intFromBool(try self.decodeBit(reader, &probs[offset + tmp])); tmp = (tmp << 1) ^ bit; result ^= @as(u32, bit) << i; } @@ -186,97 +182,76 @@ pub const Decode = struct { allocating: *Writer.Allocating, buffer: *CircularBuffer, decoder: *RangeDecoder, - update: bool, ) !ProcessingStatus { const gpa = allocating.allocator; const writer = &allocating.writer; const pos_state = buffer.len & ((@as(usize, 1) << self.properties.pb) - 1); - if (!try decoder.decodeBit(reader, &self.is_match[(self.state << 4) + pos_state], update)) { - const byte: u8 = try self.decodeLiteral(reader, buffer, decoder, update); + if (!try decoder.decodeBit(reader, &self.is_match[(self.state << 4) + pos_state])) { + const byte: u8 = try self.decodeLiteral(reader, buffer, decoder); - if (update) { - try buffer.appendLiteral(gpa, byte, writer); + try buffer.appendLiteral(gpa, byte, writer); - self.state = if 
(self.state < 4) - 0 - else if (self.state < 10) - self.state - 3 - else - self.state - 6; - } + self.state = if (self.state < 4) + 0 + else if (self.state < 10) + self.state - 3 + else + self.state - 6; return .more; } var len: usize = undefined; - if (try decoder.decodeBit(reader, &self.is_rep[self.state], update)) { - if (!try decoder.decodeBit(reader, &self.is_rep_g0[self.state], update)) { - if (!try decoder.decodeBit( - reader, - &self.is_rep_0long[(self.state << 4) + pos_state], - update, - )) { - if (update) { - self.state = if (self.state < 7) 9 else 11; - const dist = self.rep[0] + 1; - try buffer.appendLz(gpa, 1, dist, writer); - } + if (try decoder.decodeBit(reader, &self.is_rep[self.state])) { + if (!try decoder.decodeBit(reader, &self.is_rep_g0[self.state])) { + if (!try decoder.decodeBit(reader, &self.is_rep_0long[(self.state << 4) + pos_state])) { + self.state = if (self.state < 7) 9 else 11; + const dist = self.rep[0] + 1; + try buffer.appendLz(gpa, 1, dist, writer); return .more; } } else { - const idx: usize = if (!try decoder.decodeBit(reader, &self.is_rep_g1[self.state], update)) + const idx: usize = if (!try decoder.decodeBit(reader, &self.is_rep_g1[self.state])) 1 - else if (!try decoder.decodeBit(reader, &self.is_rep_g2[self.state], update)) + else if (!try decoder.decodeBit(reader, &self.is_rep_g2[self.state])) 2 else 3; - if (update) { - const dist = self.rep[idx]; - var i = idx; - while (i > 0) : (i -= 1) { - self.rep[i] = self.rep[i - 1]; - } - self.rep[0] = dist; + const dist = self.rep[idx]; + var i = idx; + while (i > 0) : (i -= 1) { + self.rep[i] = self.rep[i - 1]; } + self.rep[0] = dist; } - len = try self.rep_len_decoder.decode(reader, decoder, pos_state, update); + len = try self.rep_len_decoder.decode(reader, decoder, pos_state); - if (update) { - self.state = if (self.state < 7) 8 else 11; - } + self.state = if (self.state < 7) 8 else 11; } else { - if (update) { - self.rep[3] = self.rep[2]; - self.rep[2] = self.rep[1]; - 
self.rep[1] = self.rep[0]; - } + self.rep[3] = self.rep[2]; + self.rep[2] = self.rep[1]; + self.rep[1] = self.rep[0]; - len = try self.len_decoder.decode(reader, decoder, pos_state, update); + len = try self.len_decoder.decode(reader, decoder, pos_state); - if (update) { - self.state = if (self.state < 7) 7 else 10; - } + self.state = if (self.state < 7) 7 else 10; - const rep_0 = try self.decodeDistance(reader, decoder, len, update); + const rep_0 = try self.decodeDistance(reader, decoder, len); - if (update) { - self.rep[0] = rep_0; - if (self.rep[0] == 0xFFFF_FFFF) { - if (decoder.isFinished()) { - return .finished; - } - return error.CorruptInput; + self.rep[0] = rep_0; + if (self.rep[0] == 0xFFFF_FFFF) { + if (decoder.isFinished()) { + return .finished; } + return error.CorruptInput; } } - if (update) { - len += 2; + len += 2; - const dist = self.rep[0] + 1; - try buffer.appendLz(gpa, len, dist, writer); - } + const dist = self.rep[0] + 1; + try buffer.appendLz(gpa, len, dist, writer); return .more; } @@ -296,7 +271,7 @@ pub const Decode = struct { } else if (decoder.isFinished()) { break :process_next; } - switch (try self.processNext(reader, allocating, buffer, decoder, true)) { + switch (try self.processNext(reader, allocating, buffer, decoder)) { .more => return, .finished => {}, } @@ -315,7 +290,6 @@ pub const Decode = struct { reader: *Reader, buffer: *CircularBuffer, decoder: *RangeDecoder, - update: bool, ) !u8 { const def_prev_byte = 0; const prev_byte = @as(usize, buffer.lastOr(def_prev_byte)); @@ -334,7 +308,6 @@ pub const Decode = struct { const bit = @intFromBool(try decoder.decodeBit( reader, &probs[((@as(usize, 1) + match_bit) << 8) + result], - update, )); result = (result << 1) ^ bit; if (match_bit != bit) { @@ -344,7 +317,7 @@ pub const Decode = struct { } while (result < 0x100) { - result = (result << 1) ^ @intFromBool(try decoder.decodeBit(reader, &probs[result], update)); + result = (result << 1) ^ @intFromBool(try 
decoder.decodeBit(reader, &probs[result])); } return @as(u8, @truncate(result - 0x100)); @@ -355,11 +328,10 @@ pub const Decode = struct { reader: *Reader, decoder: *RangeDecoder, length: usize, - update: bool, ) !usize { const len_state = if (length > 3) 3 else length; - const pos_slot = @as(usize, try self.pos_slot_decoder[len_state].parse(reader, decoder, update)); + const pos_slot = @as(usize, try self.pos_slot_decoder[len_state].parse(reader, decoder)); if (pos_slot < 4) return pos_slot; @@ -372,11 +344,10 @@ pub const Decode = struct { num_direct_bits, &self.pos_decoders, result - pos_slot, - update, ); } else { result += @as(usize, try decoder.get(reader, num_direct_bits - 4)) << 4; - result += try self.align_decoder.parseReverse(reader, decoder, update); + result += try self.align_decoder.parseReverse(reader, decoder); } return result; @@ -500,22 +471,16 @@ pub const Decode = struct { return struct { probs: [1 << num_bits]u16 = @splat(0x400), - pub fn parse( - self: *@This(), - reader: *Reader, - decoder: *RangeDecoder, - update: bool, - ) !u32 { - return decoder.parseBitTree(reader, num_bits, &self.probs, update); + pub fn parse(self: *@This(), reader: *Reader, decoder: *RangeDecoder) !u32 { + return decoder.parseBitTree(reader, num_bits, &self.probs); } pub fn parseReverse( self: *@This(), reader: *Reader, decoder: *RangeDecoder, - update: bool, ) !u32 { - return decoder.parseReverseBitTree(reader, num_bits, &self.probs, 0, update); + return decoder.parseReverseBitTree(reader, num_bits, &self.probs, 0); } pub fn reset(self: *@This()) void { @@ -536,14 +501,13 @@ pub const Decode = struct { reader: *Reader, decoder: *RangeDecoder, pos_state: usize, - update: bool, ) !usize { - if (!try decoder.decodeBit(reader, &self.choice, update)) { - return @as(usize, try self.low_coder[pos_state].parse(reader, decoder, update)); - } else if (!try decoder.decodeBit(reader, &self.choice2, update)) { - return @as(usize, try self.mid_coder[pos_state].parse(reader, 
decoder, update)) + 8; + if (!try decoder.decodeBit(reader, &self.choice)) { + return @as(usize, try self.low_coder[pos_state].parse(reader, decoder)); + } else if (!try decoder.decodeBit(reader, &self.choice2)) { + return @as(usize, try self.mid_coder[pos_state].parse(reader, decoder)) + 8; } else { - return @as(usize, try self.high_coder.parse(reader, decoder, update)) + 16; + return @as(usize, try self.high_coder.parse(reader, decoder)) + 16; } } From a8ae6c2f4265a66c7a63d788a13549c48a1dd8c0 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Mon, 25 Aug 2025 20:24:19 -0700 Subject: [PATCH 06/12] std.compress.lzma2: tests passing --- lib/std/compress/lzma.zig | 126 ++++++++++++++---------------- lib/std/compress/lzma2.zig | 156 +++++++++++++++++-------------------- 2 files changed, 131 insertions(+), 151 deletions(-) diff --git a/lib/std/compress/lzma.zig b/lib/std/compress/lzma.zig index 5f96579be5..ab7e217871 100644 --- a/lib/std/compress/lzma.zig +++ b/lib/std/compress/lzma.zig @@ -105,7 +105,6 @@ pub const RangeDecoder = struct { pub const Decode = struct { properties: Properties, - unpacked_size: ?u64, literal_probs: Vec2d, pos_slot_decoder: [4]BitTree(6), align_decoder: BitTree(4), @@ -121,15 +120,10 @@ pub const Decode = struct { len_decoder: LenDecoder, rep_len_decoder: LenDecoder, - pub fn init( - gpa: Allocator, - properties: Properties, - unpacked_size: ?u64, - ) !Decode { + pub fn init(gpa: Allocator, properties: Properties) !Decode { return .{ .properties = properties, - .unpacked_size = unpacked_size, - .literal_probs = try Vec2d.init(gpa, 0x400, .{ @as(usize, 1) << (properties.lc + properties.lp), 0x300 }), + .literal_probs = try Vec2d.init(gpa, 0x400, @as(usize, 1) << (properties.lc + properties.lp), 0x300), .pos_slot_decoder = @splat(.{}), .align_decoder = .{}, .pos_decoders = @splat(0x400), @@ -157,7 +151,7 @@ pub const Decode = struct { self.literal_probs.fill(0x400); } else { self.literal_probs.deinit(gpa); - self.literal_probs = try 
Vec2d.init(gpa, 0x400, .{ @as(usize, 1) << (new_props.lc + new_props.lp), 0x300 }); + self.literal_probs = try Vec2d.init(gpa, 0x400, @as(usize, 1) << (new_props.lc + new_props.lp), 0x300); } self.properties = new_props; @@ -176,11 +170,12 @@ pub const Decode = struct { self.rep_len_decoder.reset(); } - fn processNext( + pub fn process( self: *Decode, reader: *Reader, allocating: *Writer.Allocating, - buffer: *CircularBuffer, + /// `CircularBuffer` or `std.compress.lzma2.AccumBuffer`. + buffer: anytype, decoder: *RangeDecoder, ) !ProcessingStatus { const gpa = allocating.allocator; @@ -256,39 +251,11 @@ pub const Decode = struct { return .more; } - pub fn process( - self: *Decode, - reader: *Reader, - allocating: *Writer.Allocating, - buffer: *CircularBuffer, - decoder: *RangeDecoder, - ) !void { - process_next: { - if (self.unpacked_size) |unpacked_size| { - if (buffer.len >= unpacked_size) { - break :process_next; - } - } else if (decoder.isFinished()) { - break :process_next; - } - switch (try self.processNext(reader, allocating, buffer, decoder)) { - .more => return, - .finished => {}, - } - } - - if (self.unpacked_size) |unpacked_size| { - if (buffer.len != unpacked_size) return error.DecompressedSizeMismatch; - } - - try buffer.finish(&allocating.writer); - self.state = math.maxInt(usize); - } - fn decodeLiteral( self: *Decode, reader: *Reader, - buffer: *CircularBuffer, + /// `CircularBuffer` or `std.compress.lzma2.AccumBuffer`. 
+ buffer: anytype, decoder: *RangeDecoder, ) !u8 { const def_prev_byte = 0; @@ -377,10 +344,7 @@ pub const Decode = struct { } pub fn get(self: CircularBuffer, index: usize) u8 { - return if (0 <= index and index < self.buf.items.len) - self.buf.items[index] - else - 0; + return if (0 <= index and index < self.buf.items.len) self.buf.items[index] else 0; } pub fn set(self: *CircularBuffer, gpa: Allocator, index: usize, value: u8) !void { @@ -524,29 +488,29 @@ pub const Decode = struct { data: []u16, cols: usize, - pub fn init(gpa: Allocator, value: u16, size: struct { usize, usize }) !Vec2d { - const len = try math.mul(usize, size[0], size[1]); + pub fn init(gpa: Allocator, value: u16, w: usize, h: usize) !Vec2d { + const len = try math.mul(usize, w, h); const data = try gpa.alloc(u16, len); @memset(data, value); return .{ .data = data, - .cols = size[1], + .cols = h, }; } - pub fn deinit(self: *Vec2d, gpa: Allocator) void { - gpa.free(self.data); - self.* = undefined; + pub fn deinit(v: *Vec2d, gpa: Allocator) void { + gpa.free(v.data); + v.* = undefined; } - pub fn fill(self: *Vec2d, value: u16) void { - @memset(self.data, value); + pub fn fill(v: *Vec2d, value: u16) void { + @memset(v.data, value); } - fn get(self: Vec2d, row: usize) ![]u16 { - const start_row = try math.mul(usize, row, self.cols); - const end_row = try math.add(usize, start_row, self.cols); - return self.data[start_row..end_row]; + fn get(v: Vec2d, row: usize) ![]u16 { + const start_row = try math.mul(usize, row, v.cols); + const end_row = try math.add(usize, start_row, v.cols); + return v.data[start_row..end_row]; } }; @@ -627,6 +591,7 @@ pub const Decompress = struct { range_decoder: RangeDecoder, decode: Decode, err: ?Error, + unpacked_size: ?u64, pub const Error = error{ OutOfMemory, @@ -654,7 +619,7 @@ pub const Decompress = struct { .input = input, .buffer = Decode.CircularBuffer.init(params.dict_size, mem_limit), .range_decoder = try RangeDecoder.init(input), - .decode = try 
Decode.init(gpa, params.properties, params.unpacked_size), + .decode = try Decode.init(gpa, params.properties), .reader = .{ .buffer = buffer, .vtable = &.{ @@ -666,6 +631,7 @@ pub const Decompress = struct { .end = 0, }, .err = null, + .unpacked_size = params.unpacked_size, }; } @@ -728,20 +694,46 @@ pub const Decompress = struct { r.end = allocating.writer.end; } if (d.decode.state == math.maxInt(usize)) return error.EndOfStream; - d.decode.process(d.input, &allocating, &d.buffer, &d.range_decoder) catch |err| switch (err) { + + process_next: { + if (d.unpacked_size) |unpacked_size| { + if (d.buffer.len >= unpacked_size) break :process_next; + } else if (d.range_decoder.isFinished()) { + break :process_next; + } + switch (d.decode.process(d.input, &allocating, &d.buffer, &d.range_decoder) catch |err| switch (err) { + error.WriteFailed => { + d.err = error.OutOfMemory; + return error.ReadFailed; + }, + error.EndOfStream => { + d.err = error.EndOfStream; + return error.ReadFailed; + }, + else => |e| { + d.err = e; + return error.ReadFailed; + }, + }) { + .more => return 0, + .finished => break :process_next, + } + } + + if (d.unpacked_size) |unpacked_size| { + if (d.buffer.len != unpacked_size) { + d.err = error.DecompressedSizeMismatch; + return error.ReadFailed; + } + } + + d.buffer.finish(&allocating.writer) catch |err| switch (err) { error.WriteFailed => { d.err = error.OutOfMemory; return error.ReadFailed; }, - error.EndOfStream => { - d.err = error.EndOfStream; - return error.ReadFailed; - }, - else => |e| { - d.err = e; - return error.ReadFailed; - }, }; + d.decode.state = math.maxInt(usize); return 0; } }; diff --git a/lib/std/compress/lzma2.zig b/lib/std/compress/lzma2.zig index d880bad21d..6aad2e1198 100644 --- a/lib/std/compress/lzma2.zig +++ b/lib/std/compress/lzma2.zig @@ -6,17 +6,15 @@ const Writer = std.Io.Writer; const Reader = std.Io.Reader; /// An accumulating buffer for LZ sequences -pub const LzAccumBuffer = struct { +pub const AccumBuffer = 
struct { /// Buffer buf: ArrayList(u8), - /// Buffer memory limit memlimit: usize, - /// Total number of bytes sent through the buffer len: usize, - pub fn init(memlimit: usize) LzAccumBuffer { + pub fn init(memlimit: usize) AccumBuffer { return .{ .buf = .{}, .memlimit = memlimit, @@ -24,20 +22,20 @@ pub const LzAccumBuffer = struct { }; } - pub fn appendByte(self: *LzAccumBuffer, allocator: Allocator, byte: u8) !void { + pub fn appendByte(self: *AccumBuffer, allocator: Allocator, byte: u8) !void { try self.buf.append(allocator, byte); self.len += 1; } /// Reset the internal dictionary - pub fn reset(self: *LzAccumBuffer, writer: *Writer) !void { + pub fn reset(self: *AccumBuffer, writer: *Writer) !void { try writer.writeAll(self.buf.items); self.buf.clearRetainingCapacity(); self.len = 0; } /// Retrieve the last byte or return a default - pub fn lastOr(self: LzAccumBuffer, lit: u8) u8 { + pub fn lastOr(self: AccumBuffer, lit: u8) u8 { const buf_len = self.buf.items.len; return if (buf_len == 0) lit @@ -46,7 +44,7 @@ pub const LzAccumBuffer = struct { } /// Retrieve the n-th last byte - pub fn lastN(self: LzAccumBuffer, dist: usize) !u8 { + pub fn lastN(self: AccumBuffer, dist: usize) !u8 { const buf_len = self.buf.items.len; if (dist > buf_len) { return error.CorruptInput; @@ -57,7 +55,7 @@ pub const LzAccumBuffer = struct { /// Append a literal pub fn appendLiteral( - self: *LzAccumBuffer, + self: *AccumBuffer, allocator: Allocator, lit: u8, writer: *Writer, @@ -72,7 +70,7 @@ pub const LzAccumBuffer = struct { /// Fetch an LZ sequence (length, distance) from inside the buffer pub fn appendLz( - self: *LzAccumBuffer, + self: *AccumBuffer, allocator: Allocator, len: usize, dist: usize, @@ -95,12 +93,12 @@ pub const LzAccumBuffer = struct { self.len += len; } - pub fn finish(self: *LzAccumBuffer, writer: *Writer) !void { + pub fn finish(self: *AccumBuffer, writer: *Writer) !void { try writer.writeAll(self.buf.items); self.buf.clearRetainingCapacity(); } - pub fn 
deinit(self: *LzAccumBuffer, allocator: Allocator) void { + pub fn deinit(self: *AccumBuffer, allocator: Allocator) void { self.buf.deinit(allocator); self.* = undefined; } @@ -109,59 +107,43 @@ pub const LzAccumBuffer = struct { pub const Decode = struct { lzma_decode: lzma.Decode, - pub fn init(allocator: Allocator) !Decode { - return Decode{ - .lzma_decode = try lzma.Decode.init( - allocator, - .{ - .lc = 0, - .lp = 0, - .pb = 0, - }, - null, - ), - }; + pub fn init(gpa: Allocator) !Decode { + return .{ .lzma_decode = try lzma.Decode.init(gpa, .{ .lc = 0, .lp = 0, .pb = 0 }) }; } - pub fn deinit(self: *Decode, allocator: Allocator) void { - self.lzma_decode.deinit(allocator); + pub fn deinit(self: *Decode, gpa: Allocator) void { + self.lzma_decode.deinit(gpa); self.* = undefined; } - pub fn decompress( - self: *Decode, - allocator: Allocator, - reader: *Reader, - writer: *Writer, - ) !void { - var accum = LzAccumBuffer.init(std.math.maxInt(usize)); - defer accum.deinit(allocator); + pub fn decompress(d: *Decode, reader: *Reader, allocating: *Writer.Allocating) !void { + const gpa = allocating.allocator; + + var accum = AccumBuffer.init(std.math.maxInt(usize)); + defer accum.deinit(gpa); while (true) { - const status = try reader.readByte(); + const status = try reader.takeByte(); switch (status) { 0 => break, - 1 => try parseUncompressed(allocator, reader, writer, &accum, true), - 2 => try parseUncompressed(allocator, reader, writer, &accum, false), - else => try self.parseLzma(allocator, reader, writer, &accum, status), + 1 => try parseUncompressed(reader, allocating, &accum, true), + 2 => try parseUncompressed(reader, allocating, &accum, false), + else => try d.parseLzma(reader, allocating, &accum, status), } } - try accum.finish(writer); + try accum.finish(&allocating.writer); } fn parseLzma( - self: *Decode, - allocator: Allocator, + d: *Decode, reader: *Reader, - writer: *Writer, - accum: *LzAccumBuffer, + allocating: *Writer.Allocating, + accum: 
*AccumBuffer, status: u8, ) !void { - if (status & 0x80 == 0) { - return error.CorruptInput; - } + if (status & 0x80 == 0) return error.CorruptInput; const Reset = struct { dict: bool, @@ -169,23 +151,23 @@ pub const Decode = struct { props: bool, }; - const reset = switch ((status >> 5) & 0x3) { - 0 => Reset{ + const reset: Reset = switch ((status >> 5) & 0x3) { + 0 => .{ .dict = false, .state = false, .props = false, }, - 1 => Reset{ + 1 => .{ .dict = false, .state = true, .props = false, }, - 2 => Reset{ + 2 => .{ .dict = false, .state = true, .props = true, }, - 3 => Reset{ + 3 => .{ .dict = true, .state = true, .props = true, @@ -196,24 +178,24 @@ pub const Decode = struct { const unpacked_size = blk: { var tmp: u64 = status & 0x1F; tmp <<= 16; - tmp |= try reader.readInt(u16, .big); + tmp |= try reader.takeInt(u16, .big); break :blk tmp + 1; }; const packed_size = blk: { - const tmp: u17 = try reader.readInt(u16, .big); + const tmp: u17 = try reader.takeInt(u16, .big); break :blk tmp + 1; }; - if (reset.dict) { - try accum.reset(writer); - } + if (reset.dict) try accum.reset(&allocating.writer); + + const ld = &d.lzma_decode; if (reset.state) { - var new_props = self.lzma_decode.properties; + var new_props = ld.properties; if (reset.props) { - var props = try reader.readByte(); + var props = try reader.takeByte(); if (props >= 225) { return error.CorruptInput; } @@ -231,38 +213,44 @@ pub const Decode = struct { new_props = .{ .lc = lc, .lp = lp, .pb = pb }; } - try self.lzma_decode.resetState(allocator, new_props); + try ld.resetState(allocating.allocator, new_props); } - self.lzma_decode.unpacked_size = unpacked_size + accum.len; + var range_decoder = try lzma.RangeDecoder.init(reader); - var counter = std.io.countingReader(reader); - const counter_reader = counter.reader(); - - var rangecoder = try lzma.RangeDecoder.init(counter_reader); - while (try self.lzma_decode.process(allocator, counter_reader, writer, accum, &rangecoder) == .continue_) {} - - if 
(counter.bytes_read != packed_size) { - return error.CorruptInput; + while (true) { + if (accum.len >= unpacked_size) break; + if (range_decoder.isFinished()) break; + switch (try ld.process(reader, allocating, accum, &range_decoder)) { + .more => continue, + .finished => break, + } } + if (accum.len != unpacked_size) return error.DecompressedSizeMismatch; + + // TODO restore this error + //if (counter.bytes_read != packed_size) { + // return error.CorruptInput; + //} + _ = packed_size; } fn parseUncompressed( - allocator: Allocator, reader: *Reader, - writer: *Writer, - accum: *LzAccumBuffer, + allocating: *Writer.Allocating, + accum: *AccumBuffer, reset_dict: bool, ) !void { - const unpacked_size = @as(u17, try reader.readInt(u16, .big)) + 1; + const unpacked_size = @as(u17, try reader.takeInt(u16, .big)) + 1; - if (reset_dict) { - try accum.reset(writer); - } + if (reset_dict) try accum.reset(&allocating.writer); - var i: @TypeOf(unpacked_size) = 0; - while (i < unpacked_size) : (i += 1) { - try accum.appendByte(allocator, try reader.readByte()); + const gpa = allocating.allocator; + + var i = unpacked_size; + while (i != 0) { + try accum.appendByte(gpa, try reader.takeByte()); + i -= 1; } } }; @@ -273,13 +261,13 @@ test "decompress hello world stream" { const gpa = std.testing.allocator; - var stream: std.Io.Reader = .fixed(compressed); - - var decode = try Decode.init(gpa, &stream); + var decode = try Decode.init(gpa); defer decode.deinit(gpa); - const result = try decode.reader.allocRemaining(gpa, .unlimited); - defer gpa.free(result); + var stream: std.Io.Reader = .fixed(compressed); + var result: std.Io.Writer.Allocating = .init(gpa); + defer result.deinit(); - try std.testing.expectEqualStrings(expected, result); + try decode.decompress(&stream, &result); + try std.testing.expectEqualStrings(expected, result.written()); } From d87eb7d4e4f2ea606a18640fcc019b60cc435cdd Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Mon, 25 Aug 2025 21:33:57 -0700 
Subject: [PATCH 07/12] std.compress.xz: skeleton in place missing these things: - implementation of finish() - detect packed bytes read for check and block padding - implementation of discard() - implementation of block stream checksum --- lib/std/compress/xz/Decompress.zig | 423 +++++++++++++++-------------- lib/std/compress/xz/test.zig | 16 +- 2 files changed, 236 insertions(+), 203 deletions(-) diff --git a/lib/std/compress/xz/Decompress.zig b/lib/std/compress/xz/Decompress.zig index 6b925020d6..99cfa266b1 100644 --- a/lib/std/compress/xz/Decompress.zig +++ b/lib/std/compress/xz/Decompress.zig @@ -26,6 +26,8 @@ pub const Error = error{ WrongChecksum, Unsupported, Overflow, + InvalidRangeCode, + DecompressedSizeMismatch, }; pub const Check = enum(u4) { @@ -55,14 +57,14 @@ pub fn init( gpa: Allocator, /// Decompress takes ownership of this buffer and resizes it with `gpa`. buffer: []u8, -) Decompress { - const magic = try input.takeBytes(6); - if (!std.mem.eql(u8, &magic, &.{ 0xFD, '7', 'z', 'X', 'Z', 0x00 })) +) !Decompress { + const magic = try input.takeArray(6); + if (!std.mem.eql(u8, magic, &.{ 0xFD, '7', 'z', 'X', 'Z', 0x00 })) return error.NotXzStream; const actual_hash = Crc32.hash(try input.peek(@sizeOf(StreamFlags))); const stream_flags = input.takeStruct(StreamFlags, .little) catch unreachable; - const stored_hash = try input.readInt(u32, .little); + const stored_hash = try input.takeInt(u32, .little); if (actual_hash != stored_hash) return error.WrongChecksum; return .{ @@ -71,6 +73,7 @@ pub fn init( .vtable = &.{ .stream = stream, .readVec = readVec, + .discard = discard, }, .buffer = buffer, .seek = 0, @@ -83,206 +86,232 @@ pub fn init( }; } -fn stream(r: *Reader, w: *Writer, limit: std.Io.Limit) Reader.StreamError!usize { - _ = w; - _ = limit; - const d: *Decompress = @alignCast(@fieldParentPtr("reader", r)); - _ = d; - @panic("TODO"); +/// Reclaim ownership of the buffer passed to `init`. 
+pub fn takeBuffer(d: *Decompress) []u8 { + const buffer = d.reader.buffer; + d.reader.buffer = &.{}; + return buffer; +} + +pub fn deinit(d: *Decompress) void { + const gpa = d.gpa; + gpa.free(d.reader.buffer); + d.* = undefined; } fn readVec(r: *Reader, data: [][]u8) Reader.Error!usize { _ = data; + return readIndirect(r); +} + +fn stream(r: *Reader, w: *Writer, limit: std.Io.Limit) Reader.StreamError!usize { + _ = w; + _ = limit; + return readIndirect(r); +} + +fn discard(r: *Reader, limit: std.Io.Limit) Reader.Error!usize { const d: *Decompress = @alignCast(@fieldParentPtr("reader", r)); _ = d; + _ = limit; @panic("TODO"); } -// if (buffer.len == 0) -// return 0; -// -// const r = try self.block_decode.read(buffer); -// if (r != 0) -// return r; -// -// const index_size = blk: { -// var hasher = hashedReader(self.in_reader, Crc32.init()); -// hasher.hasher.update(&[1]u8{0x00}); -// -// var counter = std.io.countingReader(hasher.reader()); -// counter.bytes_read += 1; -// -// const counting_reader = counter.reader(); -// -// const record_count = try std.leb.readUleb128(u64, counting_reader); -// if (record_count != self.block_decode.block_count) -// return error.CorruptInput; -// -// var i: usize = 0; -// while (i < record_count) : (i += 1) { -// // TODO: validate records -// _ = try std.leb.readUleb128(u64, counting_reader); -// _ = try std.leb.readUleb128(u64, counting_reader); -// } -// -// while (counter.bytes_read % 4 != 0) { -// if (try counting_reader.readByte() != 0) -// return error.CorruptInput; -// } -// -// const hash_a = hasher.hasher.final(); -// const hash_b = try counting_reader.readInt(u32, .little); -// if (hash_a != hash_b) -// return error.WrongChecksum; -// -// break :blk counter.bytes_read; -// }; -// -// const hash_a = try self.in_reader.readInt(u32, .little); -// -// const hash_b = blk: { -// var hasher = hashedReader(self.in_reader, Crc32.init()); -// const hashed_reader = hasher.reader(); -// -// const backward_size = (@as(u64, try 
hashed_reader.readInt(u32, .little)) + 1) * 4; -// if (backward_size != index_size) -// return error.CorruptInput; -// -// var check: Check = undefined; -// try readStreamFlags(hashed_reader, &check); -// -// break :blk hasher.hasher.final(); -// }; -// -// if (hash_a != hash_b) -// return error.WrongChecksum; -// -// const magic = try self.in_reader.readBytesNoEof(2); -// if (!std.mem.eql(u8, &magic, &.{ 'Y', 'Z' })) -// return error.CorruptInput; -// -// return 0; -//} +fn readIndirect(r: *Reader) Reader.Error!usize { + const d: *Decompress = @alignCast(@fieldParentPtr("reader", r)); + const gpa = d.gpa; + const input = d.input; -//fn readBlock(self: *BlockDecode) Error!void { -// var block_counter = std.io.countingReader(self.inner_reader); -// const block_reader = block_counter.reader(); -// -// var packed_size: ?u64 = null; -// var unpacked_size: ?u64 = null; -// -// // Block Header -// { -// var header_hasher = hashedReader(block_reader, Crc32.init()); -// const header_reader = header_hasher.reader(); -// -// const header_size = @as(u64, try header_reader.readByte()) * 4; -// if (header_size == 0) -// return error.EndOfStreamWithNoError; -// -// const Flags = packed struct(u8) { -// last_filter_index: u2, -// reserved: u4, -// has_packed_size: bool, -// has_unpacked_size: bool, -// }; -// -// const flags = @as(Flags, @bitCast(try header_reader.readByte())); -// const filter_count = @as(u3, flags.last_filter_index) + 1; -// if (filter_count > 1) -// return error.Unsupported; -// -// if (flags.has_packed_size) -// packed_size = try std.leb.readUleb128(u64, header_reader); -// -// if (flags.has_unpacked_size) -// unpacked_size = try std.leb.readUleb128(u64, header_reader); -// -// const FilterId = enum(u64) { -// lzma2 = 0x21, -// _, -// }; -// -// const filter_id = @as( -// FilterId, -// @enumFromInt(try std.leb.readUleb128(u64, header_reader)), -// ); -// -// if (@intFromEnum(filter_id) >= 0x4000_0000_0000_0000) -// return error.CorruptInput; -// -// if 
(filter_id != .lzma2) -// return error.Unsupported; -// -// const properties_size = try std.leb.readUleb128(u64, header_reader); -// if (properties_size != 1) -// return error.CorruptInput; -// -// // TODO: use filter properties -// _ = try header_reader.readByte(); -// -// while (block_counter.bytes_read != header_size) { -// if (try header_reader.readByte() != 0) -// return error.CorruptInput; -// } -// -// const hash_a = header_hasher.hasher.final(); -// const hash_b = try header_reader.readInt(u32, .little); -// if (hash_a != hash_b) -// return error.WrongChecksum; -// } -// -// // Compressed Data -// var packed_counter = std.io.countingReader(block_reader); -// try lzma2.decompress( -// self.allocator, -// packed_counter.reader(), -// self.to_read.writer(self.allocator), -// ); -// -// if (packed_size) |s| { -// if (s != packed_counter.bytes_read) -// return error.CorruptInput; -// } -// -// const unpacked_bytes = self.to_read.items; -// if (unpacked_size) |s| { -// if (s != unpacked_bytes.len) -// return error.CorruptInput; -// } -// -// // Block Padding -// while (block_counter.bytes_read % 4 != 0) { -// if (try block_reader.readByte() != 0) -// return error.CorruptInput; -// } -// -// switch (self.check) { -// .none => {}, -// .crc32 => { -// const hash_a = Crc32.hash(unpacked_bytes); -// const hash_b = try self.inner_reader.readInt(u32, .little); -// if (hash_a != hash_b) -// return error.WrongChecksum; -// }, -// .crc64 => { -// const hash_a = Crc64.hash(unpacked_bytes); -// const hash_b = try self.inner_reader.readInt(u64, .little); -// if (hash_a != hash_b) -// return error.WrongChecksum; -// }, -// .sha256 => { -// var hash_a: [Sha256.digest_length]u8 = undefined; -// Sha256.hash(unpacked_bytes, &hash_a, .{}); -// -// var hash_b: [Sha256.digest_length]u8 = undefined; -// try self.inner_reader.readNoEof(&hash_b); -// -// if (!std.mem.eql(u8, &hash_a, &hash_b)) -// return error.WrongChecksum; -// }, -// else => return error.Unsupported, -// } -// -// 
self.block_count += 1; -//} + var allocating = Writer.Allocating.initOwnedSlice(gpa, r.buffer); + allocating.writer.end = r.end; + defer { + r.buffer = allocating.writer.buffer; + r.end = allocating.writer.end; + } + + if (d.block_count == std.math.maxInt(usize)) return error.EndOfStream; + + readBlock(input, &allocating) catch |err| switch (err) { + error.WriteFailed => { + d.err = error.OutOfMemory; + return error.ReadFailed; + }, + error.SuccessfulEndOfStream => { + finish(d); + d.block_count = std.math.maxInt(usize); + return error.EndOfStream; + }, + else => |e| { + d.err = e; + return error.ReadFailed; + }, + }; + switch (d.check) { + .none => {}, + .crc32 => { + const declared_checksum = try input.takeInt(u32, .little); + // TODO + //const hash_a = Crc32.hash(unpacked_bytes); + //if (hash_a != hash_b) return error.WrongChecksum; + _ = declared_checksum; + }, + .crc64 => { + const declared_checksum = try input.takeInt(u64, .little); + // TODO + //const hash_a = Crc64.hash(unpacked_bytes); + //if (hash_a != hash_b) return error.WrongChecksum; + _ = declared_checksum; + }, + .sha256 => { + const declared_hash = try input.take(Sha256.digest_length); + // TODO + //var hash_a: [Sha256.digest_length]u8 = undefined; + //Sha256.hash(unpacked_bytes, &hash_a, .{}); + //if (!std.mem.eql(u8, &hash_a, &hash_b)) + // return error.WrongChecksum; + _ = declared_hash; + }, + else => { + d.err = error.Unsupported; + return error.ReadFailed; + }, + } + d.block_count += 1; + return 0; +} + +fn readBlock(input: *Reader, allocating: *Writer.Allocating) !void { + var packed_size: ?u64 = null; + var unpacked_size: ?u64 = null; + + { + // Read the block header via peeking so that we can hash the whole thing too. 
+ const first_byte: usize = try input.peekByte(); + if (first_byte == 0) return error.SuccessfulEndOfStream; + + const declared_header_size = first_byte * 4; + try input.fill(declared_header_size); + const header_seek_start = input.seek; + input.toss(1); + + const Flags = packed struct(u8) { + last_filter_index: u2, + reserved: u4, + has_packed_size: bool, + has_unpacked_size: bool, + }; + const flags = try input.takeStruct(Flags, .little); + + const filter_count = @as(u3, flags.last_filter_index) + 1; + if (filter_count > 1) return error.Unsupported; + + if (flags.has_packed_size) packed_size = try input.takeLeb128(u64); + if (flags.has_unpacked_size) unpacked_size = try input.takeLeb128(u64); + + const FilterId = enum(u64) { + lzma2 = 0x21, + _, + }; + + const filter_id: FilterId = @enumFromInt(try input.takeLeb128(u64)); + if (filter_id != .lzma2) return error.Unsupported; + + const properties_size = try input.takeLeb128(u64); + if (properties_size != 1) return error.CorruptInput; + // TODO: use filter properties + _ = try input.takeByte(); + + const actual_header_size = input.seek - header_seek_start; + if (actual_header_size > declared_header_size) return error.CorruptInput; + var remaining_bytes = declared_header_size - actual_header_size; + while (remaining_bytes != 0) { + if (try input.takeByte() != 0) return error.CorruptInput; + remaining_bytes -= 1; + } + + const header_slice = input.buffer[header_seek_start..][0..declared_header_size]; + const actual_hash = Crc32.hash(header_slice); + const declared_hash = try input.takeInt(u32, .little); + if (actual_hash != declared_hash) return error.WrongChecksum; + } + + // Compressed Data + + var lzma2_decode = try lzma2.Decode.init(allocating.allocator); + const before_size = allocating.writer.end; + try lzma2_decode.decompress(input, allocating); + const unpacked_bytes = allocating.writer.end - before_size; + + // TODO restore this check + //if (packed_size) |s| { + // if (s != packed_counter.bytes_read) + // 
return error.CorruptInput; + //} + + if (unpacked_size) |s| { + if (s != unpacked_bytes) return error.CorruptInput; + } + + // Block Padding + if (true) @panic("TODO account for block padding"); + //while (block_counter.bytes_read % 4 != 0) { + // if (try block_reader.takeByte() != 0) + // return error.CorruptInput; + //} + +} + +fn finish(d: *Decompress) void { + _ = d; + @panic("TODO"); + //const input = d.input; + //const index_size = blk: { + // const record_count = try input.takeLeb128(u64); + // if (record_count != d.block_decode.block_count) + // return error.CorruptInput; + + // var i: usize = 0; + // while (i < record_count) : (i += 1) { + // // TODO: validate records + // _ = try std.leb.readUleb128(u64, counting_reader); + // _ = try std.leb.readUleb128(u64, counting_reader); + // } + + // while (counter.bytes_read % 4 != 0) { + // if (try counting_reader.takeByte() != 0) + // return error.CorruptInput; + // } + + // const hash_a = hasher.hasher.final(); + // const hash_b = try counting_reader.takeInt(u32, .little); + // if (hash_a != hash_b) + // return error.WrongChecksum; + + // break :blk counter.bytes_read; + //}; + + //const hash_a = try d.in_reader.takeInt(u32, .little); + + //const hash_b = blk: { + // var hasher = hashedReader(d.in_reader, Crc32.init()); + // const hashed_reader = hasher.reader(); + + // const backward_size = (@as(u64, try hashed_reader.takeInt(u32, .little)) + 1) * 4; + // if (backward_size != index_size) + // return error.CorruptInput; + + // var check: Check = undefined; + // try readStreamFlags(hashed_reader, &check); + + // break :blk hasher.hasher.final(); + //}; + + //if (hash_a != hash_b) + // return error.WrongChecksum; + + //const magic = try d.in_reader.takeBytesNoEof(2); + //if (!std.mem.eql(u8, &magic, &.{ 'Y', 'Z' })) + // return error.CorruptInput; + + //return 0; +} diff --git a/lib/std/compress/xz/test.zig b/lib/std/compress/xz/test.zig index 08180e45c0..a25cc08df0 100644 --- a/lib/std/compress/xz/test.zig +++ 
b/lib/std/compress/xz/test.zig @@ -3,19 +3,23 @@ const testing = std.testing; const xz = std.compress.xz; fn decompress(data: []const u8) ![]u8 { - var in_stream = std.io.fixedBufferStream(data); + const gpa = testing.allocator; - var xz_stream = try xz.decompress(testing.allocator, in_stream.reader()); + var in_stream: std.Io.Reader = .fixed(data); + + var xz_stream = try xz.Decompress.init(&in_stream, gpa, &.{}); defer xz_stream.deinit(); - return xz_stream.reader().readAllAlloc(testing.allocator, std.math.maxInt(usize)); + return xz_stream.reader.allocRemaining(gpa, .unlimited); } fn testReader(data: []const u8, comptime expected: []const u8) !void { - const buf = try decompress(data); - defer testing.allocator.free(buf); + const gpa = testing.allocator; - try testing.expectEqualSlices(u8, expected, buf); + const result = try decompress(data); + defer gpa.free(result); + + try testing.expectEqualSlices(u8, expected, result); } test "compressed data" { From 722e066173f5235b1dd1b341acb5cb1e79002ed7 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Tue, 26 Aug 2025 20:33:45 -0700 Subject: [PATCH 08/12] std.compress.xz.Decompress: some tests passing --- lib/std/compress/lzma.zig | 99 +++++++++++--------- lib/std/compress/lzma2.zig | 42 +++++---- lib/std/compress/xz/Decompress.zig | 143 +++++++++++++++-------------- lib/std/compress/xz/test.zig | 106 +++++++++++++-------- 4 files changed, 225 insertions(+), 165 deletions(-) diff --git a/lib/std/compress/lzma.zig b/lib/std/compress/lzma.zig index ab7e217871..7a586298be 100644 --- a/lib/std/compress/lzma.zig +++ b/lib/std/compress/lzma.zig @@ -12,11 +12,19 @@ pub const RangeDecoder = struct { code: u32, pub fn init(reader: *Reader) !RangeDecoder { + var counter: u64 = 0; + return initCounting(reader, &counter); + } + + pub fn initCounting(reader: *Reader, n_read: *u64) !RangeDecoder { const reserved = try reader.takeByte(); + n_read.* += 1; if (reserved != 0) return error.InvalidRangeCode; + const code = try 
reader.takeInt(u32, .big); + n_read.* += 4; return .{ .range = 0xFFFF_FFFF, - .code = try reader.takeInt(u32, .big), + .code = code, }; } @@ -24,47 +32,47 @@ pub const RangeDecoder = struct { return self.code == 0; } - fn normalize(self: *RangeDecoder, reader: *Reader) !void { + fn normalize(self: *RangeDecoder, reader: *Reader, n_read: *u64) !void { if (self.range < 0x0100_0000) { self.range <<= 8; self.code = (self.code << 8) ^ @as(u32, try reader.takeByte()); + n_read.* += 1; } } - fn getBit(self: *RangeDecoder, reader: *Reader) !bool { + fn getBit(self: *RangeDecoder, reader: *Reader, n_read: *u64) !bool { self.range >>= 1; const bit = self.code >= self.range; - if (bit) - self.code -= self.range; + if (bit) self.code -= self.range; - try self.normalize(reader); + try self.normalize(reader, n_read); return bit; } - pub fn get(self: *RangeDecoder, reader: *Reader, count: usize) !u32 { + pub fn get(self: *RangeDecoder, reader: *Reader, count: usize, n_read: *u64) !u32 { var result: u32 = 0; - var i: usize = 0; - while (i < count) : (i += 1) - result = (result << 1) ^ @intFromBool(try self.getBit(reader)); + for (0..count) |_| { + result = (result << 1) ^ @intFromBool(try self.getBit(reader, n_read)); + } return result; } - pub fn decodeBit(self: *RangeDecoder, reader: *Reader, prob: *u16) !bool { + pub fn decodeBit(self: *RangeDecoder, reader: *Reader, prob: *u16, n_read: *u64) !bool { const bound = (self.range >> 11) * prob.*; if (self.code < bound) { prob.* += (0x800 - prob.*) >> 5; self.range = bound; - try self.normalize(reader); + try self.normalize(reader, n_read); return false; } else { prob.* -= prob.* >> 5; self.code -= bound; self.range -= bound; - try self.normalize(reader); + try self.normalize(reader, n_read); return true; } } @@ -74,11 +82,12 @@ pub const RangeDecoder = struct { reader: *Reader, num_bits: u5, probs: []u16, + n_read: *u64, ) !u32 { var tmp: u32 = 1; var i: @TypeOf(num_bits) = 0; while (i < num_bits) : (i += 1) { - const bit = try 
self.decodeBit(reader, &probs[tmp]); + const bit = try self.decodeBit(reader, &probs[tmp], n_read); tmp = (tmp << 1) ^ @intFromBool(bit); } return tmp - (@as(u32, 1) << num_bits); @@ -90,12 +99,13 @@ pub const RangeDecoder = struct { num_bits: u5, probs: []u16, offset: usize, + n_read: *u64, ) !u32 { var result: u32 = 0; var tmp: usize = 1; var i: @TypeOf(num_bits) = 0; while (i < num_bits) : (i += 1) { - const bit = @intFromBool(try self.decodeBit(reader, &probs[offset + tmp])); + const bit = @intFromBool(try self.decodeBit(reader, &probs[offset + tmp], n_read)); tmp = (tmp << 1) ^ bit; result ^= @as(u32, bit) << i; } @@ -177,13 +187,14 @@ pub const Decode = struct { /// `CircularBuffer` or `std.compress.lzma2.AccumBuffer`. buffer: anytype, decoder: *RangeDecoder, + n_read: *u64, ) !ProcessingStatus { const gpa = allocating.allocator; const writer = &allocating.writer; const pos_state = buffer.len & ((@as(usize, 1) << self.properties.pb) - 1); - if (!try decoder.decodeBit(reader, &self.is_match[(self.state << 4) + pos_state])) { - const byte: u8 = try self.decodeLiteral(reader, buffer, decoder); + if (!try decoder.decodeBit(reader, &self.is_match[(self.state << 4) + pos_state], n_read)) { + const byte: u8 = try self.decodeLiteral(reader, buffer, decoder, n_read); try buffer.appendLiteral(gpa, byte, writer); @@ -197,18 +208,18 @@ pub const Decode = struct { } var len: usize = undefined; - if (try decoder.decodeBit(reader, &self.is_rep[self.state])) { - if (!try decoder.decodeBit(reader, &self.is_rep_g0[self.state])) { - if (!try decoder.decodeBit(reader, &self.is_rep_0long[(self.state << 4) + pos_state])) { + if (try decoder.decodeBit(reader, &self.is_rep[self.state], n_read)) { + if (!try decoder.decodeBit(reader, &self.is_rep_g0[self.state], n_read)) { + if (!try decoder.decodeBit(reader, &self.is_rep_0long[(self.state << 4) + pos_state], n_read)) { self.state = if (self.state < 7) 9 else 11; const dist = self.rep[0] + 1; try buffer.appendLz(gpa, 1, dist, 
writer); return .more; } } else { - const idx: usize = if (!try decoder.decodeBit(reader, &self.is_rep_g1[self.state])) + const idx: usize = if (!try decoder.decodeBit(reader, &self.is_rep_g1[self.state], n_read)) 1 - else if (!try decoder.decodeBit(reader, &self.is_rep_g2[self.state])) + else if (!try decoder.decodeBit(reader, &self.is_rep_g2[self.state], n_read)) 2 else 3; @@ -220,7 +231,7 @@ pub const Decode = struct { self.rep[0] = dist; } - len = try self.rep_len_decoder.decode(reader, decoder, pos_state); + len = try self.rep_len_decoder.decode(reader, decoder, pos_state, n_read); self.state = if (self.state < 7) 8 else 11; } else { @@ -228,11 +239,11 @@ pub const Decode = struct { self.rep[2] = self.rep[1]; self.rep[1] = self.rep[0]; - len = try self.len_decoder.decode(reader, decoder, pos_state); + len = try self.len_decoder.decode(reader, decoder, pos_state, n_read); self.state = if (self.state < 7) 7 else 10; - const rep_0 = try self.decodeDistance(reader, decoder, len); + const rep_0 = try self.decodeDistance(reader, decoder, len, n_read); self.rep[0] = rep_0; if (self.rep[0] == 0xFFFF_FFFF) { @@ -257,6 +268,7 @@ pub const Decode = struct { /// `CircularBuffer` or `std.compress.lzma2.AccumBuffer`. 
buffer: anytype, decoder: *RangeDecoder, + n_read: *u64, ) !u8 { const def_prev_byte = 0; const prev_byte = @as(usize, buffer.lastOr(def_prev_byte)); @@ -275,6 +287,7 @@ pub const Decode = struct { const bit = @intFromBool(try decoder.decodeBit( reader, &probs[((@as(usize, 1) + match_bit) << 8) + result], + n_read, )); result = (result << 1) ^ bit; if (match_bit != bit) { @@ -284,10 +297,10 @@ pub const Decode = struct { } while (result < 0x100) { - result = (result << 1) ^ @intFromBool(try decoder.decodeBit(reader, &probs[result])); + result = (result << 1) ^ @intFromBool(try decoder.decodeBit(reader, &probs[result], n_read)); } - return @as(u8, @truncate(result - 0x100)); + return @truncate(result - 0x100); } fn decodeDistance( @@ -295,12 +308,12 @@ pub const Decode = struct { reader: *Reader, decoder: *RangeDecoder, length: usize, + n_read: *u64, ) !usize { const len_state = if (length > 3) 3 else length; - const pos_slot = @as(usize, try self.pos_slot_decoder[len_state].parse(reader, decoder)); - if (pos_slot < 4) - return pos_slot; + const pos_slot: usize = try self.pos_slot_decoder[len_state].parse(reader, decoder, n_read); + if (pos_slot < 4) return pos_slot; const num_direct_bits = @as(u5, @intCast((pos_slot >> 1) - 1)); var result = (2 ^ (pos_slot & 1)) << num_direct_bits; @@ -311,10 +324,11 @@ pub const Decode = struct { num_direct_bits, &self.pos_decoders, result - pos_slot, + n_read, ); } else { - result += @as(usize, try decoder.get(reader, num_direct_bits - 4)) << 4; - result += try self.align_decoder.parseReverse(reader, decoder); + result += @as(usize, try decoder.get(reader, num_direct_bits - 4, n_read)) << 4; + result += try self.align_decoder.parseReverse(reader, decoder, n_read); } return result; @@ -435,16 +449,17 @@ pub const Decode = struct { return struct { probs: [1 << num_bits]u16 = @splat(0x400), - pub fn parse(self: *@This(), reader: *Reader, decoder: *RangeDecoder) !u32 { - return decoder.parseBitTree(reader, num_bits, &self.probs); + 
pub fn parse(self: *@This(), reader: *Reader, decoder: *RangeDecoder, n_read: *u64) !u32 { + return decoder.parseBitTree(reader, num_bits, &self.probs, n_read); } pub fn parseReverse( self: *@This(), reader: *Reader, decoder: *RangeDecoder, + n_read: *u64, ) !u32 { - return decoder.parseReverseBitTree(reader, num_bits, &self.probs, 0); + return decoder.parseReverseBitTree(reader, num_bits, &self.probs, 0, n_read); } pub fn reset(self: *@This()) void { @@ -465,13 +480,14 @@ pub const Decode = struct { reader: *Reader, decoder: *RangeDecoder, pos_state: usize, + n_read: *u64, ) !usize { - if (!try decoder.decodeBit(reader, &self.choice)) { - return @as(usize, try self.low_coder[pos_state].parse(reader, decoder)); - } else if (!try decoder.decodeBit(reader, &self.choice2)) { - return @as(usize, try self.mid_coder[pos_state].parse(reader, decoder)) + 8; + if (!try decoder.decodeBit(reader, &self.choice, n_read)) { + return @as(usize, try self.low_coder[pos_state].parse(reader, decoder, n_read)); + } else if (!try decoder.decodeBit(reader, &self.choice2, n_read)) { + return @as(usize, try self.mid_coder[pos_state].parse(reader, decoder, n_read)) + 8; } else { - return @as(usize, try self.high_coder.parse(reader, decoder)) + 16; + return @as(usize, try self.high_coder.parse(reader, decoder, n_read)) + 16; } } @@ -701,7 +717,8 @@ pub const Decompress = struct { } else if (d.range_decoder.isFinished()) { break :process_next; } - switch (d.decode.process(d.input, &allocating, &d.buffer, &d.range_decoder) catch |err| switch (err) { + var n_read: u64 = 0; + switch (d.decode.process(d.input, &allocating, &d.buffer, &d.range_decoder, &n_read) catch |err| switch (err) { error.WriteFailed => { d.err = error.OutOfMemory; return error.ReadFailed; diff --git a/lib/std/compress/lzma2.zig b/lib/std/compress/lzma2.zig index 6aad2e1198..e99b2fe091 100644 --- a/lib/std/compress/lzma2.zig +++ b/lib/std/compress/lzma2.zig @@ -116,24 +116,29 @@ pub const Decode = struct { self.* = 
undefined; } - pub fn decompress(d: *Decode, reader: *Reader, allocating: *Writer.Allocating) !void { + /// Returns how many compressed bytes were consumed. + pub fn decompress(d: *Decode, reader: *Reader, allocating: *Writer.Allocating) !u64 { const gpa = allocating.allocator; var accum = AccumBuffer.init(std.math.maxInt(usize)); defer accum.deinit(gpa); + var n_read: u64 = 0; + while (true) { const status = try reader.takeByte(); + n_read += 1; switch (status) { 0 => break, - 1 => try parseUncompressed(reader, allocating, &accum, true), - 2 => try parseUncompressed(reader, allocating, &accum, false), - else => try d.parseLzma(reader, allocating, &accum, status), + 1 => n_read += try parseUncompressed(reader, allocating, &accum, true), + 2 => n_read += try parseUncompressed(reader, allocating, &accum, false), + else => n_read += try d.parseLzma(reader, allocating, &accum, status), } } try accum.finish(&allocating.writer); + return n_read; } fn parseLzma( @@ -142,7 +147,7 @@ pub const Decode = struct { allocating: *Writer.Allocating, accum: *AccumBuffer, status: u8, - ) !void { + ) !u64 { if (status & 0x80 == 0) return error.CorruptInput; const Reset = struct { @@ -175,15 +180,19 @@ pub const Decode = struct { else => unreachable, }; + var n_read: u64 = 0; + const unpacked_size = blk: { var tmp: u64 = status & 0x1F; tmp <<= 16; tmp |= try reader.takeInt(u16, .big); + n_read += 2; break :blk tmp + 1; }; const packed_size = blk: { const tmp: u17 = try reader.takeInt(u16, .big); + n_read += 2; break :blk tmp + 1; }; @@ -196,6 +205,7 @@ pub const Decode = struct { if (reset.props) { var props = try reader.takeByte(); + n_read += 1; if (props >= 225) { return error.CorruptInput; } @@ -216,23 +226,21 @@ pub const Decode = struct { try ld.resetState(allocating.allocator, new_props); } - var range_decoder = try lzma.RangeDecoder.init(reader); + const start_count = n_read; + var range_decoder = try lzma.RangeDecoder.initCounting(reader, &n_read); while (true) { if 
(accum.len >= unpacked_size) break; if (range_decoder.isFinished()) break; - switch (try ld.process(reader, allocating, accum, &range_decoder)) { + switch (try ld.process(reader, allocating, accum, &range_decoder, &n_read)) { .more => continue, .finished => break, } } if (accum.len != unpacked_size) return error.DecompressedSizeMismatch; + if (n_read - start_count != packed_size) return error.CompressedSizeMismatch; - // TODO restore this error - //if (counter.bytes_read != packed_size) { - // return error.CorruptInput; - //} - _ = packed_size; + return n_read; } fn parseUncompressed( @@ -240,18 +248,17 @@ pub const Decode = struct { allocating: *Writer.Allocating, accum: *AccumBuffer, reset_dict: bool, - ) !void { + ) !usize { const unpacked_size = @as(u17, try reader.takeInt(u16, .big)) + 1; if (reset_dict) try accum.reset(&allocating.writer); const gpa = allocating.allocator; - var i = unpacked_size; - while (i != 0) { + for (0..unpacked_size) |_| { try accum.appendByte(gpa, try reader.takeByte()); - i -= 1; } + return 2 + unpacked_size; } }; @@ -268,6 +275,7 @@ test "decompress hello world stream" { var result: std.Io.Writer.Allocating = .init(gpa); defer result.deinit(); - try decode.decompress(&stream, &result); + const n_read = try decode.decompress(&stream, &result); + try std.testing.expectEqual(compressed.len, n_read); try std.testing.expectEqualStrings(expected, result.written()); } diff --git a/lib/std/compress/xz/Decompress.zig b/lib/std/compress/xz/Decompress.zig index 99cfa266b1..eaebe05c23 100644 --- a/lib/std/compress/xz/Decompress.zig +++ b/lib/std/compress/xz/Decompress.zig @@ -8,6 +8,7 @@ const Sha256 = std.crypto.hash.sha2.Sha256; const lzma2 = std.compress.lzma2; const Writer = std.Io.Writer; const Reader = std.Io.Reader; +const assert = std.debug.assert; /// Underlying compressed data stream to pull bytes from. 
input: *Reader, @@ -28,6 +29,7 @@ pub const Error = error{ Overflow, InvalidRangeCode, DecompressedSizeMismatch, + CompressedSizeMismatch, }; pub const Check = enum(u4) { @@ -62,10 +64,10 @@ pub fn init( if (!std.mem.eql(u8, magic, &.{ 0xFD, '7', 'z', 'X', 'Z', 0x00 })) return error.NotXzStream; - const actual_hash = Crc32.hash(try input.peek(@sizeOf(StreamFlags))); + const computed_checksum = Crc32.hash(try input.peek(@sizeOf(StreamFlags))); const stream_flags = input.takeStruct(StreamFlags, .little) catch unreachable; const stored_hash = try input.takeInt(u32, .little); - if (actual_hash != stored_hash) return error.WrongChecksum; + if (computed_checksum != stored_hash) return error.WrongChecksum; return .{ .input = input, @@ -129,6 +131,7 @@ fn readIndirect(r: *Reader) Reader.Error!usize { r.end = allocating.writer.end; } + if (d.err != null) return error.ReadFailed; if (d.block_count == std.math.maxInt(usize)) return error.EndOfStream; readBlock(input, &allocating) catch |err| switch (err) { @@ -137,7 +140,10 @@ fn readIndirect(r: *Reader) Reader.Error!usize { return error.ReadFailed; }, error.SuccessfulEndOfStream => { - finish(d); + finish(d) catch |finish_err| { + d.err = finish_err; + return error.ReadFailed; + }; d.block_count = std.math.maxInt(usize); return error.EndOfStream; }, @@ -184,7 +190,7 @@ fn readBlock(input: *Reader, allocating: *Writer.Allocating) !void { var packed_size: ?u64 = null; var unpacked_size: ?u64 = null; - { + const header_size = h: { // Read the block header via peeking so that we can hash the whole thing too. 
const first_byte: usize = try input.peekByte(); if (first_byte == 0) return error.SuccessfulEndOfStream; @@ -223,95 +229,92 @@ fn readBlock(input: *Reader, allocating: *Writer.Allocating) !void { const actual_header_size = input.seek - header_seek_start; if (actual_header_size > declared_header_size) return error.CorruptInput; - var remaining_bytes = declared_header_size - actual_header_size; - while (remaining_bytes != 0) { + const remaining_bytes = declared_header_size - actual_header_size; + for (0..remaining_bytes) |_| { if (try input.takeByte() != 0) return error.CorruptInput; - remaining_bytes -= 1; } const header_slice = input.buffer[header_seek_start..][0..declared_header_size]; - const actual_hash = Crc32.hash(header_slice); - const declared_hash = try input.takeInt(u32, .little); - if (actual_hash != declared_hash) return error.WrongChecksum; - } + const computed_checksum = Crc32.hash(header_slice); + const declared_checksum = try input.takeInt(u32, .little); + if (computed_checksum != declared_checksum) return error.WrongChecksum; + break :h declared_header_size; + }; // Compressed Data var lzma2_decode = try lzma2.Decode.init(allocating.allocator); + defer lzma2_decode.deinit(allocating.allocator); const before_size = allocating.writer.end; - try lzma2_decode.decompress(input, allocating); + const packed_bytes_read = try lzma2_decode.decompress(input, allocating); const unpacked_bytes = allocating.writer.end - before_size; - // TODO restore this check - //if (packed_size) |s| { - // if (s != packed_counter.bytes_read) - // return error.CorruptInput; - //} + if (packed_size) |s| { + if (s != packed_bytes_read) return error.CorruptInput; + } if (unpacked_size) |s| { if (s != unpacked_bytes) return error.CorruptInput; } // Block Padding - if (true) @panic("TODO account for block padding"); - //while (block_counter.bytes_read % 4 != 0) { - // if (try block_reader.takeByte() != 0) - // return error.CorruptInput; - //} - + const block_counter = header_size + 
packed_bytes_read; + const padding = (4 - (block_counter % 4)) % 4; + for (0..padding) |_| { + if (try input.takeByte() != 0) return error.CorruptInput; + } } -fn finish(d: *Decompress) void { - _ = d; - @panic("TODO"); - //const input = d.input; - //const index_size = blk: { - // const record_count = try input.takeLeb128(u64); - // if (record_count != d.block_decode.block_count) - // return error.CorruptInput; +fn finish(d: *Decompress) !void { + const input = d.input; + const index_size = blk: { + // Assume that we already peeked a zero in readBlock(). + assert(input.buffered()[0] == 0); + var input_counter: u64 = 1; + var checksum: Crc32 = .init(); + checksum.update(&.{0}); + input.toss(1); - // var i: usize = 0; - // while (i < record_count) : (i += 1) { - // // TODO: validate records - // _ = try std.leb.readUleb128(u64, counting_reader); - // _ = try std.leb.readUleb128(u64, counting_reader); - // } + const record_count = try countLeb128(input, u64, &input_counter, &checksum); + if (record_count != d.block_count) + return error.CorruptInput; - // while (counter.bytes_read % 4 != 0) { - // if (try counting_reader.takeByte() != 0) - // return error.CorruptInput; - // } + for (0..record_count) |_| { + // TODO: validate records + _ = try countLeb128(input, u64, &input_counter, &checksum); + _ = try countLeb128(input, u64, &input_counter, &checksum); + } - // const hash_a = hasher.hasher.final(); - // const hash_b = try counting_reader.takeInt(u32, .little); - // if (hash_a != hash_b) - // return error.WrongChecksum; + const padding_len = (4 - (input_counter % 4)) % 4; + const padding = try input.take(padding_len); + for (padding) |byte| { + if (byte != 0) return error.CorruptInput; + } + checksum.update(padding); - // break :blk counter.bytes_read; - //}; + const declared_checksum = try input.takeInt(u32, .little); + const computed_checksum = checksum.final(); + if (computed_checksum != declared_checksum) return error.WrongChecksum; - //const hash_a = try 
d.in_reader.takeInt(u32, .little); + break :blk input_counter + padding.len + 4; + }; - //const hash_b = blk: { - // var hasher = hashedReader(d.in_reader, Crc32.init()); - // const hashed_reader = hasher.reader(); - - // const backward_size = (@as(u64, try hashed_reader.takeInt(u32, .little)) + 1) * 4; - // if (backward_size != index_size) - // return error.CorruptInput; - - // var check: Check = undefined; - // try readStreamFlags(hashed_reader, &check); - - // break :blk hasher.hasher.final(); - //}; - - //if (hash_a != hash_b) - // return error.WrongChecksum; - - //const magic = try d.in_reader.takeBytesNoEof(2); - //if (!std.mem.eql(u8, &magic, &.{ 'Y', 'Z' })) - // return error.CorruptInput; - - //return 0; + const declared_checksum = try input.takeInt(u32, .little); + const computed_checksum = Crc32.hash(try input.peek(4 + @sizeOf(StreamFlags))); + if (declared_checksum != computed_checksum) return error.WrongChecksum; + const backward_size = (@as(u64, try input.takeInt(u32, .little)) + 1) * 4; + if (backward_size != index_size) return error.CorruptInput; + input.toss(@sizeOf(StreamFlags)); + if (!std.mem.eql(u8, try input.takeArray(2), &.{ 'Y', 'Z' })) + return error.CorruptInput; +} + +fn countLeb128(reader: *Reader, comptime T: type, counter: *u64, hasher: *Crc32) !T { + try reader.fill(8); + const start = reader.seek; + const result = try reader.takeLeb128(T); + const read_slice = reader.buffer[start..reader.seek]; + hasher.update(read_slice); + counter.* += read_slice.len; + return result; } diff --git a/lib/std/compress/xz/test.zig b/lib/std/compress/xz/test.zig index a25cc08df0..ffd5ca037e 100644 --- a/lib/std/compress/xz/test.zig +++ b/lib/std/compress/xz/test.zig @@ -22,47 +22,79 @@ fn testReader(data: []const u8, comptime expected: []const u8) !void { try testing.expectEqualSlices(u8, expected, result); } -test "compressed data" { +test "fixture good-0-empty.xz" { try testReader(@embedFile("testdata/good-0-empty.xz"), ""); +} - inline for 
([_][]const u8{ - "good-1-check-none.xz", - "good-1-check-crc32.xz", - "good-1-check-crc64.xz", - "good-1-check-sha256.xz", - "good-2-lzma2.xz", - "good-1-block_header-1.xz", - "good-1-block_header-2.xz", - "good-1-block_header-3.xz", - }) |filename| { - try testReader(@embedFile("testdata/" ++ filename), - \\Hello - \\World! - \\ - ); - } +const hello_world_text = + \\Hello + \\World! + \\ +; - inline for ([_][]const u8{ - "good-1-lzma2-1.xz", - "good-1-lzma2-2.xz", - "good-1-lzma2-3.xz", - "good-1-lzma2-4.xz", - }) |filename| { - try testReader(@embedFile("testdata/" ++ filename), - \\Lorem ipsum dolor sit amet, consectetur adipisicing - \\elit, sed do eiusmod tempor incididunt ut - \\labore et dolore magna aliqua. Ut enim - \\ad minim veniam, quis nostrud exercitation ullamco - \\laboris nisi ut aliquip ex ea commodo - \\consequat. Duis aute irure dolor in reprehenderit - \\in voluptate velit esse cillum dolore eu - \\fugiat nulla pariatur. Excepteur sint occaecat cupidatat - \\non proident, sunt in culpa qui officia - \\deserunt mollit anim id est laborum. 
- \\ - ); - } +test "fixture good-1-check-none.xz" { + try testReader(@embedFile("testdata/good-1-check-none.xz"), hello_world_text); +} +test "fixture good-1-check-crc32.xz" { + try testReader(@embedFile("testdata/good-1-check-crc32.xz"), hello_world_text); +} + +test "fixture good-1-check-crc64.xz" { + try testReader(@embedFile("testdata/good-1-check-crc64.xz"), hello_world_text); +} + +test "fixture good-1-check-sha256.xz" { + try testReader(@embedFile("testdata/good-1-check-sha256.xz"), hello_world_text); +} + +test "fixture good-2-lzma2.xz" { + try testReader(@embedFile("testdata/good-2-lzma2.xz"), hello_world_text); +} + +test "fixture good-1-block_header-1.xz" { + try testReader(@embedFile("testdata/good-1-block_header-1.xz"), hello_world_text); +} + +test "fixture good-1-block_header-2.xz" { + try testReader(@embedFile("testdata/good-1-block_header-2.xz"), hello_world_text); +} + +test "fixture good-1-block_header-3.xz" { + try testReader(@embedFile("testdata/good-1-block_header-3.xz"), hello_world_text); +} + +const lorem_ipsum_text = + \\Lorem ipsum dolor sit amet, consectetur adipisicing + \\elit, sed do eiusmod tempor incididunt ut + \\labore et dolore magna aliqua. Ut enim + \\ad minim veniam, quis nostrud exercitation ullamco + \\laboris nisi ut aliquip ex ea commodo + \\consequat. Duis aute irure dolor in reprehenderit + \\in voluptate velit esse cillum dolore eu + \\fugiat nulla pariatur. Excepteur sint occaecat cupidatat + \\non proident, sunt in culpa qui officia + \\deserunt mollit anim id est laborum. 
+ \\ +; + +test "fixture good-1-lzma2-1.xz" { + try testReader(@embedFile("testdata/good-1-lzma2-1.xz"), lorem_ipsum_text); +} + +test "fixture good-1-lzma2-2.xz" { + try testReader(@embedFile("testdata/good-1-lzma2-2.xz"), lorem_ipsum_text); +} + +test "fixture good-1-lzma2-3.xz" { + try testReader(@embedFile("testdata/good-1-lzma2-3.xz"), lorem_ipsum_text); +} + +test "fixture good-1-lzma2-4.xz" { + try testReader(@embedFile("testdata/good-1-lzma2-4.xz"), lorem_ipsum_text); +} + +test "fixture good-1-lzma2-5.xz" { try testReader(@embedFile("testdata/good-1-lzma2-5.xz"), ""); } From 980445f08bfee496f4e784b05653e1698addaac8 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Tue, 26 Aug 2025 20:49:37 -0700 Subject: [PATCH 09/12] std.compress.lzma: fix unpacked size checking logic --- lib/std/compress/lzma2.zig | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/std/compress/lzma2.zig b/lib/std/compress/lzma2.zig index e99b2fe091..46e06bedab 100644 --- a/lib/std/compress/lzma2.zig +++ b/lib/std/compress/lzma2.zig @@ -226,18 +226,19 @@ pub const Decode = struct { try ld.resetState(allocating.allocator, new_props); } + const expected_unpacked_size = accum.len + unpacked_size; const start_count = n_read; var range_decoder = try lzma.RangeDecoder.initCounting(reader, &n_read); while (true) { - if (accum.len >= unpacked_size) break; + if (accum.len >= expected_unpacked_size) break; if (range_decoder.isFinished()) break; switch (try ld.process(reader, allocating, accum, &range_decoder, &n_read)) { .more => continue, .finished => break, } } - if (accum.len != unpacked_size) return error.DecompressedSizeMismatch; + if (accum.len != expected_unpacked_size) return error.DecompressedSizeMismatch; if (n_read - start_count != packed_size) return error.CompressedSizeMismatch; return n_read; From 668299f0db3a258e45e28c68696e24cfbf8386a3 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Tue, 26 Aug 2025 20:56:34 -0700 Subject: [PATCH 10/12] std: update xz 
unit tests to new I/O API --- lib/std/compress/xz/test.zig | 56 +++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/lib/std/compress/xz/test.zig b/lib/std/compress/xz/test.zig index ffd5ca037e..cdc3a5e652 100644 --- a/lib/std/compress/xz/test.zig +++ b/lib/std/compress/xz/test.zig @@ -13,7 +13,7 @@ fn decompress(data: []const u8) ![]u8 { return xz_stream.reader.allocRemaining(gpa, .unlimited); } -fn testReader(data: []const u8, comptime expected: []const u8) !void { +fn testReader(data: []const u8, expected: []const u8) !void { const gpa = testing.allocator; const result = try decompress(data); @@ -22,6 +22,17 @@ fn testReader(data: []const u8, comptime expected: []const u8) !void { try testing.expectEqualSlices(u8, expected, result); } +fn testDecompressError(expected: anyerror, compressed: []const u8) !void { + const gpa = std.testing.allocator; + var stream: std.Io.Reader = .fixed(compressed); + + var decompressor = try xz.Decompress.init(&stream, gpa, &.{}); + defer decompressor.deinit(); + + try std.testing.expectError(error.ReadFailed, decompressor.reader.allocRemaining(gpa, .unlimited)); + try std.testing.expectEqual(expected, decompressor.err orelse return error.TestFailed); +} + test "fixture good-0-empty.xz" { try testReader(@embedFile("testdata/good-0-empty.xz"), ""); } @@ -98,21 +109,32 @@ test "fixture good-1-lzma2-5.xz" { try testReader(@embedFile("testdata/good-1-lzma2-5.xz"), ""); } -test "unsupported" { - inline for ([_][]const u8{ - "good-1-delta-lzma2.tiff.xz", - "good-1-x86-lzma2.xz", - "good-1-sparc-lzma2.xz", - "good-1-arm64-lzma2-1.xz", - "good-1-arm64-lzma2-2.xz", - "good-1-3delta-lzma2.xz", - "good-1-empty-bcj-lzma2.xz", - }) |filename| { - try testing.expectError( - error.Unsupported, - decompress(@embedFile("testdata/" ++ filename)), - ); - } +test "fixture good-1-delta-lzma2.tiff.xz" { + try testDecompressError(error.Unsupported, @embedFile("testdata/good-1-delta-lzma2.tiff.xz")); +} + 
+test "fixture good-1-x86-lzma2.xz" { + try testDecompressError(error.Unsupported, @embedFile("testdata/good-1-x86-lzma2.xz")); +} + +test "fixture good-1-sparc-lzma2.xz" { + try testDecompressError(error.Unsupported, @embedFile("testdata/good-1-sparc-lzma2.xz")); +} + +test "fixture good-1-arm64-lzma2-1.xz" { + try testDecompressError(error.Unsupported, @embedFile("testdata/good-1-arm64-lzma2-1.xz")); +} + +test "fixture good-1-arm64-lzma2-2.xz" { + try testDecompressError(error.Unsupported, @embedFile("testdata/good-1-arm64-lzma2-2.xz")); +} + +test "fixture good-1-3delta-lzma2.xz" { + try testDecompressError(error.Unsupported, @embedFile("testdata/good-1-3delta-lzma2.xz")); +} + +test "fixture good-1-empty-bcj-lzma2.xz" { + try testDecompressError(error.Unsupported, @embedFile("testdata/good-1-empty-bcj-lzma2.xz")); } fn testDontPanic(data: []const u8) !void { @@ -127,6 +149,8 @@ test "size fields: integer overflow avoidance" { // These cases were found via fuzz testing and each previously caused // an integer overflow when decoding. We just want to ensure they no longer // cause a panic + // TODO this is not a sufficient way to test. tests should always check the result, + // not merely ensure that the code does not crash. 
const header_size_overflow = "\xfd7zXZ\x00\x00\x01i\"\xde6z"; try testDontPanic(header_size_overflow); const lzma2_chunk_size_overflow = "\xfd7zXZ\x00\x00\x01i\"\xde6\x02\x00!\x01\x08\x00\x00\x00\xd8\x0f#\x13\x01\xff\xff"; From 0339c5793a26cee680d2e42da702038e3574469c Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Tue, 26 Aug 2025 21:00:26 -0700 Subject: [PATCH 11/12] fetch: update xz API usage --- src/Package/Fetch.zig | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/Package/Fetch.zig b/src/Package/Fetch.zig index bf77f88e93..5972b0bf96 100644 --- a/src/Package/Fetch.zig +++ b/src/Package/Fetch.zig @@ -1204,12 +1204,10 @@ fn unpackResource( }, .@"tar.xz" => { const gpa = f.arena.child_allocator; - var dcp = std.compress.xz.decompress(gpa, resource.reader().adaptToOldInterface()) catch |err| + var decompress = std.compress.xz.Decompress.init(resource.reader(), gpa, &.{}) catch |err| return f.fail(f.location_tok, try eb.printString("unable to decompress tarball: {t}", .{err})); - defer dcp.deinit(); - var adapter_buffer: [1024]u8 = undefined; - var adapter = dcp.reader().adaptToNewApi(&adapter_buffer); - return try unpackTarball(f, tmp_directory.handle, &adapter.new_interface); + defer decompress.deinit(); + return try unpackTarball(f, tmp_directory.handle, &decompress.reader); }, .@"tar.zst" => { const window_len = std.compress.zstd.default_window_len; From 68f590d430bee6bc9b3bb4940f739d8b04435c08 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Tue, 26 Aug 2025 21:07:00 -0700 Subject: [PATCH 12/12] std.compress.xz: fix 32-bit targets --- lib/std/compress/xz/Decompress.zig | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/std/compress/xz/Decompress.zig b/lib/std/compress/xz/Decompress.zig index eaebe05c23..2fff561c73 100644 --- a/lib/std/compress/xz/Decompress.zig +++ b/lib/std/compress/xz/Decompress.zig @@ -259,9 +259,9 @@ fn readBlock(input: *Reader, allocating: *Writer.Allocating) !void { // 
Block Padding const block_counter = header_size + packed_bytes_read; - const padding = (4 - (block_counter % 4)) % 4; - for (0..padding) |_| { - if (try input.takeByte() != 0) return error.CorruptInput; + const padding = try input.take(@intCast((4 - (block_counter % 4)) % 4)); + for (padding) |byte| { + if (byte != 0) return error.CorruptInput; } } @@ -279,14 +279,13 @@ fn finish(d: *Decompress) !void { if (record_count != d.block_count) return error.CorruptInput; - for (0..record_count) |_| { + for (0..@intCast(record_count)) |_| { // TODO: validate records _ = try countLeb128(input, u64, &input_counter, &checksum); _ = try countLeb128(input, u64, &input_counter, &checksum); } - const padding_len = (4 - (input_counter % 4)) % 4; - const padding = try input.take(padding_len); + const padding = try input.take(@intCast((4 - (input_counter % 4)) % 4)); for (padding) |byte| { if (byte != 0) return error.CorruptInput; }