Merge pull request #18261 from ianic/tar_tests

std:tar Copy Go tar test suite and make them pass
2025-12-06 14:23:09 +00:00 · 2024-01-13 18:44:14 -08:00 · 2024-01-13 18:44:14 -08:00 · d55d1e32b6
commit d55d1e32b6
parent 7916cf6f83 3f809cbe7d
31 changed files with 917 additions and 192 deletions
--- a/build.zig
+++ b/build.zig
@ -165,6 +165,8 @@ pub fn build(b: *std.Build) !void {
                ".xz",
                // exclude files from lib/std/tz/
                ".tzif",
+                // exclude files from lib/std/tar/testdata
+                ".tar",
                // others
                "README.md",
            },
--- a/lib/std/tar.zig
+++ b/lib/std/tar.zig
@ -1,3 +1,23 @@
+/// A tar archive is a single ordinary file which can contain many files (or
+/// directories, symlinks, ...). It is built from a series of blocks, each 512
+/// bytes in size. The first block of each entry is a header which defines the
+/// type, name, size, permissions and other attributes. The header is followed
+/// by a series of blocks of file content, if the entry has any content. Content
+/// is padded to the block size, so the next header always starts at a block
+/// boundary.
+///
+/// This simple format is extended by GNU and POSIX pax extensions to support
+/// file names longer than 256 bytes and additional attributes.
+///
+/// This is not a comprehensive tar parser. Here we only support the file types
+/// needed by the Zig package manager: normal file, directory, symbolic link,
+/// and a subset of attributes: name, size, permissions.
+///
+/// GNU tar reference: https://www.gnu.org/software/tar/manual/html_node/Standard.html
+/// pax reference: https://pubs.opengroup.org/onlinepubs/9699919799/utilities/pax.html#tag_20_92_13
+///
+const std = @import("std.zig");
+const assert = std.debug.assert;
+
 pub const Options = struct {
    /// Number of directory levels to skip when extracting files.
    strip_components: u32 = 0,
@ -37,7 +57,7 @@ pub const Options = struct {
            },
            unsupported_file_type: struct {
                file_name: []const u8,
-                file_type: Header.FileType,
+                file_type: Header.Kind,
            },
        };

@ -63,9 +83,13 @@ pub const Options = struct {
 };

 pub const Header = struct {
-    bytes: *const [512]u8,
+    const SIZE = 512;
+    const MAX_NAME_SIZE = 100 + 1 + 155; // name(100) + separator(1) + prefix(155)
+    const LINK_NAME_SIZE = 100;

-    pub const FileType = enum(u8) {
+    bytes: *const [SIZE]u8,
+
+    pub const Kind = enum(u8) {
        normal_alias = 0,
        normal = '0',
        hard_link = '1',
@ -77,102 +101,423 @@ pub const Header = struct {
        contiguous = '7',
        global_extended_header = 'g',
        extended_header = 'x',
+        // Types 'L' and 'K' are used by the GNU format for a meta file
+        // used to store the path or link name for the next file.
+        gnu_long_name = 'L',
+        gnu_long_link = 'K',
+        gnu_sparse = 'S',
+        solaris_extended_header = 'X',
        _,
    };

-    pub fn fileSize(header: Header) !u64 {
-        const raw = header.bytes[124..][0..12];
-        const ltrimmed = std.mem.trimLeft(u8, raw, "0 ");
-        const rtrimmed = std.mem.trimRight(u8, ltrimmed, " \x00");
-        if (rtrimmed.len == 0) return 0;
-        return std.fmt.parseInt(u64, rtrimmed, 8);
-    }
-
-    pub fn is_ustar(header: Header) bool {
-        return std.mem.eql(u8, header.bytes[257..][0..6], "ustar\x00");
-    }
-
    /// Includes prefix concatenated, if any.
-    /// Return value may point into Header buffer, or might point into the
-    /// argument buffer.
    /// TODO: check against "../" and other nefarious things
-    pub fn fullFileName(header: Header, buffer: *[std.fs.MAX_PATH_BYTES]u8) ![]const u8 {
+    pub fn fullName(header: Header, buffer: *[MAX_NAME_SIZE]u8) ![]const u8 {
        const n = name(header);
-        if (!is_ustar(header))
-            return n;
        const p = prefix(header);
-        if (p.len == 0)
-            return n;
+        if (!is_ustar(header) or p.len == 0) {
+            @memcpy(buffer[0..n.len], n);
+            return buffer[0..n.len];
+        }
        @memcpy(buffer[0..p.len], p);
        buffer[p.len] = '/';
        @memcpy(buffer[p.len + 1 ..][0..n.len], n);
        return buffer[0 .. p.len + 1 + n.len];
    }

-    pub fn name(header: Header) []const u8 {
-        return str(header, 0, 0 + 100);
+    pub fn linkName(header: Header, buffer: *[LINK_NAME_SIZE]u8) []const u8 {
+        const link_name = header.str(157, 100);
+        if (link_name.len == 0) {
+            return buffer[0..0];
+        }
+        const buf = buffer[0..link_name.len];
+        @memcpy(buf, link_name);
+        return buf;
    }

-    pub fn linkName(header: Header) []const u8 {
-        return str(header, 157, 157 + 100);
+    pub fn name(header: Header) []const u8 {
+        return header.str(0, 100);
+    }
+
+    /// Decodes the 8-byte mode field located at offset 100 of the header.
+    /// The field content comes straight from the archive, so a decoded value
+    /// that does not fit in `u32` is reported as
+    /// `error.TarNumericValueTooBig` rather than fed to an unchecked cast.
+    pub fn mode(header: Header) !u32 {
+        const value = try header.numeric(100, 8);
+        return std.math.cast(u32, value) orelse error.TarNumericValueTooBig;
+    }
+
+    pub fn size(header: Header) !u64 {
+        return header.numeric(124, 12);
+    }
+
+    pub fn chksum(header: Header) !u64 {
+        return header.octal(148, 8);
+    }
+
+    pub fn is_ustar(header: Header) bool {
+        const magic = header.bytes[257..][0..6];
+        return std.mem.eql(u8, magic[0..5], "ustar") and (magic[5] == 0 or magic[5] == ' ');
    }

    pub fn prefix(header: Header) []const u8 {
-        return str(header, 345, 345 + 155);
+        return header.str(345, 155);
    }

-    pub fn fileType(header: Header) FileType {
-        const result: FileType = @enumFromInt(header.bytes[156]);
+    pub fn kind(header: Header) Kind {
+        const result: Kind = @enumFromInt(header.bytes[156]);
        if (result == .normal_alias) return .normal;
        return result;
    }

-    fn str(header: Header, start: usize, end: usize) []const u8 {
-        var i: usize = start;
-        while (i < end) : (i += 1) {
-            if (header.bytes[i] == 0) break;
+    fn str(header: Header, start: usize, len: usize) []const u8 {
+        return nullStr(header.bytes[start .. start + len]);
    }
-        return header.bytes[start..i];
+
+    /// Decodes the numeric header field of `len` bytes starting at `start`.
+    ///
+    /// Two encodings are recognized:
+    ///  * base-256 (leading byte 0x80): the remaining bytes are a big-endian
+    ///    unsigned integer,
+    ///  * anything else is handed to `octal`.
+    ///
+    /// Errors:
+    ///  * `error.TarNumericValueNegative` when the leading byte is 0xff,
+    ///  * `error.TarNumericValueTooBig` when a base-256 value needs more
+    ///    than 64 bits,
+    ///  * whatever `octal` returns for the ASCII form.
+    fn numeric(header: Header, start: usize, len: usize) !u64 {
+        const raw = header.bytes[start..][0..len];
+        //  If the leading byte is 0xff (255), all the bytes of the field
+        //  (including the leading byte) are concatenated in big-endian order,
+        //  with the result being a negative number expressed in two's
+        //  complement form.
+        if (raw[0] == 0xff) return error.TarNumericValueNegative;
+        // If the leading byte is 0x80 (128), the non-leading bytes of the
+        // field are concatenated in big-endian order.
+        if (raw[0] == 0x80) {
+            // Fold the bytes one at a time instead of slicing a fixed
+            // `raw[4..12]` window: that works for any field width (the mode
+            // field is only 8 bytes long) and avoids summing `u8` values,
+            // which could overflow.
+            const max_before_shift = std.math.maxInt(u64) >> 8;
+            var value: u64 = 0;
+            for (raw[1..]) |byte| {
+                // A non-zero top byte would be shifted out: value needs > 64 bits.
+                if (value > max_before_shift) return error.TarNumericValueTooBig;
+                value = (value << 8) | byte;
+            }
+            return value;
+        }
+        return try header.octal(start, len);
+    }
+
+    fn octal(header: Header, start: usize, len: usize) !u64 {
+        const raw = header.bytes[start..][0..len];
+        // Zero-filled octal number in ASCII. Each numeric field of width w
+        // contains w minus 1 digits, and a null
+        const ltrimmed = std.mem.trimLeft(u8, raw, "0 ");
+        const rtrimmed = std.mem.trimRight(u8, ltrimmed, " \x00");
+        if (rtrimmed.len == 0) return 0;
+        return std.fmt.parseInt(u64, rtrimmed, 8) catch return error.TarHeader;
+    }
+
+    const Chksums = struct {
+        unsigned: u64,
+        signed: i64,
+    };
+
+    // Sum of all bytes in the header block. The chksum field is treated as if
+    // it were filled with spaces (ASCII 32).
+    fn computeChksum(header: Header) Chksums {
+        var cs: Chksums = .{ .signed = 0, .unsigned = 0 };
+        for (header.bytes, 0..) |v, i| {
+            const b = if (148 <= i and i < 156) 32 else v; // Treating chksum bytes as spaces.
+            cs.unsigned += b;
+            cs.signed += @as(i8, @bitCast(b));
+        }
+        return cs;
+    }
+
+    // Checks calculated chksum with value of chksum field.
+    // Returns error or valid chksum value.
+    // Zero value indicates empty block.
+    pub fn checkChksum(header: Header) !u64 {
+        const field = try header.chksum();
+        const cs = header.computeChksum();
+        if (field == 0 and cs.unsigned == 256) return 0;
+        if (field != cs.unsigned and field != cs.signed) return error.TarHeaderChksum;
+        return field;
    }
 };

-const Buffer = struct {
-    buffer: [512 * 8]u8 = undefined,
-    start: usize = 0,
-    end: usize = 0,
-
-    pub fn readChunk(b: *Buffer, reader: anytype, count: usize) ![]const u8 {
-        b.ensureCapacity(1024);
-
-        const ask = @min(b.buffer.len - b.end, count -| (b.end - b.start));
-        b.end += try reader.readAtLeast(b.buffer[b.end..], ask);
-
-        return b.buffer[b.start..b.end];
+// Breaks string on first null character.
+fn nullStr(str: []const u8) []const u8 {
+    for (str, 0..) |c, i| {
+        if (c == 0) return str[0..i];
+    }
+    return str;
 }

-    pub fn advance(b: *Buffer, count: usize) void {
-        b.start += count;
-        assert(b.start <= b.end);
+/// Iterates over files in tar archive.
+/// `next` returns each file in `reader` tar archive.
+pub fn iterator(reader: anytype, diagnostics: ?*Options.Diagnostics) Iterator(@TypeOf(reader)) {
+    return .{
+        .reader = reader,
+        .diagnostics = diagnostics,
+    };
 }

-    pub fn skip(b: *Buffer, reader: anytype, count: usize) !void {
-        if (b.start + count > b.end) {
-            try reader.skipBytes(b.start + count - b.end, .{});
-            b.start = b.end;
-        } else {
-            b.advance(count);
+fn Iterator(comptime ReaderType: type) type {
+    return struct {
+        reader: ReaderType,
+        diagnostics: ?*Options.Diagnostics,
+
+        // buffers for header and file attributes
+        header_buffer: [Header.SIZE]u8 = undefined,
+        file_name_buffer: [std.fs.MAX_PATH_BYTES]u8 = undefined,
+        link_name_buffer: [std.fs.MAX_PATH_BYTES]u8 = undefined,
+
+        // bytes of padding to the end of the block
+        padding: usize = 0,
+        // current tar file
+        file: File = undefined,
+
+        pub const File = struct {
+            name: []const u8, // name of file, symlink or directory
+            link_name: []const u8, // target name of symlink
+            size: u64, // size of the file in bytes
+            mode: u32,
+            kind: Header.Kind,
+
+            reader: ReaderType,
+
+            // Writes file content to writer.
+            pub fn write(self: File, writer: anytype) !void {
+                var buffer: [4096]u8 = undefined;
+
+                var n: u64 = 0;
+                while (n < self.size) {
+                    const buf = buffer[0..@min(buffer.len, self.size - n)];
+                    try self.reader.readNoEof(buf);
+                    try writer.writeAll(buf);
+                    n += buf.len;
                }
            }

-    inline fn ensureCapacity(b: *Buffer, count: usize) void {
-        if (b.buffer.len - b.start < count) {
-            const dest_end = b.end - b.start;
-            @memcpy(b.buffer[0..dest_end], b.buffer[b.start..b.end]);
-            b.end = dest_end;
-            b.start = 0;
+            // Skips file content. Advances reader.
+            pub fn skip(self: File) !void {
+                try self.reader.skipBytes(self.size, .{});
+            }
+        };
+
+        const Self = @This();
+
+        fn readHeader(self: *Self) !?Header {
+            if (self.padding > 0) {
+                try self.reader.skipBytes(self.padding, .{});
+            }
+            const n = try self.reader.readAll(&self.header_buffer);
+            if (n == 0) return null;
+            if (n < Header.SIZE) return error.UnexpectedEndOfStream;
+            const header = Header{ .bytes = self.header_buffer[0..Header.SIZE] };
+            if (try header.checkChksum() == 0) return null;
+            return header;
+        }
+
+        // Reads exactly `size` bytes from the reader into the front of
+        // `buffer` and returns them cut at the first null byte.
+        // `size` originates from an archive header, so a value larger than
+        // `buffer` is reported as error.TarInsufficientBuffer instead of
+        // being asserted (an assert is not a check in release builds).
+        inline fn readString(self: *Self, size: usize, buffer: []u8) ![]const u8 {
+            if (size > buffer.len) return error.TarInsufficientBuffer;
+            const buf = buffer[0..size];
+            try self.reader.readNoEof(buf);
+            return nullStr(buf);
+        }
+
+        inline fn initFile(self: *Self) void {
+            self.file = File{
+                .name = self.file_name_buffer[0..0],
+                .link_name = self.link_name_buffer[0..0],
+                .size = 0,
+                .kind = .normal,
+                .mode = 0,
+                .reader = self.reader,
+            };
+        }
+
+        // Number of padding bytes in the last file block.
+        inline fn blockPadding(size: u64) usize {
+            const block_rounded = std.mem.alignForward(u64, size, Header.SIZE); // size rounded to the block boundary
+            return @intCast(block_rounded - size);
+        }
+
+        /// Iterates through the tar archive as if it is a series of files.
+        /// Internally, the tar format often uses entries (header with optional
+        /// content) to add meta data that describes the next file. These
+        /// entries should not normally be visible to the outside. As such, this
+        /// loop iterates through one or more entries until it collects all
+        /// file attributes.
+        pub fn next(self: *Self) !?File {
+            self.initFile();
+
+            while (try self.readHeader()) |header| {
+                const kind = header.kind();
+                const size: u64 = try header.size();
+                self.padding = blockPadding(size);
+
+                switch (kind) {
+                    // File types to return upstream
+                    .directory, .normal, .symbolic_link => {
+                        self.file.kind = kind;
+                        self.file.mode = try header.mode();
+
+                        // set file attributes if not already set by prefix/extended headers
+                        if (self.file.size == 0) {
+                            self.file.size = size;
+                        }
+                        if (self.file.link_name.len == 0) {
+                            self.file.link_name = header.linkName(self.link_name_buffer[0..Header.LINK_NAME_SIZE]);
+                        }
+                        if (self.file.name.len == 0) {
+                            self.file.name = try header.fullName(self.file_name_buffer[0..Header.MAX_NAME_SIZE]);
+                        }
+
+                        self.padding = blockPadding(self.file.size);
+                        return self.file;
+                    },
+                    // Prefix header types
+                    .gnu_long_name => {
+                        self.file.name = try self.readString(@intCast(size), &self.file_name_buffer);
+                    },
+                    .gnu_long_link => {
+                        self.file.link_name = try self.readString(@intCast(size), &self.link_name_buffer);
+                    },
+                    .extended_header => {
+                        // Use just attributes from last extended header.
+                        self.initFile();
+
+                        var rdr = paxIterator(self.reader, @intCast(size));
+                        while (try rdr.next()) |attr| {
+                            switch (attr.kind) {
+                                .path => {
+                                    self.file.name = try attr.value(&self.file_name_buffer);
+                                },
+                                .linkpath => {
+                                    self.file.link_name = try attr.value(&self.link_name_buffer);
+                                },
+                                .size => {
+                                    var buf: [64]u8 = undefined;
+                                    self.file.size = try std.fmt.parseInt(u64, try attr.value(&buf), 10);
+                                },
+                            }
+                        }
+                    },
+                    // Ignored header type
+                    .global_extended_header => {
+                        self.reader.skipBytes(size, .{}) catch return error.TarHeadersTooBig;
+                    },
+                    // All other are unsupported header types
+                    else => {
+                        const d = self.diagnostics orelse return error.TarUnsupportedHeader;
+                        try d.errors.append(d.allocator, .{ .unsupported_file_type = .{
+                            .file_name = try d.allocator.dupe(u8, header.name()),
+                            .file_type = kind,
+                        } });
+                        if (kind == .gnu_sparse) {
+                            try self.skipGnuSparseExtendedHeaders(header);
+                        }
+                        self.reader.skipBytes(size, .{}) catch return error.TarHeadersTooBig;
+                    },
+                }
+            }
+            return null;
+        }
+
+        fn skipGnuSparseExtendedHeaders(self: *Self, header: Header) !void {
+            var is_extended = header.bytes[482] > 0;
+            while (is_extended) {
+                var buf: [Header.SIZE]u8 = undefined;
+                const n = try self.reader.readAll(&buf);
+                if (n < Header.SIZE) return error.UnexpectedEndOfStream;
+                is_extended = buf[504] > 0;
            }
        }
    };
+}
+
+/// Pax attributes iterator.
+/// Size is length of pax extended header in reader.
+fn paxIterator(reader: anytype, size: usize) PaxIterator(@TypeOf(reader)) {
+    return PaxIterator(@TypeOf(reader)){
+        .reader = reader,
+        .size = size,
+    };
+}
+
+const PaxAttributeKind = enum {
+    path,
+    linkpath,
+    size,
+};
+
+fn PaxIterator(comptime ReaderType: type) type {
+    return struct {
+        size: usize, // cumulative size of all pax attributes
+        reader: ReaderType,
+        // scratch buffer used for reading attribute length and keyword
+        scratch: [128]u8 = undefined,
+
+        const Self = @This();
+
+        const Attribute = struct {
+            kind: PaxAttributeKind,
+            len: usize, // length of the attribute value
+            reader: ReaderType, // reader positioned at value start
+
+            // Copies pax attribute value into destination buffer and then
+            // consumes the newline that terminates the record.
+            // Errors:
+            //  - error.TarInsufficientBuffer: dst is shorter than Attribute.len
+            //    (the length comes from the archive, so it is checked, not asserted),
+            //  - error.UnexpectedEndOfStream: reader ended before len bytes were read,
+            //  - error.PaxInvalidAttributeEnd: value is not followed by '\n',
+            //  - error.PaxNullInValue: value contains a null byte.
+            pub fn value(self: Attribute, dst: []u8) ![]const u8 {
+                if (self.len > dst.len) return error.TarInsufficientBuffer;
+                const buf = dst[0..self.len];
+                const n = try self.reader.readAll(buf);
+                if (n < self.len) return error.UnexpectedEndOfStream;
+                try validateAttributeEnding(self.reader);
+                if (hasNull(buf)) return error.PaxNullInValue;
+                return buf;
+            }
+        };
+
+        // Iterates over pax attributes. Returns only known attributes.
+        // Caller has to call value in Attribute, to advance reader across value.
+        pub fn next(self: *Self) !?Attribute {
+            // Pax extended header consists of one or more attributes, each constructed as follows:
+            // "%d %s=%s\n", <length>, <keyword>, <value>
+            while (self.size > 0) {
+                const length_buf = try self.readUntil(' ');
+                const length = try std.fmt.parseInt(usize, length_buf, 10); // record length in bytes
+
+                const keyword = try self.readUntil('=');
+                if (hasNull(keyword)) return error.PaxNullInKeyword;
+
+                // calculate value_len
+                const value_start = length_buf.len + keyword.len + 2; // 2 separators
+                if (length < value_start + 1 or self.size < length) return error.UnexpectedEndOfStream;
+                const value_len = length - value_start - 1; // \n separator at end
+                self.size -= length;
+
+                const kind: PaxAttributeKind = if (eql(keyword, "path"))
+                    .path
+                else if (eql(keyword, "linkpath"))
+                    .linkpath
+                else if (eql(keyword, "size"))
+                    .size
+                else {
+                    try self.reader.skipBytes(value_len, .{});
+                    try validateAttributeEnding(self.reader);
+                    continue;
+                };
+                return Attribute{
+                    .kind = kind,
+                    .len = value_len,
+                    .reader = self.reader,
+                };
+            }
+
+            return null;
+        }
+
+        inline fn readUntil(self: *Self, delimiter: u8) ![]const u8 {
+            var fbs = std.io.fixedBufferStream(&self.scratch);
+            try self.reader.streamUntilDelimiter(fbs.writer(), delimiter, null);
+            return fbs.getWritten();
+        }
+
+        inline fn eql(a: []const u8, b: []const u8) bool {
+            return std.mem.eql(u8, a, b);
+        }
+
+        inline fn hasNull(str: []const u8) bool {
+            return (std.mem.indexOfScalar(u8, str, 0)) != null;
+        }
+
+        // Checks that each record ends with new line.
+        inline fn validateAttributeEnding(reader: ReaderType) !void {
+            if (try reader.readByte() != '\n') return error.PaxInvalidAttributeEnd;
+        }
+    };
+}

 pub fn pipeToFileSystem(dir: std.fs.Dir, reader: anytype, options: Options) !void {
    switch (options.mode_mode) {
@ -186,39 +531,21 @@ pub fn pipeToFileSystem(dir: std.fs.Dir, reader: anytype, options: Options) !voi
            @panic("TODO: unimplemented: tar ModeMode.executable_bit_only");
        },
    }
-    var file_name_buffer: [std.fs.MAX_PATH_BYTES]u8 = undefined;
-    var file_name_override_len: usize = 0;
-    var buffer: Buffer = .{};
-    header: while (true) {
-        const chunk = try buffer.readChunk(reader, 1024);
-        switch (chunk.len) {
-            0 => return,
-            1...511 => return error.UnexpectedEndOfStream,
-            else => {},
-        }
-        buffer.advance(512);

-        const header: Header = .{ .bytes = chunk[0..512] };
-        const file_size = try header.fileSize();
-        const rounded_file_size = std.mem.alignForward(u64, file_size, 512);
-        const pad_len: usize = @intCast(rounded_file_size - file_size);
-        const unstripped_file_name = if (file_name_override_len > 0)
-            file_name_buffer[0..file_name_override_len]
-        else
-            try header.fullFileName(&file_name_buffer);
-        file_name_override_len = 0;
-        switch (header.fileType()) {
+    var iter = iterator(reader, options.diagnostics);
+    while (try iter.next()) |file| {
+        switch (file.kind) {
            .directory => {
-                const file_name = try stripComponents(unstripped_file_name, options.strip_components);
+                const file_name = try stripComponents(file.name, options.strip_components);
                if (file_name.len != 0 and !options.exclude_empty_directories) {
                    try dir.makePath(file_name);
                }
            },
            .normal => {
-                if (file_size == 0 and unstripped_file_name.len == 0) return;
-                const file_name = try stripComponents(unstripped_file_name, options.strip_components);
+                if (file.size == 0 and file.name.len == 0) return;
+                const file_name = try stripComponents(file.name, options.strip_components);

-                const file = dir.createFile(file_name, .{}) catch |err| switch (err) {
+                const fs_file = dir.createFile(file_name, .{}) catch |err| switch (err) {
                    error.FileNotFound => again: {
                        const code = code: {
                            if (std.fs.path.dirname(file_name)) |dir_name| {
@ -238,70 +565,19 @@ pub fn pipeToFileSystem(dir: std.fs.Dir, reader: anytype, options: Options) !voi
                    },
                    else => |e| return e,
                };
-                defer if (file) |f| f.close();
+                defer if (fs_file) |f| f.close();

-                var file_off: usize = 0;
-                while (true) {
-                    const temp = try buffer.readChunk(reader, @intCast(rounded_file_size + 512 - file_off));
-                    if (temp.len == 0) return error.UnexpectedEndOfStream;
-                    const slice = temp[0..@intCast(@min(file_size - file_off, temp.len))];
-                    if (file) |f| try f.writeAll(slice);
-
-                    file_off += slice.len;
-                    buffer.advance(slice.len);
-                    if (file_off >= file_size) {
-                        buffer.advance(pad_len);
-                        continue :header;
-                    }
+                if (fs_file) |f| {
+                    try file.write(f);
+                } else {
+                    try file.skip();
                }
            },
-            .extended_header => {
-                if (file_size == 0) {
-                    buffer.advance(@intCast(rounded_file_size));
-                    continue;
-                }
-
-                const chunk_size: usize = @intCast(rounded_file_size + 512);
-                var data_off: usize = 0;
-                file_name_override_len = while (data_off < file_size) {
-                    const slice = try buffer.readChunk(reader, chunk_size - data_off);
-                    if (slice.len == 0) return error.UnexpectedEndOfStream;
-                    const remaining_size: usize = @intCast(file_size - data_off);
-                    const attr_info = try parsePaxAttribute(slice[0..@min(remaining_size, slice.len)], remaining_size);
-
-                    if (std.mem.eql(u8, attr_info.key, "path")) {
-                        if (attr_info.value_len > file_name_buffer.len) return error.NameTooLong;
-                        buffer.advance(attr_info.value_off);
-                        data_off += attr_info.value_off;
-                        break attr_info.value_len;
-                    }
-
-                    try buffer.skip(reader, attr_info.size);
-                    data_off += attr_info.size;
-                } else 0;
-
-                var i: usize = 0;
-                while (i < file_name_override_len) {
-                    const slice = try buffer.readChunk(reader, chunk_size - data_off - i);
-                    if (slice.len == 0) return error.UnexpectedEndOfStream;
-                    const copy_size: usize = @intCast(@min(file_name_override_len - i, slice.len));
-                    @memcpy(file_name_buffer[i .. i + copy_size], slice[0..copy_size]);
-                    buffer.advance(copy_size);
-                    i += copy_size;
-                }
-
-                try buffer.skip(reader, @intCast(rounded_file_size - data_off - file_name_override_len));
-                continue :header;
-            },
-            .global_extended_header => {
-                buffer.skip(reader, @intCast(rounded_file_size)) catch return error.TarHeadersTooBig;
-            },
-            .hard_link => return error.TarUnsupportedFileType,
            .symbolic_link => {
                // The file system path of the symbolic link.
-                const file_name = try stripComponents(unstripped_file_name, options.strip_components);
+                const file_name = try stripComponents(file.name, options.strip_components);
                // The data inside the symbolic link.
-                const link_name = header.linkName();
+                const link_name = file.link_name;

                dir.symLink(link_name, file_name, .{}) catch |err| again: {
                    const code = code: {
@ -323,13 +599,7 @@ pub fn pipeToFileSystem(dir: std.fs.Dir, reader: anytype, options: Options) !voi
                    } });
                };
            },
-            else => |file_type| {
-                const d = options.diagnostics orelse return error.TarUnsupportedFileType;
-                try d.errors.append(d.allocator, .{ .unsupported_file_type = .{
-                    .file_name = try d.allocator.dupe(u8, unstripped_file_name),
-                    .file_type = file_type,
-                } });
-            },
+            else => unreachable,
        }
    }
 }
@ -347,51 +617,137 @@ fn stripComponents(path: []const u8, count: u32) ![]const u8 {
    return path[i..];
 }

-test stripComponents {
+test "tar stripComponents" {
    const expectEqualStrings = std.testing.expectEqualStrings;
    try expectEqualStrings("a/b/c", try stripComponents("a/b/c", 0));
    try expectEqualStrings("b/c", try stripComponents("a/b/c", 1));
    try expectEqualStrings("c", try stripComponents("a/b/c", 2));
 }

-const PaxAttributeInfo = struct {
-    size: usize,
-    key: []const u8,
-    value_off: usize,
-    value_len: usize,
+test "tar PaxIterator" {
+    const Attr = struct {
+        kind: PaxAttributeKind,
+        value: []const u8 = undefined,
+        err: ?anyerror = null,
    };
+    const cases = [_]struct {
+        data: []const u8,
+        attrs: []const Attr,
+        err: ?anyerror = null,
+    }{
+        .{ // valid but unknown keys
+            .data =
+            \\30 mtime=1350244992.023960108
+            \\6 k=1
+            \\13 key1=val1
+            \\10 a=name
+            \\9 a=name
+            \\
+            ,
+            .attrs = &[_]Attr{},
+        },
+        .{ // mix of known and unknown keys
+            .data =
+            \\6 k=1
+            \\13 path=name
+            \\17 linkpath=link
+            \\13 key1=val1
+            \\12 size=123
+            \\13 key2=val2
+            \\
+            ,
+            .attrs = &[_]Attr{
+                .{ .kind = .path, .value = "name" },
+                .{ .kind = .linkpath, .value = "link" },
+                .{ .kind = .size, .value = "123" },
+            },
+        },
+        .{ // too short size of the second key-value pair
+            .data =
+            \\13 path=name
+            \\10 linkpath=value
+            \\
+            ,
+            .attrs = &[_]Attr{
+                .{ .kind = .path, .value = "name" },
+            },
+            .err = error.UnexpectedEndOfStream,
+        },
+        .{ // too long size of the second key-value pair
+            .data =
+            \\13 path=name
+            \\6 k=1
+            \\19 linkpath=value
+            \\
+            ,
+            .attrs = &[_]Attr{
+                .{ .kind = .path, .value = "name" },
+            },
+            .err = error.UnexpectedEndOfStream,
+        },

-fn parsePaxAttribute(data: []const u8, max_size: usize) !PaxAttributeInfo {
-    const pos_space = std.mem.indexOfScalar(u8, data, ' ') orelse return error.InvalidPaxAttribute;
-    const pos_equals = std.mem.indexOfScalarPos(u8, data, pos_space, '=') orelse return error.InvalidPaxAttribute;
-    const kv_size = try std.fmt.parseInt(usize, data[0..pos_space], 10);
-    if (kv_size > max_size) {
-        return error.InvalidPaxAttribute;
-    }
-    return .{
-        .size = kv_size,
-        .key = data[pos_space + 1 .. pos_equals],
-        .value_off = pos_equals + 1,
-        .value_len = kv_size - pos_equals - 2,
+        .{ // too long size of the second key-value pair
+            .data =
+            \\13 path=name
+            \\19 linkpath=value
+            \\6 k=1
+            \\
+            ,
+            .attrs = &[_]Attr{
+                .{ .kind = .path, .value = "name" },
+                .{ .kind = .linkpath, .err = error.PaxInvalidAttributeEnd },
+            },
+        },
+        .{ // null in keyword is not valid
+            .data = "13 path=name\n" ++ "7 k\x00b=1\n",
+            .attrs = &[_]Attr{
+                .{ .kind = .path, .value = "name" },
+            },
+            .err = error.PaxNullInKeyword,
+        },
+        .{ // null in value is not valid
+            .data = "23 path=name\x00with null\n",
+            .attrs = &[_]Attr{
+                .{ .kind = .path, .err = error.PaxNullInValue },
+            },
+        },
+        .{ // 1000 characters path
+            .data = "1011 path=" ++ "0123456789" ** 100 ++ "\n",
+            .attrs = &[_]Attr{
+                .{ .kind = .path, .value = "0123456789" ** 100 },
+            },
+        },
    };
+    var buffer: [1024]u8 = undefined;
+
+    // Run every case; an expected error ends the current case only.
+    outer: for (cases) |case| {
+        var stream = std.io.fixedBufferStream(case.data);
+        var iter = paxIterator(stream.reader(), case.data.len);
+
+        var i: usize = 0;
+        while (iter.next() catch |err| {
+            // Error from the iterator itself: must match the case-level expectation.
+            if (case.err) |e| {
+                try std.testing.expectEqual(e, err);
+                continue :outer;
+            }
+            return err;
+        }) |attr| : (i += 1) {
+            // Fail the test cleanly if more attributes are produced than expected.
+            try std.testing.expect(i < case.attrs.len);
+            const exp = case.attrs[i];
+            try std.testing.expectEqual(exp.kind, attr.kind);
+            const value = attr.value(&buffer) catch |err| {
+                // Error while reading the value: must match the attribute-level
+                // expectation. Move on to the next case; breaking out of
+                // `outer` here would silently skip all remaining cases.
+                if (exp.err) |e| {
+                    try std.testing.expectEqual(e, err);
+                    continue :outer;
+                }
+                return err;
+            };
+            try std.testing.expectEqualStrings(exp.value, value);
+        }
+        try std.testing.expectEqual(case.attrs.len, i);
+        try std.testing.expect(case.err == null);
+    }
 }

-test parsePaxAttribute {
-    const expectEqual = std.testing.expectEqual;
-    const expectEqualStrings = std.testing.expectEqualStrings;
-    const expectError = std.testing.expectError;
-    const prefix = "1011 path=";
-    const file_name = "0123456789" ** 100;
-    const header = prefix ++ file_name ++ "\n";
-    const attr_info = try parsePaxAttribute(header, 1011);
-    try expectEqual(@as(usize, 1011), attr_info.size);
-    try expectEqualStrings("path", attr_info.key);
-    try expectEqual(prefix.len, attr_info.value_off);
-    try expectEqual(file_name.len, attr_info.value_len);
-    try expectEqual(attr_info, try parsePaxAttribute(header, 1012));
-    try expectError(error.InvalidPaxAttribute, parsePaxAttribute(header, 1010));
-    try expectError(error.InvalidPaxAttribute, parsePaxAttribute("", 0));
+test {
+    _ = @import("tar/test.zig");
 }
-
-const std = @import("std.zig");
-const assert = std.debug.assert;
--- a/lib/std/tar/test.zig
+++ b/lib/std/tar/test.zig
@ -0,0 +1,367 @@
+const std = @import("../std.zig");
+const tar = std.tar;
+const testing = std.testing;
+
+// Table-driven test over the tar archives in testdata/ (taken from the Go tar
+// test suite). Each case either lists the entries the iterator must produce,
+// in order, or names the error that iterating the archive must fail with.
+test "tar run Go test cases" {
+    const Case = struct {
+        const File = struct {
+            name: []const u8,
+            size: u64 = 0,
+            mode: u32 = 0,
+            link_name: []const u8 = &[0]u8{},
+            kind: tar.Header.Kind = .normal,
+            truncated: bool = false, // when there is no file body, just header, useful for huge files
+        };
+
+        data: []const u8, // testdata file content
+        files: []const File = &[_]@This().File{}, // expected files to be found in archive
+        chksums: []const []const u8 = &[_][]const u8{}, // chksums of each file content
+        err: ?anyerror = null, // parsing should fail with this error
+    };
+
+    const cases = [_]Case{
+        .{
+            .data = @embedFile("testdata/gnu.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "small.txt",
+                    .size = 5,
+                    .mode = 0o640,
+                },
+                .{
+                    .name = "small2.txt",
+                    .size = 11,
+                    .mode = 0o640,
+                },
+            },
+            .chksums = &[_][]const u8{
+                "e38b27eaccb4391bdec553a7f3ae6b2f",
+                "c65bd2e50a56a2138bf1716f2fd56fe9",
+            },
+        },
+        .{
+            .data = @embedFile("testdata/sparse-formats.tar"),
+            .err = error.TarUnsupportedHeader,
+        },
+        .{
+            .data = @embedFile("testdata/star.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "small.txt",
+                    .size = 5,
+                    .mode = 0o640,
+                },
+                .{
+                    .name = "small2.txt",
+                    .size = 11,
+                    .mode = 0o640,
+                },
+            },
+            .chksums = &[_][]const u8{
+                "e38b27eaccb4391bdec553a7f3ae6b2f",
+                "c65bd2e50a56a2138bf1716f2fd56fe9",
+            },
+        },
+        .{
+            .data = @embedFile("testdata/v7.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "small.txt",
+                    .size = 5,
+                    .mode = 0o444,
+                },
+                .{
+                    .name = "small2.txt",
+                    .size = 11,
+                    .mode = 0o444,
+                },
+            },
+            .chksums = &[_][]const u8{
+                "e38b27eaccb4391bdec553a7f3ae6b2f",
+                "c65bd2e50a56a2138bf1716f2fd56fe9",
+            },
+        },
+        .{
+            .data = @embedFile("testdata/pax.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "a/123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100",
+                    .size = 7,
+                    .mode = 0o664,
+                },
+                .{
+                    .name = "a/b",
+                    .size = 0,
+                    .kind = .symbolic_link,
+                    .mode = 0o777,
+                    .link_name = "123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100",
+                },
+            },
+            .chksums = &[_][]const u8{
+                "3c382e8f5b6631aa2db52643912ffd4a",
+            },
+        },
+        .{
+            // pax attribute doesn't end with \n
+            .data = @embedFile("testdata/pax-bad-hdr-file.tar"),
+            .err = error.PaxInvalidAttributeEnd,
+        },
+        .{
+            // size is in pax attribute
+            .data = @embedFile("testdata/pax-pos-size-file.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "foo",
+                    .size = 999,
+                    .kind = .normal,
+                    .mode = 0o640,
+                },
+            },
+            .chksums = &[_][]const u8{
+                "0afb597b283fe61b5d4879669a350556",
+            },
+        },
+        .{
+            // has pax records which we are not interested in
+            .data = @embedFile("testdata/pax-records.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "file",
+                },
+            },
+        },
+        .{
+            // has global records which we are ignoring
+            .data = @embedFile("testdata/pax-global-records.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "file1",
+                },
+                .{
+                    .name = "file2",
+                },
+                .{
+                    .name = "file3",
+                },
+                .{
+                    .name = "file4",
+                },
+            },
+        },
+        .{
+            .data = @embedFile("testdata/nil-uid.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "P1050238.JPG.log",
+                    .size = 14,
+                    .kind = .normal,
+                    .mode = 0o664,
+                },
+            },
+            .chksums = &[_][]const u8{
+                "08d504674115e77a67244beac19668f5",
+            },
+        },
+        .{
+            // has xattrs and pax records which we are ignoring
+            .data = @embedFile("testdata/xattrs.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "small.txt",
+                    .size = 5,
+                    .kind = .normal,
+                    .mode = 0o644,
+                },
+                .{
+                    .name = "small2.txt",
+                    .size = 11,
+                    .kind = .normal,
+                    .mode = 0o644,
+                },
+            },
+            .chksums = &[_][]const u8{
+                "e38b27eaccb4391bdec553a7f3ae6b2f",
+                "c65bd2e50a56a2138bf1716f2fd56fe9",
+            },
+        },
+        .{
+            .data = @embedFile("testdata/gnu-multi-hdrs.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "GNU2/GNU2/long-path-name",
+                    .link_name = "GNU4/GNU4/long-linkpath-name",
+                    .kind = .symbolic_link,
+                },
+            },
+        },
+        .{
+            // has gnu type D (directory) and S (sparse) blocks
+            .data = @embedFile("testdata/gnu-incremental.tar"),
+            .err = error.TarUnsupportedHeader,
+        },
+        .{
+            // should use values only from last pax header
+            .data = @embedFile("testdata/pax-multi-hdrs.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "bar",
+                    .link_name = "PAX4/PAX4/long-linkpath-name",
+                    .kind = .symbolic_link,
+                },
+            },
+        },
+        .{
+            .data = @embedFile("testdata/gnu-long-nul.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "0123456789",
+                    .mode = 0o644,
+                },
+            },
+        },
+        .{
+            .data = @embedFile("testdata/gnu-utf8.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹",
+                    .mode = 0o644,
+                },
+            },
+        },
+        .{
+            .data = @embedFile("testdata/gnu-not-utf8.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "hi\x80\x81\x82\x83bye",
+                    .mode = 0o644,
+                },
+            },
+        },
+        .{
+            // null in pax key
+            .data = @embedFile("testdata/pax-nul-xattrs.tar"),
+            .err = error.PaxNullInKeyword,
+        },
+        .{
+            .data = @embedFile("testdata/pax-nul-path.tar"),
+            .err = error.PaxNullInValue,
+        },
+        .{
+            .data = @embedFile("testdata/neg-size.tar"),
+            .err = error.TarHeader,
+        },
+        .{
+            .data = @embedFile("testdata/issue10968.tar"),
+            .err = error.TarHeader,
+        },
+        .{
+            .data = @embedFile("testdata/issue11169.tar"),
+            .err = error.TarHeader,
+        },
+        .{
+            .data = @embedFile("testdata/issue12435.tar"),
+            .err = error.TarHeaderChksum,
+        },
+        .{
+            // has magic with space at end instead of null
+            .data = @embedFile("testdata/invalid-go17.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/foo",
+                },
+            },
+        },
+        .{
+            .data = @embedFile("testdata/ustar-file-devs.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "file",
+                    .mode = 0o644,
+                },
+            },
+        },
+        .{
+            .data = @embedFile("testdata/trailing-slash.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "123456789/" ** 30,
+                    .kind = .directory,
+                },
+            },
+        },
+        .{
+            // Has size in gnu extended format. To represent size bigger than 8 GB.
+            .data = @embedFile("testdata/writer-big.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "tmp/16gig.txt",
+                    .size = 16 * 1024 * 1024 * 1024,
+                    .truncated = true,
+                    .mode = 0o640,
+                },
+            },
+        },
+        .{
+            // Size in gnu extended format, and name in pax attribute.
+            .data = @embedFile("testdata/writer-big-long.tar"),
+            .files = &[_]Case.File{
+                .{
+                    .name = "longname/" ** 15 ++ "16gig.txt",
+                    .size = 16 * 1024 * 1024 * 1024,
+                    .mode = 0o644,
+                    .truncated = true,
+                },
+            },
+        },
+    };
+
+    for (cases) |case| {
+        var fsb = std.io.fixedBufferStream(case.data);
+        var iter = tar.iterator(fsb.reader(), null);
+        var i: usize = 0;
+        while (iter.next() catch |err| {
+            if (case.err) |e| {
+                // The expected failure happened; `continue` moves on to the
+                // next case of the enclosing `for`.
+                try testing.expectEqual(e, err);
+                continue;
+            } else {
+                return err;
+            }
+        }) |actual| : (i += 1) {
+            // Report an unexpected extra entry as a test failure instead of
+            // indexing past the end of the expected files.
+            try testing.expect(i < case.files.len);
+            const expected = case.files[i];
+            try testing.expectEqualStrings(expected.name, actual.name);
+            try testing.expectEqual(expected.size, actual.size);
+            try testing.expectEqual(expected.kind, actual.kind);
+            try testing.expectEqual(expected.mode, actual.mode);
+            try testing.expectEqualStrings(expected.link_name, actual.link_name);
+
+            if (case.chksums.len > i) {
+                // Stream the file content through MD5 and compare with the
+                // known checksum of that file.
+                var md5writer = Md5Writer{};
+                try actual.write(&md5writer);
+                const chksum = md5writer.chksum();
+                try testing.expectEqualStrings(case.chksums[i], &chksum);
+            } else {
+                if (!expected.truncated) try actual.skip(); // skip file content
+            }
+        }
+        try testing.expectEqual(case.files.len, i);
+        // The archive was iterated to the end without any error, so a case
+        // that expects an error must fail here rather than pass silently.
+        try testing.expect(case.err == null);
+    }
+}
+
+// Writer used by the tests: everything written to it is fed into an MD5 hash,
+// so file content can be compared against a known checksum.
+const Md5Writer = struct {
+    const Md5 = std.crypto.hash.Md5;
+
+    h: Md5 = Md5.init(.{}),
+
+    // Adds all bytes of `buf` to the hash.
+    pub fn writeAll(self: *Md5Writer, buf: []const u8) !void {
+        self.h.update(buf);
+    }
+
+    // Adds a single byte to the hash.
+    pub fn writeByte(self: *Md5Writer, byte: u8) !void {
+        const single = [1]u8{byte};
+        self.h.update(&single);
+    }
+
+    // Finalizes the hash and returns the digest as lowercase hex characters.
+    pub fn chksum(self: *Md5Writer) [32]u8 {
+        // `final` overwrites the whole digest, no need to zero it first.
+        var digest: [Md5.digest_length]u8 = undefined;
+        self.h.final(&digest);
+        return std.fmt.bytesToHex(digest, .lower);
+    }
+};
--- a/lib/std/tar/testdata/gnu-incremental.tar
+++ b/lib/std/tar/testdata/gnu-incremental.tar
--- a/lib/std/tar/testdata/gnu-long-nul.tar
+++ b/lib/std/tar/testdata/gnu-long-nul.tar
--- a/lib/std/tar/testdata/gnu-multi-hdrs.tar
+++ b/lib/std/tar/testdata/gnu-multi-hdrs.tar
--- a/lib/std/tar/testdata/gnu-not-utf8.tar
+++ b/lib/std/tar/testdata/gnu-not-utf8.tar
--- a/lib/std/tar/testdata/gnu-utf8.tar
+++ b/lib/std/tar/testdata/gnu-utf8.tar
--- a/lib/std/tar/testdata/gnu.tar
+++ b/lib/std/tar/testdata/gnu.tar
--- a/lib/std/tar/testdata/invalid-go17.tar
+++ b/lib/std/tar/testdata/invalid-go17.tar
--- a/lib/std/tar/testdata/issue10968.tar
+++ b/lib/std/tar/testdata/issue10968.tar
--- a/lib/std/tar/testdata/issue11169.tar
+++ b/lib/std/tar/testdata/issue11169.tar
--- a/lib/std/tar/testdata/issue12435.tar
+++ b/lib/std/tar/testdata/issue12435.tar
--- a/lib/std/tar/testdata/neg-size.tar
+++ b/lib/std/tar/testdata/neg-size.tar
--- a/lib/std/tar/testdata/nil-uid.tar
+++ b/lib/std/tar/testdata/nil-uid.tar
--- a/lib/std/tar/testdata/pax-bad-hdr-file.tar
+++ b/lib/std/tar/testdata/pax-bad-hdr-file.tar
--- a/lib/std/tar/testdata/pax-global-records.tar
+++ b/lib/std/tar/testdata/pax-global-records.tar
--- a/lib/std/tar/testdata/pax-multi-hdrs.tar
+++ b/lib/std/tar/testdata/pax-multi-hdrs.tar
--- a/lib/std/tar/testdata/pax-nul-path.tar
+++ b/lib/std/tar/testdata/pax-nul-path.tar
--- a/lib/std/tar/testdata/pax-nul-xattrs.tar
+++ b/lib/std/tar/testdata/pax-nul-xattrs.tar
--- a/lib/std/tar/testdata/pax-pos-size-file.tar
+++ b/lib/std/tar/testdata/pax-pos-size-file.tar
--- a/lib/std/tar/testdata/pax-records.tar
+++ b/lib/std/tar/testdata/pax-records.tar
--- a/lib/std/tar/testdata/pax.tar
+++ b/lib/std/tar/testdata/pax.tar
--- a/lib/std/tar/testdata/sparse-formats.tar
+++ b/lib/std/tar/testdata/sparse-formats.tar
--- a/lib/std/tar/testdata/star.tar
+++ b/lib/std/tar/testdata/star.tar
--- a/lib/std/tar/testdata/trailing-slash.tar
+++ b/lib/std/tar/testdata/trailing-slash.tar
--- a/lib/std/tar/testdata/ustar-file-devs.tar
+++ b/lib/std/tar/testdata/ustar-file-devs.tar
--- a/lib/std/tar/testdata/v7.tar
+++ b/lib/std/tar/testdata/v7.tar
--- a/lib/std/tar/testdata/writer-big-long.tar
+++ b/lib/std/tar/testdata/writer-big-long.tar
--- a/lib/std/tar/testdata/writer-big.tar
+++ b/lib/std/tar/testdata/writer-big.tar
--- a/lib/std/tar/testdata/xattrs.tar
+++ b/lib/std/tar/testdata/xattrs.tar