From 9fe4c89230df2d78c8bf37b4b1d7a9bedb92677b Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Fri, 11 Sep 2020 22:17:08 +0200 Subject: [PATCH] std: Add a gzip decoder --- build.zig | 1 + lib/std/compress.zig | 2 + lib/std/compress/gzip.zig | 248 ++++++++++++++++++++++++++++++++ lib/std/compress/rfc1952.txt.gz | Bin 0 -> 8059 bytes 4 files changed, 251 insertions(+) create mode 100644 lib/std/compress/gzip.zig create mode 100644 lib/std/compress/rfc1952.txt.gz diff --git a/build.zig b/build.zig index 3f7f1a9038..a6a2d87371 100644 --- a/build.zig +++ b/build.zig @@ -128,6 +128,7 @@ pub fn build(b: *Builder) !void { "README.md", ".z.0", ".z.9", + ".gz", "rfc1951.txt", }, }); diff --git a/lib/std/compress.zig b/lib/std/compress.zig index 5518f807df..95f496021e 100644 --- a/lib/std/compress.zig +++ b/lib/std/compress.zig @@ -6,8 +6,10 @@ const std = @import("std.zig"); pub const deflate = @import("compress/deflate.zig"); +pub const gzip = @import("compress/gzip.zig"); pub const zlib = @import("compress/zlib.zig"); test "" { + _ = gzip; _ = zlib; } diff --git a/lib/std/compress/gzip.zig b/lib/std/compress/gzip.zig new file mode 100644 index 0000000000..aad1731393 --- /dev/null +++ b/lib/std/compress/gzip.zig @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2015-2020 Zig Contributors +// This file is part of [zig](https://ziglang.org/), which is MIT licensed. +// The MIT license requires this copyright notice to be included in all copies +// and substantial portions of the software. +// +// Decompressor for GZIP data streams (RFC1952) + +const std = @import("std"); +const io = std.io; +const fs = std.fs; +const testing = std.testing; +const mem = std.mem; +const deflate = std.compress.deflate; + +// Flags for the FLG field in the header +const FTEXT = 1 << 0; +const FHCRC = 1 << 1; +const FEXTRA = 1 << 2; +const FNAME = 1 << 3; +const FCOMMENT = 1 << 4; + +pub fn GzipStream(comptime ReaderType: type) type { + return struct { + const Self = @This(); + + pub const Error = ReaderType.Error || + deflate.InflateStream(ReaderType).Error || + error{ CorruptedData, WrongChecksum }; + pub const Reader = io.Reader(*Self, Error, read); + + allocator: *mem.Allocator, + inflater: deflate.InflateStream(ReaderType), + in_reader: ReaderType, + hasher: std.hash.Crc32, + window_slice: []u8, + read_amt: usize, + + info: struct { + filename: ?[]const u8, + comment: ?[]const u8, + modification_time: u32, + }, + + fn init(allocator: *mem.Allocator, source: ReaderType) !Self { + // gzip header format is specified in RFC1952 + const header = try source.readBytesNoEof(10); + + // Check the ID1/ID2 fields + if (header[0] != 0x1f or header[1] != 0x8b) + return error.BadHeader; + + const CM = header[2]; + // The CM field must be 8 to indicate the use of DEFLATE + if (CM != 8) return error.InvalidCompression; + // Flags + const FLG = header[3]; + // Modification time, as a Unix timestamp. + // If zero there's no timestamp available. + const MTIME = mem.readIntLittle(u32, header[4..8]); + // Extra flags + const XFL = header[8]; + // Operating system where the compression took place + const OS = header[9]; + + if (FLG & FEXTRA != 0) { + // Skip the extra data, we could read and expose it to the user + // if somebody needs it. + const len = try source.readIntLittle(u16); + try source.skipBytes(len, .{}); + } + + var filename: ?[]const u8 = null; + if (FLG & FNAME != 0) { + filename = try source.readUntilDelimiterAlloc( + allocator, + 0, + std.math.maxInt(usize), + ); + } + errdefer if (filename) |p| allocator.free(p); + + var comment: ?[]const u8 = null; + if (FLG & FCOMMENT != 0) { + comment = try source.readUntilDelimiterAlloc( + allocator, + 0, + std.math.maxInt(usize), + ); + } + errdefer if (comment) |p| allocator.free(p); + + if (FLG & FHCRC != 0) { + // TODO: Evaluate and check the header checksum. The stdlib has + // no CRC16 yet :( + _ = try source.readIntLittle(u16); + } + + // The RFC doesn't say anything about the DEFLATE window size to be + // used, default to 32K. + var window_slice = try allocator.alloc(u8, 32 * 1024); + + return Self{ + .allocator = allocator, + .inflater = deflate.inflateStream(source, window_slice), + .in_reader = source, + .hasher = std.hash.Crc32.init(), + .window_slice = window_slice, + .info = .{ + .filename = filename, + .comment = comment, + .modification_time = MTIME, + }, + .read_amt = 0, + }; + } + + pub fn deinit(self: *Self) void { + self.allocator.free(self.window_slice); + if (self.info.filename) |filename| + self.allocator.free(filename); + if (self.info.comment) |comment| + self.allocator.free(comment); + } + + // Implements the io.Reader interface + pub fn read(self: *Self, buffer: []u8) Error!usize { + if (buffer.len == 0) + return 0; + + // Read from the compressed stream and update the computed checksum + const r = try self.inflater.read(buffer); + if (r != 0) { + self.hasher.update(buffer[0..r]); + self.read_amt += r; + return r; + } + + // We've reached the end of stream, check if the checksum matches + const hash = try self.in_reader.readIntLittle(u32); + if (hash != self.hasher.final()) + return error.WrongChecksum; + + // The ISIZE field is the size of the uncompressed input modulo 2^32 + const input_size = try self.in_reader.readIntLittle(u32); + if (self.read_amt & 0xffffffff != input_size) + return error.CorruptedData; + + return 0; + } + + pub fn reader(self: *Self) Reader { + return .{ .context = self }; + } + }; +} + +pub fn gzipStream(allocator: *mem.Allocator, reader: anytype) !GzipStream(@TypeOf(reader)) { + return GzipStream(@TypeOf(reader)).init(allocator, reader); +} + +fn testReader(data: []const u8, comptime expected: []const u8) !void { + var in_stream = io.fixedBufferStream(data); + + var gzip_stream = try gzipStream(testing.allocator, in_stream.reader()); + defer gzip_stream.deinit(); + + // Read and decompress the whole file + const buf = try gzip_stream.reader().readAllAlloc(testing.allocator, std.math.maxInt(usize)); + defer testing.allocator.free(buf); + // Calculate its SHA256 hash and check it against the reference + var hash: [32]u8 = undefined; + std.crypto.hash.sha2.Sha256.hash(buf, hash[0..], .{}); + + assertEqual(expected, &hash); +} + +// Assert `expected` == `input` where `input` is a bytestring. +pub fn assertEqual(comptime expected: []const u8, input: []const u8) void { + var expected_bytes: [expected.len / 2]u8 = undefined; + for (expected_bytes) |*r, i| { + r.* = std.fmt.parseInt(u8, expected[2 * i .. 2 * i + 2], 16) catch unreachable; + } + + testing.expectEqualSlices(u8, &expected_bytes, input); +} + +// All the test cases are obtained by compressing the RFC1952 text +// +// https://tools.ietf.org/rfc/rfc1952.txt length=25037 bytes +// SHA256=164ef0897b4cbec63abf1b57f069f3599bd0fb7c72c2a4dee21bd7e03ec9af67 +test "compressed data" { + try testReader( + @embedFile("rfc1952.txt.gz"), + "164ef0897b4cbec63abf1b57f069f3599bd0fb7c72c2a4dee21bd7e03ec9af67", + ); +} + +test "sanity checks" { + // Truncated header + testing.expectError( + error.EndOfStream, + testReader(&[_]u8{ 0x1f, 0x8B }, ""), + ); + // Wrong CM + testing.expectError( + error.InvalidCompression, + testReader(&[_]u8{ + 0x1f, 0x8b, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x03, + }, ""), + ); + // Wrong checksum + testing.expectError( + error.WrongChecksum, + testReader(&[_]u8{ + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, + }, ""), + ); + // Truncated checksum + testing.expectError( + error.EndOfStream, + testReader(&[_]u8{ + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, + }, ""), + ); + // Wrong initial size + testing.expectError( + error.CorruptedData, + testReader(&[_]u8{ + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + }, ""), + ); + // Truncated initial size field + testing.expectError( + error.EndOfStream, + testReader(&[_]u8{ + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, + }, ""), + ); +} diff --git a/lib/std/compress/rfc1952.txt.gz b/lib/std/compress/rfc1952.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..be43b90a7917a993933c3266db50882f77a5662e GIT binary patch literal 8059 zcmV->AB5l^iwFogG^;TH19E0#F*!9dE_8Tw0JU6gbK6Fe{;s+oF{=+3GA1FCl5AOy zbE}mtM=43xT5^)woU;W2LlPwr;NnFuIq!eJr@Lo>0SL)fR$;{vC184bdiuS4W@Cds zXS%pavn%y8{ud>4^)5@xmHOJxxu@RfvdDwQ#s~ecQs;%5rI|WRmrI=#`Mw&z+?_ly z{vh_lFiO+LzC27JcYW}Sx!H9yzPyRi^4C;T%|L0xrlOgs+TFlbH);&FvZX$wMg74KdDFMg`b3e7SjBFa;LC0AppFI z@*<0-B~O*k2uxAogP&y*N6VRe;l6(jLnI$K6L?*@R)GatBwsz@=@iYOJT(2_zoC*&Of z`UGWFW>}==_iv(Nk(Stxg`ds!0Eh;084|}AfS$*^eYBUDBn3G#0ucMjy!7Xw7lFW# z6aX{XX?_#CyNb-%f#nDwx&U(l2*C)dB_~V>2E{1b-!qMoI@j5?4hQBj#+yat<&?Og zh^{pupAa@!VyI&+D9^HV$s3qrnusKYAO^AavzQ?){p^Zaf&>oR9ub!jF=-HxC#ki( zkG*Hy2#ZY9!>}T)T{pq`;qvsvN>KpLosvwI7$gfe#415QCRh;H| ztn-}r1mw&KbHDH{mQ2$qmn~VYFnkI{Ky1MX-(ch7J5|y!))Nr2*otkXFobJpK8AA_ z#8Ch`g;=?t1b5gV9bAzll{^IK6%ZieGDZdbla(t%T)+|#1DQD>DFlU_QF`W>lQZr%J98vWOE2%H*!+W4=H4@ZaV281`~h?Mw| zAb-(dhe(Jy-lw0hoh!~__?y}iY7 zcw{>6*z68Kd!2wLk7iEw(f|U5KIj>=8Ag8fR&@b_fL!RH%*cO&ipgP3GXLutK#fNR z5Cr5PxJuF+Fy-91SC2uc0t5j$D52}KZ*p}IhNS=h7@!vbv|2$MMYsFvN9`xW6ABUD zVc+9B0H6OJt*o`|m_d!l2GE`h=)xyUqYghDf?na*Y#RaEXlZPt#cR!LeSR`-*8Q_S zO;g122N+559~}3=jDNt1*mvWKZ`Cn5jiIJ>k^4y|=KP~7?pAOz$zk@{{O5N$oaKQ; zALf?^GqVDOZwg-r(v<5$Ji!I)e&b=WQY5#6Jvea&Up0_9#ETYn@e-@wd|3$GDeW&+ z&&Tr>k%xT+zkv%J;xJvp^7}zRZUdQ4LcC9}u9*1b#)@;{Ky9w5%xU3i5_4$dV_*ai zRZYoFB*V7E7p51lW~svn&O`%3UclKVXT6&e@JozW#UWN;Wh>a23fLk*SxB69EV!AwVH!wF$#y%!JGsJ96?|SU64jM=hN^Wh+?k2n6 zl^~>9-U0)I4V>*}5)ow6+eqkJ#GC|-W^3+s1>1WduE$q4 zOW}jlmMdt$w59*fCFnIu!>9~pVq-^zb_~+eU3*3+AP(Q_DnR~`+bh+6zmAVW@WM3yGOz{Ti1X* zt?z0B1E^=x6~q?2zRJ8vx$~Rf|?djVdFX;G94!%+q-3I#hYpALy{yTG7YhzB^Xhc9kJW=n1I*r zBJt+F+7l^cW)6viI8g~Xq?P$3Jd{^fy&QbQzb7gM#}(iZ!(m69jBYD3;sKh0@YlCs zQb>@@HBgX$$_H1}3KpDQ6E?JBRr5A!8kLkB6I!hiMl%kmtk|+1+w=Mg0g@Ijtg_@X zl;zlrCU1b<55myW@Qbm&qicZS!oQ}8u#yD6nGP1@8^%@g)HrOe3Wldzv07aP77&R(V}8RUP;Q6!_3fnZC(rU%7r@BM@jFil}&8 zsx)Qx6Q9%FFt*TaQKqYeH~UBs2g&P(aH$iy_!8WHtoBv524QatE=3FYM@~ zfjgy`;FC|17k9Dtp3d3Xeqv|m-rUZHWPZA{HI|W%MLi%SArqO`%5$3DsyMomqJ!%R zJsS>(&+1=4^S|$gLn*CjvLztE)Q6L>sk3Cf1}^5qN*gAbYV$a0_;e7lEuZ>T!(><{S;R8&@}FdGf=YZbq$@;Z27SRDYuXim#bjC(Z$xEoqwo*GC@ z$uGm1HlcX;vgl2uy+Ot?%Ie9v{as}f!vbeOyq%?w#7wYAL5KV-z&GIcMLuyjo(8!Nms6t}P- zH-NGZ<#LS=TLDrYWYN^P6GSfpKSYGWzA*>i1QyW+dmWWn{M?p=L2WIU&{Sy;w20|k!)g@?XRr24J; z;{37Ff}gwo_za>))N51dwVJ3~s17%W{Yw0-a4yV_<3_Bb*6F6Klqw|jsP6L8ROsN^Wmxbe01_oc=h!1`1Gwj|K;f9^ZN_% zaF2L*IQ+izf>!BXblag zddF|b8>?WC`|k6}+Ve@@-7B&axrzD32Hb)RxC19=@UPW2<`~_MX9M-WqwN>{0YiPQ z#@jC&fbh=V6z@#N+js|IZWQvocXf5bCpVi^JPeMLKk%FtL09(=$R-@qy_WEr97{5b~2;4ImM+1B_)HCYkYLq+%->AnaBoHUBZ}kJtXO7Rn_&7BtfiT*HcmmY+kw^lBTeFyX!e= z5xZ4g&!2zc`3v`XOPWA#bWWGEO{ zDaX8SmrP6)9Qab>57Tmrb+7L^R>|Fvl!dcv0(MiFKg2nNh{w^5B=c>ylQc>LOqJMl zYZJNjo900jNjZi@!GSz*!f*q|O}bOHg6c@Jb(BC}q*NF-x#=ujWmFZf^o$GWX9~e$ z1b+xbsSs+?EVhQhAZ*P`M3>3kETVFHlOuh}aQN>2nnaBCaD_4IVv2k~*=tsbg{88o z;4~K63912y8Uf}>ni*J%t?!hEze4GUa_r&yvP940jKIEoSIHn3_VOT&trR3h}Xm@>Doi0{D6 zR;35z(|FCryL*o$%ksH zJEn?2?bh>!GLRgjz>%51Rwt0pQu~Q(U>TzKX?+JJO0ip5=`1I%XsC)oyKY=(PB=y5 zL~kYneL64e`Jdut)uiB9r()(rt%oc3vT+>+&<^_?8c69y0&pJ8^*F=6wE4#u?;Ak& z_I6(m;bWZ~Tpphd$9+dzzoM*Ly9+9p)}RKAw|_f)b9`tDskUv|t)aH+q+;LbYmU0-=TE_-Kr0%4J-}>|~nR zy7A&>D8JEU&m}pS`n1dftEM?A>NA*-A--y(Q|A&51f`h*!%kTDEzPKd`=3>CW4@epyolqy^G}HbS)(Y(#hjl&QY#+x2M%GCNvGSNk&9rvg4}0U0 zP4D(wH2q+_Pj=(k5w|i&bah{rL{;xJ4Qq`?f9ml1=V05ss((O20;YyLvWS*W_Fs6l z$x~f&Y3R#?xqrv>{IRxeRyX7W?Vl!LZoHwPMgUYdnA+elfzg9-XeT zXMI|XL#}b)lWRx5cT7r;7~u3`_~!kE(^+FRB10+j=61y?)ePM_1F$DEjK9*7sHlEC z>vB9=S|~Jae4v7nR!)_gEfUeNijrvsWZcS%eB5|3FZ`@v`U(t0qHMca9vKAQSgzK> z3)`CIGRiMUC#t7^A;XQZaK086QhZNwp&0bKyUS_9vK3wz$Ch8=9($?cVH;}qBxvqS zw@%?snYT38Js~G8f@%dr*MVlQd+Fas%W~S|m}o{VbB&?( zSTJ4hePxF)nCrfrl%OtV61qmOlPS90OX?y`uhc5`16>_?^zTg5?yTh28#)$6`j>El zb{wX&A*rDjx7n(^IOY^waT}1;nN`1^?O-%XAHQWI8i|4@OrE{?6Wu!gjIhT?2bZ#H zA%0I3$N>AfH5sV0%dMZIWV@9SG0%QeGB>3I|U>k3dvRr?% z!{fIOPcPP=?b5W+4UxKh-|_f49{+HDbkSIoT}uZNFYx}UA5a1)U#vgg!{dJpg`3x( zzQohR^R3ghCu5@Q<@@uC;bgS_kSKR{*}}04yD<^+?{7NRIVR#A1ZkG2562gW>u*pi z!cdh79rQ?CRY+A5qYLqMUM!l^9ye+#Xw-n?gh^hzF)QS@nY@wI zn>%Pw9i_Ya(PJ2hiN7uYpP`Od-S54 zDkVp0dwcYVo_2%Zr zi}KWi3YoP|8*pO+VN6(5$6Gxn@jP9@kYhX*BE6_64FKTRgpsgM=Ja9-VvLDfSCRTb z@0b&6^!=f&VIMpCea(qdJ$DEClAc?4`r(X~ajGW2ZBM*rqQ7~L%SBHnau{|hPO=SD z8&yCjMy-34m2@viLc;`EMOpxB!{l{YQCfK$9TauTsIL`X z9#wY3=kvw!82=`pm8#01s!;~D$Owt4X!W0CP(#4<%=$pV9!)2=-_CjbW`qHR} z`X)8m;g}DVIBpM9opW%@1hH!Z@n&fT#oNch>OtR8`sT0L(3R&$-ZCmYXoT+-djk=( zq-2OQJ*SiWcP=^H`cM7}zJ#U_7n~8QMYQA$fwKyAmzKaWUzkJBq|mwm$vTJ_kkxdcHbX7-Adyy8g^r!3&fefnTfu*pvyOI5anwavU*YWJTv8y$Xtqv*&>55 zkeukC&i)WOt$H8i$E4g-u3gZ%k*jE>=&W1AajLd#R<)?QOR6eSg-R%kLXoz`sm^6~ zY^l3-rZWDjdDw5zuA&f2agoGUjA&i1cgd9(DYaNOHpQs4EMV77Gizbayj43&U57Q4 zhc=p|^??nnfmF)X7=0?{5l`7llN`e2rvc}x>`|4bhbRQ*Gdz?~)Hmk1I44YQc%bB2 z=$;g5?5t{wZuZN`)zxej-+{v$XTX`^e;t2Pn~uV!Gj4nk7RqO+_J)MBWY31C3z-sp z=BS#TvULJzsLqK6#{24oVsPxzYT-{Ig=eWZ-h>lZrT6y7FZW(lZI^O!kDL3W;f>rM z3p01{=OsdX`VGW-L1?Qw6f zKTxZcHy(}lhU3wA`zepEzI=Ko=GPaXQLwln-AS>)&)yp*6I~$S$D}W1^NUN5IoXWnM&bvcL%2Bnc=7KHRC3Db?6{tr? z4xRX&vUCG;fUNd;7F_A#gVt3iN@auSyfw=YD7cEu`6jFN2@apBU23}mE?-oh-Z5*w zN{iULw;F{;(Q|G~zjZ|Hxh=_DV(5ts#55AX6{7mNV5;Bs|2J4CN~WWa+!na zf-&qs$qVOtD4^eQ)rVYjm4OH8(k>|HsOY#cE-DXwLO^(M`M8L9h`F2k8>|a2fys& zNBzq9#Wsg3Z%;w=eMH5JG%x<@3s%&(`aOgGJ6)Z~N?MK&bMn1DxhQv5-E)yieR&t> zPNsJX%mWQ*PwKC2{5_$$and4x8S>jODpEd9Yfmi88Krwr zya&JbwsjH7BF9X$#zEiF-2-}V@yu3{!SO#fXXY}p(G#1q$6TpX-QLKhQTNh|>gIvU zN#JpAoynd$Wj$&We$D&*s(McM9XV59+8a#eJurvzu!~>GSCo!nhu_vnTWGW63SB&d z6Pi{zTRzCPH1!|JJyFy{O=8kQH3J@enTt8&eK7Ze*b)E7S z!Zk6?>-bJmLlibj15_iF;tl(Gq3fG?_<*x}!-J&0qz21-X77K@?Pje={|4e>iWXl) zE5T~LEbbTPD$Kk(BS@8FCX;jDkO{mBr|W|aVQy<|)u~Uyuf~AjI8&H3dwpcc-i=t~ zI-WV(-MG?)cfPtkaeWbk8fYWvtZrGA8yBHxxHFKV>m1gJ`Ic-}PR7jk5VQ$YU6EOx z5ORy^6~XTBIB@%L5{{!~d?iK+8 z+q)+|=zCJR;1j1+WrQyLyK*%vq-$2xuZsBkD@e4tAFElRQDtI$r+<}nH7i{@72Ly?EU518lFAlADhXuXPYEdnYk#A zK-H(U?c1Zh3DKbjMEGfpq)t~%KTc;|Ex-muhqz%;?%#=4s;_W zv`nnTfp(on)aY>qNET*~{$>ku|5s77wYXPL=cGLKm8temL@8LItGb1c$k0oeRtnx1 zosPwqXV3c1I-#raLa(vhzDnFaKn8OqV6FgW4?o&K6y+6r0hGYF{Xk|I-Rf|bE*ss)GOTnzd@UA6Fq|J$;px z38&=%qrMP8YK7Bs_Up*=;QbEApUmaVaDOVTL~oW1C#2!hcTRAsK0sS0;k&SGdS9Jq zdg$ip*X{{XJ;XfUNWZ68pFS6Abw*rSU-#fPz|G1eFn$~9c|jrhhMNRKxynL}DqmRQ zsgI-pY(J=5gk4hvwBIMMoHyXPf~$LEK>i6D&2-k29|KcN>Z`xJR&U=QwU|d>3B^cc zd-z9Ho5oOwD;zdpe#CIVwH`G==u1YtSB<(yojG%^E$_azuHgw79>l^RU}j(9&}K`_%%_yGJe5lGG#Ah`WyW62j8E@T)HldEyF5B!#$vAOJ6JxM-sfuq{~uuD JJiN_e008Aa$sPaz literal 0 HcmV?d00001