From 13f78e24b82d70eeae1495ffd5f7b7389db715a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carl=20=C3=85stholm?= Date: Mon, 18 Dec 2023 22:55:46 +0100 Subject: [PATCH 1/2] Update `ArgIterator` on Windows to follow standard Windows parsing rules This adds `ArgIteratorWindows`, which faithfully replicates the quoting and escaping behavior observed in `CommandLineToArgvW` and should make Zig applications play better with processes that abuse these quirks. --- lib/std/process.zig | 366 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 360 insertions(+), 6 deletions(-) diff --git a/lib/std/process.zig b/lib/std/process.zig index d736fbe8df..0ab8a30d96 100644 --- a/lib/std/process.zig +++ b/lib/std/process.zig @@ -522,6 +522,236 @@ pub const ArgIteratorWasi = struct { } }; +/// Iterator that implements the Windows command-line parsing algorithm. +/// +/// This iterator faithfully implements the parsing behavior observed in `CommandLineToArgvW` with +/// one exception: if the command-line string is empty, the iterator will immediately complete +/// without returning any arguments (whereas `CommandLineArgvW` will return a single argument +/// representing the name of the current executable). +pub const ArgIteratorWindows = struct { + allocator: Allocator, + /// Owned by the iterator. + cmd_line: []const u8, + index: usize = 0, + /// Owned by the iterator. Long enough to hold the entire `cmd_line` plus a null terminator. + buffer: []u8, + start: usize = 0, + end: usize = 0, + + pub const InitError = error{ OutOfMemory, InvalidCmdLine }; + + /// `cmd_line_w` *must* be an UTF16-LE-encoded string. + /// + /// The iterator makes a copy of `cmd_line_w` converted UTF-8 and keeps it; it does *not* take + /// ownership of `cmd_line_w`. + pub fn init(allocator: Allocator, cmd_line_w: [*:0]const u16) InitError!ArgIteratorWindows { + const cmd_line = std.unicode.utf16leToUtf8Alloc(allocator, mem.sliceTo(cmd_line_w, 0)) catch |err| switch (err) { + error.DanglingSurrogateHalf, + error.ExpectedSecondSurrogateHalf, + error.UnexpectedSecondSurrogateHalf, + => return error.InvalidCmdLine, + error.OutOfMemory => return error.OutOfMemory, + }; + errdefer allocator.free(cmd_line); + + const buffer = try allocator.alloc(u8, cmd_line.len + 1); + errdefer allocator.free(buffer); + + return .{ + .allocator = allocator, + .cmd_line = cmd_line, + .buffer = buffer, + }; + } + + /// Returns the next argument and advances the iterator. Returns `null` if at the end of the + /// command-line string. The iterator owns the returned slice. + pub fn next(self: *ArgIteratorWindows) ?[:0]const u8 { + return self.nextWithStrategy(next_strategy); + } + + /// Skips the next argument and advances the iterator. Returns `true` if an argument was + /// skipped, `false` if at the end of the command-line string. + pub fn skip(self: *ArgIteratorWindows) bool { + return self.nextWithStrategy(skip_strategy); + } + + const next_strategy = struct { + const T = ?[:0]const u8; + + const eof = null; + + fn emitBackslashes(self: *ArgIteratorWindows, count: usize) void { + for (0..count) |_| emitCharacter(self, '\\'); + } + + fn emitCharacter(self: *ArgIteratorWindows, char: u8) void { + self.buffer[self.end] = char; + self.end += 1; + } + + fn yieldArg(self: *ArgIteratorWindows) [:0]const u8 { + self.buffer[self.end] = 0; + const arg = self.buffer[self.start..self.end :0]; + self.end += 1; + self.start = self.end; + return arg; + } + }; + + const skip_strategy = struct { + const T = bool; + + const eof = false; + + fn emitBackslashes(_: *ArgIteratorWindows, _: usize) void {} + + fn emitCharacter(_: *ArgIteratorWindows, _: u8) void {} + + fn yieldArg(_: *ArgIteratorWindows) bool { + return true; + } + }; + + // The essential parts of the algorithm are described in Microsoft's documentation: + // + // - + // - + // + // David Deley explains some additional undocumented quirks in great detail: + // + // - + // + // Code points <= U+0020 terminating an unquoted first argument was discovered independently by + // testing and observing the behavior of 'CommandLineToArgvW' on Windows 10. + + fn nextWithStrategy(self: *ArgIteratorWindows, comptime strategy: type) strategy.T { + // The first argument (the executable name) uses different parsing rules. + if (self.index == 0) { + var char = if (self.cmd_line.len != 0) self.cmd_line[0] else 0; + switch (char) { + 0 => { + // Immediately complete the iterator. + // 'CommandLineToArgvW' would return the name of the current executable here. + return strategy.eof; + }, + '"' => { + // If the first character is a quote, read everything until the next quote (then + // skip that quote), or until the end of the string. + self.index += 1; + while (true) : (self.index += 1) { + char = if (self.index != self.cmd_line.len) self.cmd_line[self.index] else 0; + switch (char) { + 0 => { + return strategy.yieldArg(self); + }, + '"' => { + self.index += 1; + return strategy.yieldArg(self); + }, + else => { + strategy.emitCharacter(self, char); + }, + } + } + }, + else => { + // Otherwise, read everything until the next space or ASCII control character + // (not including DEL) (then skip that character), or until the end of the + // string. This means that if the command-line string starts with one of these + // characters, the first returned argument will be the empty string. + while (true) : (self.index += 1) { + char = if (self.index != self.cmd_line.len) self.cmd_line[self.index] else 0; + switch (char) { + 0 => { + return strategy.yieldArg(self); + }, + '\x01'...' ' => { + self.index += 1; + return strategy.yieldArg(self); + }, + else => { + strategy.emitCharacter(self, char); + }, + } + } + }, + } + } + + // Skip spaces and tabs. The iterator completes if we reach the end of the string here. + while (true) : (self.index += 1) { + const char = if (self.index != self.cmd_line.len) self.cmd_line[self.index] else 0; + switch (char) { + 0 => return strategy.eof, + ' ', '\t' => continue, + else => break, + } + } + + // Parsing rules for subsequent arguments: + // + // - The end of the string always terminates the current argument. + // - When not in 'inside_quotes' mode, a space or tab terminates the current argument. + // - 2n backslashes followed by a quote emit n backslashes. If in 'inside_quotes' and the + // quote is immediately followed by a second quote, one quote is emitted and the other is + // skipped, otherwise, the quote is skipped. Finally, 'inside_quotes' is toggled. + // - 2n + 1 backslashes followed by a quote emit n backslashes followed by a quote. + // - n backslashes not followed by a quote emit n backslashes. + var backslash_count: usize = 0; + var inside_quotes = false; + while (true) : (self.index += 1) { + const char = if (self.index != self.cmd_line.len) self.cmd_line[self.index] else 0; + switch (char) { + 0 => { + strategy.emitBackslashes(self, backslash_count); + return strategy.yieldArg(self); + }, + ' ', '\t' => { + strategy.emitBackslashes(self, backslash_count); + backslash_count = 0; + if (inside_quotes) + strategy.emitCharacter(self, char) + else + return strategy.yieldArg(self); + }, + '"' => { + const char_is_escaped_quote = backslash_count % 2 != 0; + strategy.emitBackslashes(self, backslash_count / 2); + backslash_count = 0; + if (char_is_escaped_quote) { + strategy.emitCharacter(self, '"'); + } else { + if (inside_quotes and + self.index + 1 != self.cmd_line.len and + self.cmd_line[self.index + 1] == '"') + { + strategy.emitCharacter(self, '"'); + self.index += 1; + } + inside_quotes = !inside_quotes; + } + }, + '\\' => { + backslash_count += 1; + }, + else => { + strategy.emitBackslashes(self, backslash_count); + backslash_count = 0; + strategy.emitCharacter(self, char); + }, + } + } + } + + /// Frees the iterator's copy of the command-line string and all previously returned + /// argument slices. + pub fn deinit(self: *ArgIteratorWindows) void { + self.allocator.free(self.buffer); + self.allocator.free(self.cmd_line); + } +}; + /// Optional parameters for `ArgIteratorGeneral` pub const ArgIteratorGeneralOptions = struct { comments: bool = false, @@ -754,7 +984,7 @@ pub fn ArgIteratorGeneral(comptime options: ArgIteratorGeneralOptions) type { /// Cross-platform command line argument iterator. pub const ArgIterator = struct { const InnerType = switch (builtin.os.tag) { - .windows => ArgIteratorGeneral(.{}), + .windows => ArgIteratorWindows, .wasi => if (builtin.link_libc) ArgIteratorPosix else ArgIteratorWasi, else => ArgIteratorPosix, }; @@ -774,10 +1004,7 @@ pub const ArgIterator = struct { return ArgIterator{ .inner = InnerType.init() }; } - pub const InitError = switch (builtin.os.tag) { - .windows => InnerType.InitUtf16leError, - else => InnerType.InitError, - }; + pub const InitError = InnerType.InitError; /// You must deinitialize iterator's internal buffers by calling `deinit` when done. pub fn initWithAllocator(allocator: Allocator) InitError!ArgIterator { @@ -786,7 +1013,7 @@ pub const ArgIterator = struct { } if (builtin.os.tag == .windows) { const cmd_line_w = os.windows.kernel32.GetCommandLineW(); - return ArgIterator{ .inner = try InnerType.initUtf16le(allocator, cmd_line_w) }; + return ArgIterator{ .inner = try InnerType.init(allocator, cmd_line_w) }; } return ArgIterator{ .inner = InnerType.init() }; @@ -877,6 +1104,133 @@ pub fn argsFree(allocator: Allocator, args_alloc: []const [:0]u8) void { return allocator.free(aligned_allocated_buf); } +test "ArgIteratorWindows" { + const t = testArgIteratorWindows; + + try t( + \\"C:\Program Files\zig\zig.exe" run .\src\main.zig -target x86_64-windows-gnu -O ReleaseSafe -- --emoji=🗿 --eval="new Regex(\"Dwayne \\\"The Rock\\\" Johnson\")" + , &.{ + \\C:\Program Files\zig\zig.exe + , + \\run + , + \\.\src\main.zig + , + \\-target + , + \\x86_64-windows-gnu + , + \\-O + , + \\ReleaseSafe + , + \\-- + , + \\--emoji=🗿 + , + \\--eval=new Regex("Dwayne \"The Rock\" Johnson") + , + }); + + // Empty + try t("", &.{}); + + // Separators + try t("aa bb cc", &.{ "aa", "bb", "cc" }); + try t("aa\tbb\tcc", &.{ "aa", "bb", "cc" }); + try t("aa\nbb\ncc", &.{ "aa", "bb\ncc" }); + try t("aa\r\nbb\r\ncc", &.{ "aa", "\nbb\r\ncc" }); + try t("aa\rbb\rcc", &.{ "aa", "bb\rcc" }); + try t("aa\x07bb\x07cc", &.{ "aa", "bb\x07cc" }); + try t("aa\x7Fbb\x7Fcc", &.{"aa\x7Fbb\x7Fcc"}); + try t("aa🦎bb🦎cc", &.{"aa🦎bb🦎cc"}); + + // Leading/trailing whitespace + try t(" ", &.{""}); + try t(" aa bb ", &.{ "", "aa", "bb" }); + try t("\t\t", &.{""}); + try t("\t\taa\t\tbb\t\t", &.{ "", "aa", "bb" }); + try t("\n\n", &.{ "", "\n" }); + try t("\n\naa\n\nbb\n\n", &.{ "", "\naa\n\nbb\n\n" }); + + // Executable name with quotes/backslashes + try t("\"aa bb\tcc\ndd\"", &.{"aa bb\tcc\ndd"}); + try t("\"", &.{""}); + try t("\"\"", &.{""}); + try t("\"\"\"", &.{ "", "" }); + try t("\"\"\"\"", &.{ "", "" }); + try t("\"\"\"\"\"", &.{ "", "\"" }); + try t("aa\"bb\"cc\"dd", &.{"aa\"bb\"cc\"dd"}); + try t("aa\"bb cc\"dd", &.{ "aa\"bb", "ccdd" }); + try t("\"aa\\\"bb\"", &.{ "aa\\", "bb" }); + try t("\"aa\\\\\"", &.{"aa\\\\"}); + try t("aa\\\"bb", &.{"aa\\\"bb"}); + try t("aa\\\\\"bb", &.{"aa\\\\\"bb"}); + + // Arguments with quotes/backslashes + try t(". \"aa bb\tcc\ndd\"", &.{ ".", "aa bb\tcc\ndd" }); + try t(". aa\" \"bb\"\t\"cc\"\n\"dd\"", &.{ ".", "aa bb\tcc\ndd" }); + try t(". ", &.{"."}); + try t(". \"", &.{ ".", "" }); + try t(". \"\"", &.{ ".", "" }); + try t(". \"\"\"", &.{ ".", "\"" }); + try t(". \"\"\"\"", &.{ ".", "\"" }); + try t(". \"\"\"\"\"", &.{ ".", "\"" }); + try t(". \"\"\"\"\"\"", &.{ ".", "\"\"" }); + try t(". \" \"", &.{ ".", " " }); + try t(". \" \"\"", &.{ ".", " \"" }); + try t(". \" \"\"\"", &.{ ".", " \"" }); + try t(". \" \"\"\"\"", &.{ ".", " \"" }); + try t(". \" \"\"\"\"\"", &.{ ".", " \"\"" }); + try t(". \" \"\"\"\"\"\"", &.{ ".", " \"\"" }); + try t(". \\\"", &.{ ".", "\"" }); + try t(". \\\"\"", &.{ ".", "\"" }); + try t(". \\\"\"\"", &.{ ".", "\"" }); + try t(". \\\"\"\"\"", &.{ ".", "\"\"" }); + try t(". \\\"\"\"\"\"", &.{ ".", "\"\"" }); + try t(". \\\"\"\"\"\"\"", &.{ ".", "\"\"" }); + try t(". \" \\\"", &.{ ".", " \"" }); + try t(". \" \\\"\"", &.{ ".", " \"" }); + try t(". \" \\\"\"\"", &.{ ".", " \"\"" }); + try t(". \" \\\"\"\"\"", &.{ ".", " \"\"" }); + try t(". \" \\\"\"\"\"\"", &.{ ".", " \"\"" }); + try t(". \" \\\"\"\"\"\"\"", &.{ ".", " \"\"\"" }); + try t(". aa\\bb\\\\cc\\\\\\dd", &.{ ".", "aa\\bb\\\\cc\\\\\\dd" }); + try t(". \\\\\\\"aa bb\"", &.{ ".", "\\\"aa", "bb" }); + try t(". \\\\\\\\\"aa bb\"", &.{ ".", "\\\\aa bb" }); +} + +fn testArgIteratorWindows(cmd_line: []const u8, expected_args: []const []const u8) !void { + const cmd_line_w = try std.unicode.utf8ToUtf16LeWithNull(testing.allocator, cmd_line); + defer testing.allocator.free(cmd_line_w); + + // next + { + var it = try ArgIteratorWindows.init(testing.allocator, cmd_line_w); + defer it.deinit(); + + for (expected_args) |expected| { + if (it.next()) |actual| { + try testing.expectEqualStrings(expected, actual); + } else { + return error.TestUnexpectedResult; + } + } + try testing.expect(it.next() == null); + } + + // skip + { + var it = try ArgIteratorWindows.init(testing.allocator, cmd_line_w); + defer it.deinit(); + + for (0..expected_args.len) |_| { + try testing.expect(it.skip()); + } + try testing.expect(!it.skip()); + } +} + test "general arg parsing" { try testGeneralCmdLine("a b\tc d", &.{ "a", "b", "c", "d" }); try testGeneralCmdLine("\"abc\" d e", &.{ "abc", "d", "e" }); From 4d9c4ab82c61a5c6f6e120924941d2b22fd15dc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carl=20=C3=85stholm?= Date: Mon, 18 Dec 2023 23:08:27 +0100 Subject: [PATCH 2/2] More accurate argv-to-command-line serialization when spawning child processes on Windows The old implementation had a bug in it in that it didn't quote empty strings, but it also didn't properly follow the special quoting rules required for the first argument (the executable name). This new implementation serializes the argv correctly such that it can be parsed by the `CommandLineToArgvW` algorithm. --- lib/std/child_process.zig | 181 +++++++++++++++++++++++++++++++------- 1 file changed, 151 insertions(+), 30 deletions(-) diff --git a/lib/std/child_process.zig b/lib/std/child_process.zig index ccf3c2428c..444adebbdc 100644 --- a/lib/std/child_process.zig +++ b/lib/std/child_process.zig @@ -744,9 +744,6 @@ pub const ChildProcess = struct { windowsDestroyPipe(g_hChildStd_ERR_Rd, g_hChildStd_ERR_Wr); }; - const cmd_line = try windowsCreateCommandLine(self.allocator, self.argv); - defer self.allocator.free(cmd_line); - var siStartInfo = windows.STARTUPINFOW{ .cb = @sizeOf(windows.STARTUPINFOW), .hStdError = g_hChildStd_ERR_Wr, @@ -818,7 +815,11 @@ pub const ChildProcess = struct { const app_name_w = try unicode.utf8ToUtf16LeWithNull(self.allocator, app_basename_utf8); defer self.allocator.free(app_name_w); - const cmd_line_w = try unicode.utf8ToUtf16LeWithNull(self.allocator, cmd_line); + const cmd_line_w = argvToCommandLineWindows(self.allocator, self.argv) catch |err| switch (err) { + // argv[0] contains unsupported characters that will never resolve to a valid exe. + error.InvalidArg0 => return error.FileNotFound, + else => |e| return e, + }; defer self.allocator.free(cmd_line_w); run: { @@ -1236,39 +1237,159 @@ test "windowsCreateProcessSupportsExtension" { try std.testing.expect(windowsCreateProcessSupportsExtension(&[_]u16{ '.', 'e', 'X', 'e', 'c' }) == null); } -/// Caller must dealloc. -fn windowsCreateCommandLine(allocator: mem.Allocator, argv: []const []const u8) ![:0]u8 { +pub const ArgvToCommandLineError = error{ OutOfMemory, InvalidUtf8, InvalidArg0 }; + +/// Serializes `argv` to a Windows command-line string suitable for passing to a child process and +/// parsing by the `CommandLineToArgvW` algorithm. The caller owns the returned slice. +pub fn argvToCommandLineWindows( + allocator: mem.Allocator, + argv: []const []const u8, +) ArgvToCommandLineError![:0]u16 { var buf = std.ArrayList(u8).init(allocator); defer buf.deinit(); - for (argv, 0..) |arg, arg_i| { - if (arg_i != 0) try buf.append(' '); - if (mem.indexOfAny(u8, arg, " \t\n\"") == null) { - try buf.appendSlice(arg); - continue; - } - try buf.append('"'); - var backslash_count: usize = 0; - for (arg) |byte| { - switch (byte) { - '\\' => backslash_count += 1, - '"' => { - try buf.appendNTimes('\\', backslash_count * 2 + 1); - try buf.append('"'); - backslash_count = 0; - }, - else => { - try buf.appendNTimes('\\', backslash_count); - try buf.append(byte); - backslash_count = 0; - }, + if (argv.len != 0) { + const arg0 = argv[0]; + + // The first argument must be quoted if it contains spaces or ASCII control characters + // (excluding DEL). It also follows special quoting rules where backslashes have no special + // interpretation, which makes it impossible to pass certain first arguments containing + // double quotes to a child process without characters from the first argument leaking into + // subsequent ones (which could have security implications). + // + // Empty arguments technically don't need quotes, but we quote them anyway for maximum + // compatibility with different implementations of the 'CommandLineToArgvW' algorithm. + // + // Double quotes are illegal in paths on Windows, so for the sake of simplicity we reject + // all first arguments containing double quotes, even ones that we could theoretically + // serialize in unquoted form. + var needs_quotes = arg0.len == 0; + for (arg0) |c| { + if (c <= ' ') { + needs_quotes = true; + } else if (c == '"') { + return error.InvalidArg0; } } - try buf.appendNTimes('\\', backslash_count * 2); - try buf.append('"'); + if (needs_quotes) { + try buf.append('"'); + try buf.appendSlice(arg0); + try buf.append('"'); + } else { + try buf.appendSlice(arg0); + } + + for (argv[1..]) |arg| { + try buf.append(' '); + + // Subsequent arguments must be quoted if they contain spaces, tabs or double quotes, + // or if they are empty. For simplicity and for maximum compatibility with different + // implementations of the 'CommandLineToArgvW' algorithm, we also quote all ASCII + // control characters (again, excluding DEL). + needs_quotes = for (arg) |c| { + if (c <= ' ' or c == '"') { + break true; + } + } else arg.len == 0; + if (!needs_quotes) { + try buf.appendSlice(arg); + continue; + } + + try buf.append('"'); + var backslash_count: usize = 0; + for (arg) |byte| { + switch (byte) { + '\\' => { + backslash_count += 1; + }, + '"' => { + try buf.appendNTimes('\\', backslash_count * 2 + 1); + try buf.append('"'); + backslash_count = 0; + }, + else => { + try buf.appendNTimes('\\', backslash_count); + try buf.append(byte); + backslash_count = 0; + }, + } + } + try buf.appendNTimes('\\', backslash_count * 2); + try buf.append('"'); + } } - return buf.toOwnedSliceSentinel(0); + return try unicode.utf8ToUtf16LeWithNull(allocator, buf.items); +} + +test "argvToCommandLineWindows" { + const t = testArgvToCommandLineWindows; + + try t(&.{ + \\C:\Program Files\zig\zig.exe + , + \\run + , + \\.\src\main.zig + , + \\-target + , + \\x86_64-windows-gnu + , + \\-O + , + \\ReleaseSafe + , + \\-- + , + \\--emoji=🗿 + , + \\--eval=new Regex("Dwayne \"The Rock\" Johnson") + , + }, + \\"C:\Program Files\zig\zig.exe" run .\src\main.zig -target x86_64-windows-gnu -O ReleaseSafe -- --emoji=🗿 "--eval=new Regex(\"Dwayne \\\"The Rock\\\" Johnson\")" + ); + + try t(&.{}, ""); + try t(&.{""}, "\"\""); + try t(&.{" "}, "\" \""); + try t(&.{"\t"}, "\"\t\""); + try t(&.{"\x07"}, "\"\x07\""); + try t(&.{"🦎"}, "🦎"); + + try t( + &.{ "zig", "aa aa", "bb\tbb", "cc\ncc", "dd\r\ndd", "ee\x7Fee" }, + "zig \"aa aa\" \"bb\tbb\" \"cc\ncc\" \"dd\r\ndd\" ee\x7Fee", + ); + + try t( + &.{ "\\\\foo bar\\foo bar\\", "\\\\zig zag\\zig zag\\" }, + "\"\\\\foo bar\\foo bar\\\" \"\\\\zig zag\\zig zag\\\\\"", + ); + + try std.testing.expectError( + error.InvalidArg0, + argvToCommandLineWindows(std.testing.allocator, &.{"\"quotes\"quotes\""}), + ); + try std.testing.expectError( + error.InvalidArg0, + argvToCommandLineWindows(std.testing.allocator, &.{"quotes\"quotes"}), + ); + try std.testing.expectError( + error.InvalidArg0, + argvToCommandLineWindows(std.testing.allocator, &.{"q u o t e s \" q u o t e s"}), + ); +} + +fn testArgvToCommandLineWindows(argv: []const []const u8, expected_cmd_line: []const u8) !void { + const cmd_line_w = try argvToCommandLineWindows(std.testing.allocator, argv); + defer std.testing.allocator.free(cmd_line_w); + + const cmd_line = try unicode.utf16leToUtf8Alloc(std.testing.allocator, cmd_line_w); + defer std.testing.allocator.free(cmd_line); + + try std.testing.expectEqualStrings(expected_cmd_line, cmd_line); } fn windowsDestroyPipe(rd: ?windows.HANDLE, wr: ?windows.HANDLE) void {