Merge pull request #19655 from squeek502/windows-argv-post-2008

ArgIteratorWindows: Match post-2008 C runtime rather than `CommandLineToArgvW`
2026-01-20 22:35:24 +00:00 · 2024-04-15 15:28:33 -07:00 · 2024-04-15 15:28:33 -07:00 · b78b2689ed
commit b78b2689ed
parent ff18103ef6 cffe1999c6
8 changed files with 502 additions and 84 deletions
--- a/lib/std/process.zig
+++ b/lib/std/process.zig
@ -625,11 +625,22 @@ pub const ArgIteratorWasi = struct {
 };

 /// Iterator that implements the Windows command-line parsing algorithm.
+/// The implementation is intended to be compatible with the post-2008 C runtime,
+/// but is *not* intended to be compatible with `CommandLineToArgvW` since
+/// `CommandLineToArgvW` uses the pre-2008 parsing rules.
 ///
-/// This iterator faithfully implements the parsing behavior observed in `CommandLineToArgvW` with
+/// This iterator faithfully implements the parsing behavior observed from the C runtime with
 /// one exception: if the command-line string is empty, the iterator will immediately complete
-/// without returning any arguments (whereas `CommandLineArgvW` will return a single argument
+/// without returning any arguments (whereas the C runtime will return a single argument
 /// representing the name of the current executable).
+///
+/// The essential parts of the algorithm are described in Microsoft's documentation:
+///
+/// - https://learn.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-170#parsing-c-command-line-arguments
+///
+/// David Deley explains some additional undocumented quirks in great detail:
+///
+/// - https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULES
 pub const ArgIteratorWindows = struct {
    allocator: Allocator,
    /// Owned by the iterator.
@ -686,6 +697,51 @@ pub const ArgIteratorWindows = struct {
        fn emitCharacter(self: *ArgIteratorWindows, char: u8) void {
            self.buffer[self.end] = char;
            self.end += 1;
+
+            // Because we are emitting WTF-8 byte-by-byte, we need to
+            // check to see if we've emitted two consecutive surrogate
+            // codepoints that form a valid surrogate pair in order
+            // to ensure that we're always emitting well-formed WTF-8
+            // (https://simonsapin.github.io/wtf-8/#concatenating).
+            //
+            // If we do have a valid surrogate pair, we need to emit
+            // the UTF-8 sequence for the codepoint that they encode
+            // instead of the WTF-8 encoding for the two surrogate
+            // codepoints separately.
+            //
+            // This is relevant when dealing with a WTF-16 encoded
+            // command line like this:
+            // "<0xD801>"<0xDC37>
+            // which would get converted to WTF-8 in `cmd_line` as:
+            // "<0xED><0xA0><0x81>"<0xED><0xB0><0xB7>
+            // and then after parsing it'd naively get emitted as:
+            // <0xED><0xA0><0x81><0xED><0xB0><0xB7>
+            // but instead, we need to recognize the surrogate pair
+            // and emit the codepoint it encodes, which in this
+            // example is U+10437 (𐐷), which is encoded in UTF-8 as:
+            // <0xF0><0x90><0x90><0xB7>
+            concatSurrogatePair(self);
+        }
+
+        /// If the last 6 bytes emitted for the current argument decode to a high
+        /// surrogate followed by a low surrogate, overwrites them in place with the
+        /// UTF-8 encoding of the codepoint the pair represents and shrinks `self.end`
+        /// accordingly. Does nothing in every other case.
+        fn concatSurrogatePair(self: *ArgIteratorWindows) void {
+            // Surrogate codepoints are always encoded as 3 bytes, so there
+            // must be 6 bytes for a surrogate pair to exist.
+            if (self.end - self.start >= 6) {
+                const window = self.buffer[self.end - 6 .. self.end];
+                // If the window is not valid WTF-8 on its own, it cannot be a surrogate pair.
+                const view = std.unicode.Wtf8View.init(window) catch return;
+                var it = view.iterator();
+                // `pair` is kept little-endian because that is what `utf16LeToUtf8` consumes.
+                var pair: [2]u16 = undefined;
+                pair[0] = std.mem.nativeToLittle(u16, std.math.cast(u16, it.nextCodepoint().?) orelse return);
+                if (!std.unicode.utf16IsHighSurrogate(std.mem.littleToNative(u16, pair[0]))) return;
+                pair[1] = std.mem.nativeToLittle(u16, std.math.cast(u16, it.nextCodepoint().?) orelse return);
+                if (!std.unicode.utf16IsLowSurrogate(std.mem.littleToNative(u16, pair[1]))) return;
+                // We know we have a valid surrogate pair, so convert
+                // it to UTF-8, overwriting the surrogate pair's bytes
+                // and then chop off the extra bytes.
+                const len = std.unicode.utf16LeToUtf8(window, &pair) catch unreachable;
+                const delta = 6 - len;
+                self.end -= delta;
+            }
        }

        fn yieldArg(self: *ArgIteratorWindows) [:0]const u8 {
@ -711,69 +767,37 @@ pub const ArgIteratorWindows = struct {
        }
    };

-    // The essential parts of the algorithm are described in Microsoft's documentation:
-    //
-    // - <https://learn.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-170#parsing-c-command-line-arguments>
-    // - <https://learn.microsoft.com/en-us/windows/win32/api/shellapi/nf-shellapi-commandlinetoargvw>
-    //
-    // David Deley explains some additional undocumented quirks in great detail:
-    //
-    // - <https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULES>
-    //
-    // Code points <= U+0020 terminating an unquoted first argument was discovered independently by
-    // testing and observing the behavior of 'CommandLineToArgvW' on Windows 10.
-
    fn nextWithStrategy(self: *ArgIteratorWindows, comptime strategy: type) strategy.T {
        // The first argument (the executable name) uses different parsing rules.
        if (self.index == 0) {
-            var char = if (self.cmd_line.len != 0) self.cmd_line[0] else 0;
-            switch (char) {
-                0 => {
-                    // Immediately complete the iterator.
-                    // 'CommandLineToArgvW' would return the name of the current executable here.
-                    return strategy.eof;
-                },
-                '"' => {
-                    // If the first character is a quote, read everything until the next quote (then
-                    // skip that quote), or until the end of the string.
-                    self.index += 1;
-                    while (true) : (self.index += 1) {
-                        char = if (self.index != self.cmd_line.len) self.cmd_line[self.index] else 0;
-                        switch (char) {
-                            0 => {
-                                return strategy.yieldArg(self);
-                            },
-                            '"' => {
-                                self.index += 1;
-                                return strategy.yieldArg(self);
-                            },
-                            else => {
-                                strategy.emitCharacter(self, char);
-                            },
+            if (self.cmd_line.len == 0 or self.cmd_line[0] == 0) {
+                // Immediately complete the iterator.
+                // The C runtime would return the name of the current executable here.
+                return strategy.eof;
+            }
+
+            var inside_quotes = false;
+            while (true) : (self.index += 1) {
+                const char = if (self.index != self.cmd_line.len) self.cmd_line[self.index] else 0;
+                switch (char) {
+                    0 => {
+                        return strategy.yieldArg(self);
+                    },
+                    '"' => {
+                        inside_quotes = !inside_quotes;
+                    },
+                    ' ', '\t' => {
+                        if (inside_quotes)
+                            strategy.emitCharacter(self, char)
+                        else {
+                            self.index += 1;
+                            return strategy.yieldArg(self);
                        }
-                    }
-                },
-                else => {
-                    // Otherwise, read everything until the next space or ASCII control character
-                    // (not including DEL) (then skip that character), or until the end of the
-                    // string. This means that if the command-line string starts with one of these
-                    // characters, the first returned argument will be the empty string.
-                    while (true) : (self.index += 1) {
-                        char = if (self.index != self.cmd_line.len) self.cmd_line[self.index] else 0;
-                        switch (char) {
-                            0 => {
-                                return strategy.yieldArg(self);
-                            },
-                            '\x01'...' ' => {
-                                self.index += 1;
-                                return strategy.yieldArg(self);
-                            },
-                            else => {
-                                strategy.emitCharacter(self, char);
-                            },
-                        }
-                    }
-                },
+                    },
+                    else => {
+                        strategy.emitCharacter(self, char);
+                    },
+                }
            }
        }

@ -791,9 +815,10 @@ pub const ArgIteratorWindows = struct {
        //
        // - The end of the string always terminates the current argument.
        // - When not in 'inside_quotes' mode, a space or tab terminates the current argument.
-        // - 2n backslashes followed by a quote emit n backslashes. If in 'inside_quotes' and the
-        //   quote is immediately followed by a second quote, one quote is emitted and the other is
-        //   skipped, otherwise, the quote is skipped. Finally, 'inside_quotes' is toggled.
+        // - 2n backslashes followed by a quote emit n backslashes (note: n can be zero).
+        //   If in 'inside_quotes' and the quote is immediately followed by a second quote,
+        //   one quote is emitted and the other is skipped, otherwise, the quote is skipped
+        //   and 'inside_quotes' is toggled.
        // - 2n + 1 backslashes followed by a quote emit n backslashes followed by a quote.
        // - n backslashes not followed by a quote emit n backslashes.
        var backslash_count: usize = 0;
@ -826,8 +851,9 @@ pub const ArgIteratorWindows = struct {
                        {
                            strategy.emitCharacter(self, '"');
                            self.index += 1;
+                        } else {
+                            inside_quotes = !inside_quotes;
                        }
-                        inside_quotes = !inside_quotes;
                    }
                },
                '\\' => {
@ -1215,10 +1241,10 @@ test ArgIteratorWindows {
    // Separators
    try t("aa bb cc", &.{ "aa", "bb", "cc" });
    try t("aa\tbb\tcc", &.{ "aa", "bb", "cc" });
-    try t("aa\nbb\ncc", &.{ "aa", "bb\ncc" });
-    try t("aa\r\nbb\r\ncc", &.{ "aa", "\nbb\r\ncc" });
-    try t("aa\rbb\rcc", &.{ "aa", "bb\rcc" });
-    try t("aa\x07bb\x07cc", &.{ "aa", "bb\x07cc" });
+    try t("aa\nbb\ncc", &.{"aa\nbb\ncc"});
+    try t("aa\r\nbb\r\ncc", &.{"aa\r\nbb\r\ncc"});
+    try t("aa\rbb\rcc", &.{"aa\rbb\rcc"});
+    try t("aa\x07bb\x07cc", &.{"aa\x07bb\x07cc"});
    try t("aa\x7Fbb\x7Fcc", &.{"aa\x7Fbb\x7Fcc"});
    try t("aa🦎bb🦎cc", &.{"aa🦎bb🦎cc"});

@ -1227,22 +1253,22 @@ test ArgIteratorWindows {
    try t("  aa  bb  ", &.{ "", "aa", "bb" });
    try t("\t\t", &.{""});
    try t("\t\taa\t\tbb\t\t", &.{ "", "aa", "bb" });
-    try t("\n\n", &.{ "", "\n" });
-    try t("\n\naa\n\nbb\n\n", &.{ "", "\naa\n\nbb\n\n" });
+    try t("\n\n", &.{"\n\n"});
+    try t("\n\naa\n\nbb\n\n", &.{"\n\naa\n\nbb\n\n"});

    // Executable name with quotes/backslashes
    try t("\"aa bb\tcc\ndd\"", &.{"aa bb\tcc\ndd"});
    try t("\"", &.{""});
    try t("\"\"", &.{""});
-    try t("\"\"\"", &.{ "", "" });
-    try t("\"\"\"\"", &.{ "", "" });
-    try t("\"\"\"\"\"", &.{ "", "\"" });
-    try t("aa\"bb\"cc\"dd", &.{"aa\"bb\"cc\"dd"});
-    try t("aa\"bb cc\"dd", &.{ "aa\"bb", "ccdd" });
-    try t("\"aa\\\"bb\"", &.{ "aa\\", "bb" });
+    try t("\"\"\"", &.{""});
+    try t("\"\"\"\"", &.{""});
+    try t("\"\"\"\"\"", &.{""});
+    try t("aa\"bb\"cc\"dd", &.{"aabbccdd"});
+    try t("aa\"bb cc\"dd", &.{"aabb ccdd"});
+    try t("\"aa\\\"bb\"", &.{"aa\\bb"});
    try t("\"aa\\\\\"", &.{"aa\\\\"});
-    try t("aa\\\"bb", &.{"aa\\\"bb"});
-    try t("aa\\\\\"bb", &.{"aa\\\\\"bb"});
+    try t("aa\\\"bb", &.{"aa\\bb"});
+    try t("aa\\\\\"bb", &.{"aa\\\\bb"});

    // Arguments with quotes/backslashes
    try t(". \"aa bb\tcc\ndd\"", &.{ ".", "aa bb\tcc\ndd" });
@ -1252,29 +1278,66 @@ test ArgIteratorWindows {
    try t(". \"\"", &.{ ".", "" });
    try t(". \"\"\"", &.{ ".", "\"" });
    try t(". \"\"\"\"", &.{ ".", "\"" });
-    try t(". \"\"\"\"\"", &.{ ".", "\"" });
+    try t(". \"\"\"\"\"", &.{ ".", "\"\"" });
    try t(". \"\"\"\"\"\"", &.{ ".", "\"\"" });
    try t(". \" \"", &.{ ".", " " });
    try t(". \" \"\"", &.{ ".", " \"" });
    try t(". \" \"\"\"", &.{ ".", " \"" });
-    try t(". \" \"\"\"\"", &.{ ".", " \"" });
+    try t(". \" \"\"\"\"", &.{ ".", " \"\"" });
    try t(". \" \"\"\"\"\"", &.{ ".", " \"\"" });
-    try t(". \" \"\"\"\"\"\"", &.{ ".", " \"\"" });
+    try t(". \" \"\"\"\"\"\"", &.{ ".", " \"\"\"" });
    try t(". \\\"", &.{ ".", "\"" });
    try t(". \\\"\"", &.{ ".", "\"" });
    try t(". \\\"\"\"", &.{ ".", "\"" });
    try t(". \\\"\"\"\"", &.{ ".", "\"\"" });
    try t(". \\\"\"\"\"\"", &.{ ".", "\"\"" });
-    try t(". \\\"\"\"\"\"\"", &.{ ".", "\"\"" });
+    try t(". \\\"\"\"\"\"\"", &.{ ".", "\"\"\"" });
    try t(". \" \\\"", &.{ ".", " \"" });
    try t(". \" \\\"\"", &.{ ".", " \"" });
    try t(". \" \\\"\"\"", &.{ ".", " \"\"" });
    try t(". \" \\\"\"\"\"", &.{ ".", " \"\"" });
-    try t(". \" \\\"\"\"\"\"", &.{ ".", " \"\"" });
+    try t(". \" \\\"\"\"\"\"", &.{ ".", " \"\"\"" });
    try t(". \" \\\"\"\"\"\"\"", &.{ ".", " \"\"\"" });
    try t(". aa\\bb\\\\cc\\\\\\dd", &.{ ".", "aa\\bb\\\\cc\\\\\\dd" });
    try t(". \\\\\\\"aa bb\"", &.{ ".", "\\\"aa", "bb" });
    try t(". \\\\\\\\\"aa bb\"", &.{ ".", "\\\\aa bb" });
+
+    // From https://learn.microsoft.com/en-us/cpp/cpp/main-function-command-line-args#results-of-parsing-command-lines
+    try t(
+        \\foo.exe "abc" d e
+    , &.{ "foo.exe", "abc", "d", "e" });
+    try t(
+        \\foo.exe a\\b d"e f"g h
+    , &.{ "foo.exe", "a\\\\b", "de fg", "h" });
+    try t(
+        \\foo.exe a\\\"b c d
+    , &.{ "foo.exe", "a\\\"b", "c", "d" });
+    try t(
+        \\foo.exe a\\\\"b c" d e
+    , &.{ "foo.exe", "a\\\\b c", "d", "e" });
+    try t(
+        \\foo.exe a"b"" c d
+    , &.{ "foo.exe", "ab\" c d" });
+
+    // From https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULESEX
+    try t("foo.exe CallMeIshmael", &.{ "foo.exe", "CallMeIshmael" });
+    try t("foo.exe \"Call Me Ishmael\"", &.{ "foo.exe", "Call Me Ishmael" });
+    try t("foo.exe Cal\"l Me I\"shmael", &.{ "foo.exe", "Call Me Ishmael" });
+    try t("foo.exe CallMe\\\"Ishmael", &.{ "foo.exe", "CallMe\"Ishmael" });
+    try t("foo.exe \"CallMe\\\"Ishmael\"", &.{ "foo.exe", "CallMe\"Ishmael" });
+    try t("foo.exe \"Call Me Ishmael\\\\\"", &.{ "foo.exe", "Call Me Ishmael\\" });
+    try t("foo.exe \"CallMe\\\\\\\"Ishmael\"", &.{ "foo.exe", "CallMe\\\"Ishmael" });
+    try t("foo.exe a\\\\\\b", &.{ "foo.exe", "a\\\\\\b" });
+    try t("foo.exe \"a\\\\\\b\"", &.{ "foo.exe", "a\\\\\\b" });
+
+    // Surrogate pair encoding of 𐐷 separated by quotes.
+    // Encoded as WTF-16:
+    // "<0xD801>"<0xDC37>
+    // Encoded as WTF-8:
+    // "<0xED><0xA0><0x81>"<0xED><0xB0><0xB7>
+    // During parsing, the quotes drop out and the surrogate pair
+    // should end up encoded as its normal UTF-8 representation.
+    try t("foo.exe \"\xed\xa0\x81\"\xed\xb0\xb7", &.{ "foo.exe", "𐐷" });
 }

 fn testArgIteratorWindows(cmd_line: []const u8, expected_args: []const []const u8) !void {
--- a/test/standalone/build.zig.zon
+++ b/test/standalone/build.zig.zon
@ -104,6 +104,9 @@
        .windows_spawn = .{
            .path = "windows_spawn",
        },
+        .windows_argv = .{
+            .path = "windows_argv",
+        },
        .self_exe_symlink = .{
            .path = "self_exe_symlink",
        },
--- a/test/standalone/windows_argv/README.md
+++ b/test/standalone/windows_argv/README.md
@ -0,0 +1,19 @@
+Tests that Zig's `std.process.ArgIteratorWindows` is compatible with both the MSVC and MinGW C runtimes' argv splitting algorithms.
+
+The method of testing is:
+- Compile a C file with `wmain` as its entry point
+- The C `wmain` calls a Zig-implemented `verify` function that takes the `argv` from `wmain` and compares it to the argv returned by `std.process.argsAlloc` (which takes `kernel32.GetCommandLineW()` and splits it)
+- The compiled C program is spawned continuously as a child process by the implementation in `fuzz.zig` with randomly generated command lines
+  + On Windows, the 'application name' and the 'command line' are disjoint concepts. That is, you can spawn `foo.exe` but set the command line to `bar.exe`, and `CreateProcessW` will spawn `foo.exe` but `argv[0]` will be `bar.exe`. This quirk allows us to test arbitrary `argv[0]` values as well which otherwise wouldn't be possible.
+
+Note: This is intentionally testing against the C runtime argv splitting and *not* [`CommandLineToArgvW`](https://learn.microsoft.com/en-us/windows/win32/api/shellapi/nf-shellapi-commandlinetoargvw), since the C runtime argv splitting was updated in 2008 but `CommandLineToArgvW` still uses the pre-2008 algorithm (which differs in both the `argv[0]` rules and the handling of `""`; see [here](https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULESDOC) for details).
+
+---
+
+In addition to being run during `zig build test-standalone`, this test can be run on its own via `zig build test` from within this directory.
+
+When run on its own:
+- `-Diterations=<num>` can be used to set the max fuzzing iterations, and `-Diterations=0` can be used to fuzz indefinitely
+- `-Dseed=<num>` can be used to set the PRNG seed for fuzz testing. If not provided, then the seed is chosen at random each time `build.zig` is run.
+
+On failure, the number of iterations and the seed can be seen in the failing command, e.g. in `path\to\fuzz.exe path\to\verify-msvc.exe 100 2780392459403250529`, the iteration count is `100` and the seed is `2780392459403250529`.
--- a/test/standalone/windows_argv/build.zig
+++ b/test/standalone/windows_argv/build.zig
@ -0,0 +1,100 @@
+const std = @import("std");
+const builtin = @import("builtin");
+
+/// Registers a `test` step and makes it the default step. On Windows hosts the step
+/// runs the `fuzz` executable against a `verify` executable built for the GNU ABI and,
+/// when a Windows SDK can be found, against one built for the MSVC ABI. On any other
+/// host the step is left without dependencies.
+pub fn build(b: *std.Build) !void {
+    const test_step = b.step("test", "Test it");
+    b.default_step = test_step;
+
+    // Nothing is added to the step on non-Windows hosts.
+    if (builtin.os.tag != .windows) return;
+
+    const optimize: std.builtin.OptimizeMode = .Debug;
+
+    const lib_gnu = b.addStaticLibrary(.{
+        .name = "toargv-gnu",
+        .root_source_file = .{ .path = "lib.zig" },
+        .target = b.resolveTargetQuery(.{
+            .abi = .gnu,
+        }),
+        .optimize = optimize,
+    });
+    const verify_gnu = b.addExecutable(.{
+        .name = "verify-gnu",
+        .target = b.resolveTargetQuery(.{
+            .abi = .gnu,
+        }),
+        .optimize = optimize,
+    });
+    verify_gnu.addCSourceFile(.{
+        .file = .{ .path = "verify.c" },
+        .flags = &.{ "-DUNICODE", "-D_UNICODE" },
+    });
+    // verify.c defines `wmain` rather than `main`.
+    verify_gnu.mingw_unicode_entry_point = true;
+    verify_gnu.linkLibrary(lib_gnu);
+    verify_gnu.linkLibC();
+
+    const fuzz = b.addExecutable(.{
+        .name = "fuzz",
+        .root_source_file = .{ .path = "fuzz.zig" },
+        .target = b.host,
+        .optimize = optimize,
+    });
+
+    // The iteration count and seed are forwarded to `fuzz` as decimal string arguments.
+    const fuzz_max_iterations = b.option(u64, "iterations", "The max fuzz iterations (default: 100)") orelse 100;
+    const fuzz_iterations_arg = std.fmt.allocPrint(b.allocator, "{}", .{fuzz_max_iterations}) catch @panic("oom");
+
+    // Without `-Dseed`, a fresh random seed is drawn every time this function runs.
+    const fuzz_seed = b.option(u64, "seed", "Seed to use for the PRNG (default: random)") orelse seed: {
+        var buf: [8]u8 = undefined;
+        try std.posix.getrandom(&buf);
+        break :seed std.mem.readInt(u64, &buf, builtin.cpu.arch.endian());
+    };
+    const fuzz_seed_arg = std.fmt.allocPrint(b.allocator, "{}", .{fuzz_seed}) catch @panic("oom");
+
+    const run_gnu = b.addRunArtifact(fuzz);
+    run_gnu.setName("fuzz-gnu");
+    run_gnu.addArtifactArg(verify_gnu);
+    run_gnu.addArgs(&.{ fuzz_iterations_arg, fuzz_seed_arg });
+    run_gnu.expectExitCode(0);
+
+    test_step.dependOn(&run_gnu.step);
+
+    // Only target the MSVC ABI if MSVC/Windows SDK is available
+    const has_msvc = has_msvc: {
+        const sdk = std.zig.WindowsSdk.find(b.allocator) catch |err| switch (err) {
+            error.OutOfMemory => @panic("oom"),
+            else => break :has_msvc false,
+        };
+        defer sdk.free(b.allocator);
+        break :has_msvc true;
+    };
+    if (has_msvc) {
+        const lib_msvc = b.addStaticLibrary(.{
+            .name = "toargv-msvc",
+            .root_source_file = .{ .path = "lib.zig" },
+            .target = b.resolveTargetQuery(.{
+                .abi = .msvc,
+            }),
+            .optimize = optimize,
+        });
+        const verify_msvc = b.addExecutable(.{
+            .name = "verify-msvc",
+            .target = b.resolveTargetQuery(.{
+                .abi = .msvc,
+            }),
+            .optimize = optimize,
+        });
+        verify_msvc.addCSourceFile(.{
+            .file = .{ .path = "verify.c" },
+            .flags = &.{ "-DUNICODE", "-D_UNICODE" },
+        });
+        verify_msvc.linkLibrary(lib_msvc);
+        verify_msvc.linkLibC();
+
+        // Same fuzz executable, iteration count and seed as the GNU run.
+        const run_msvc = b.addRunArtifact(fuzz);
+        run_msvc.setName("fuzz-msvc");
+        run_msvc.addArtifactArg(verify_msvc);
+        run_msvc.addArgs(&.{ fuzz_iterations_arg, fuzz_seed_arg });
+        run_msvc.expectExitCode(0);
+
+        test_step.dependOn(&run_msvc.step);
+    }
+}
--- a/test/standalone/windows_argv/fuzz.zig
+++ b/test/standalone/windows_argv/fuzz.zig
@ -0,0 +1,159 @@
+const std = @import("std");
+const builtin = @import("builtin");
+const windows = std.os.windows;
+const Allocator = std.mem.Allocator;
+
+/// Usage: fuzz <verify-exe-path> [iterations] [seed]
+///
+/// Repeatedly spawns the `verify` executable with randomly generated command lines and
+/// returns `error.FoundDiscrepancies` if any child exited with a non-zero code.
+/// `iterations` defaults to 0, which means "run until interrupted". `seed` defaults to
+/// a random value, which is printed so the run can be reproduced.
+pub fn main() !void {
+    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
+    defer std.debug.assert(gpa.deinit() == .ok);
+    const allocator = gpa.allocator();
+
+    const args = try std.process.argsAlloc(allocator);
+    defer std.process.argsFree(allocator, args);
+
+    if (args.len < 2) return error.MissingArgs;
+
+    const verify_path_wtf8 = args[1];
+    const verify_path_w = try std.unicode.wtf8ToWtf16LeAllocZ(allocator, verify_path_wtf8);
+    defer allocator.free(verify_path_w);
+
+    const iterations: u64 = iterations: {
+        if (args.len < 3) break :iterations 0;
+        break :iterations try std.fmt.parseUnsigned(u64, args[2], 10);
+    };
+
+    // The seed is random exactly when it was not passed on the command line.
+    const rand_seed = args.len < 4;
+    const seed: u64 = if (rand_seed) seed: {
+        var buf: [8]u8 = undefined;
+        try std.posix.getrandom(&buf);
+        break :seed std.mem.readInt(u64, &buf, builtin.cpu.arch.endian());
+    } else try std.fmt.parseUnsigned(u64, args[3], 10);
+    var random = std.rand.DefaultPrng.init(seed);
+    const rand = random.random();
+
+    // If the seed was not given via the CLI, then output the
+    // randomly chosen seed so that this run can be reproduced
+    if (rand_seed) {
+        std.debug.print("rand seed: {}\n", .{seed});
+    }
+
+    var i: u64 = 0;
+    var errors: u64 = 0;
+    while (iterations == 0 or i < iterations) {
+        const cmd_line_w = try randomCommandLineW(allocator, rand);
+        defer allocator.free(cmd_line_w);
+
+        // avoid known difference for 0-length command lines
+        // (skipped command lines do not count as an iteration)
+        if (cmd_line_w.len == 0 or cmd_line_w[0] == '\x00') continue;
+
+        const exit_code = try spawnVerify(verify_path_w, cmd_line_w);
+        if (exit_code != 0) {
+            std.debug.print(">>> found discrepancy <<<\n", .{});
+            const cmd_line_wtf8 = try std.unicode.wtf16LeToWtf8Alloc(allocator, cmd_line_w);
+            defer allocator.free(cmd_line_wtf8);
+            std.debug.print("\"{}\"\n\n", .{std.zig.fmtEscapes(cmd_line_wtf8)});
+
+            errors += 1;
+        }
+
+        i += 1;
+    }
+    if (errors > 0) {
+        // we never get here if iterations is 0 so we don't have to worry about that case
+        std.debug.print("found {} discrepancies in {} iterations\n", .{ errors, iterations });
+        return error.FoundDiscrepancies;
+    }
+}
+
+/// Generates a random, NUL-terminated sequence of 0 to 256 (inclusive) UTF-16LE code
+/// units. Each code unit is drawn from a randomly chosen `Choice` category.
+///
+/// The result may contain embedded NUL code units (via `.control`) and unpaired
+/// surrogates (via `.non_ascii`, whose range covers 0xD800...0xDFFF).
+///
+/// The returned slice is allocated with `allocator`.
+fn randomCommandLineW(allocator: Allocator, rand: std.rand.Random) ![:0]const u16 {
+    const Choice = enum {
+        backslash,
+        quote,
+        space,
+        tab,
+        control,
+        printable,
+        non_ascii,
+    };
+
+    const choices = rand.uintAtMostBiased(u16, 256);
+    var buf = try std.ArrayList(u16).initCapacity(allocator, choices);
+    errdefer buf.deinit();
+
+    for (0..choices) |_| {
+        const choice = rand.enumValue(Choice);
+        const code_unit = switch (choice) {
+            .backslash => '\\',
+            .quote => '"',
+            .space => ' ',
+            .tab => '\t',
+            // 0x00...0x20 are used as-is; the extra value 0x21 stands in for DEL (0x7F).
+            .control => switch (rand.uintAtMostBiased(u8, 0x21)) {
+                0x21 => '\x7F',
+                else => |b| b,
+            },
+            // '!' through '~'.
+            .printable => '!' + rand.uintAtMostBiased(u8, '~' - '!'),
+            .non_ascii => rand.intRangeAtMostBiased(u16, 0x80, 0xFFFF),
+        };
+        try buf.append(std.mem.nativeToLittle(u16, code_unit));
+    }
+
+    return buf.toOwnedSliceSentinel(0);
+}
+
+/// Spawns `verify_path` via `CreateProcessW` with `cmd_line` as its command line, waits
+/// for the process to exit, and returns the exit code of the verify process.
+///
+/// The application path and the command line are passed as separate arguments, so
+/// `cmd_line` does not have to start with `verify_path`.
+fn spawnVerify(verify_path: [:0]const u16, cmd_line: [:0]const u16) !windows.DWORD {
+    const child_proc = spawn: {
+        // The child is given null stdin/stdout handles; its stderr is this process's
+        // stderr handle (or null if that handle cannot be obtained).
+        var startup_info: windows.STARTUPINFOW = .{
+            .cb = @sizeOf(windows.STARTUPINFOW),
+            .lpReserved = null,
+            .lpDesktop = null,
+            .lpTitle = null,
+            .dwX = 0,
+            .dwY = 0,
+            .dwXSize = 0,
+            .dwYSize = 0,
+            .dwXCountChars = 0,
+            .dwYCountChars = 0,
+            .dwFillAttribute = 0,
+            .dwFlags = windows.STARTF_USESTDHANDLES,
+            .wShowWindow = 0,
+            .cbReserved2 = 0,
+            .lpReserved2 = null,
+            .hStdInput = null,
+            .hStdOutput = null,
+            .hStdError = windows.GetStdHandle(windows.STD_ERROR_HANDLE) catch null,
+        };
+        var proc_info: windows.PROCESS_INFORMATION = undefined;
+
+        try windows.CreateProcessW(
+            @constCast(verify_path.ptr),
+            @constCast(cmd_line.ptr),
+            null,
+            null,
+            windows.TRUE,
+            0,
+            null,
+            null,
+            &startup_info,
+            &proc_info,
+        );
+        // Only the process handle is needed from here on.
+        windows.CloseHandle(proc_info.hThread);
+
+        break :spawn proc_info.hProcess;
+    };
+    defer windows.CloseHandle(child_proc);
+    try windows.WaitForSingleObjectEx(child_proc, windows.INFINITE, false);
+
+    var exit_code: windows.DWORD = undefined;
+    if (windows.kernel32.GetExitCodeProcess(child_proc, &exit_code) == 0) {
+        return error.UnableToGetExitCode;
+    }
+    return exit_code;
+}
--- a/test/standalone/windows_argv/lib.h
+++ b/test/standalone/windows_argv/lib.h
@ -0,0 +1,8 @@
+#ifndef _LIB_H_
+#define _LIB_H_
+
+#include <windows.h>
+
+int verify(int argc, wchar_t *argv[]);
+
+#endif
--- a/test/standalone/windows_argv/lib.zig
+++ b/test/standalone/windows_argv/lib.zig
@ -0,0 +1,59 @@
+const std = @import("std");
+
+/// Compares the given `argv` (`argc` entries) against the arguments returned by
+/// `std.process.argsAlloc`.
+/// Returns 1 on success (the two match), 0 on failure (mismatch).
+/// Panics on allocation failure or overflow.
+export fn verify(argc: c_int, argv: [*]const [*:0]const u16) c_int {
+    const argv_slice = argv[0..@intCast(argc)];
+    testArgv(argv_slice) catch |err| switch (err) {
+        error.OutOfMemory => @panic("oom"),
+        error.Overflow => @panic("bytes needed to contain args would overflow usize"),
+        error.ArgvMismatch => return 0,
+    };
+    return 1;
+}
+
+/// Converts each entry of `expected_args` to WTF-8 and compares it with the argument
+/// at the same index returned by `std.process.argsAlloc`.
+///
+/// If the argument counts differ or any argument differs, prints the mismatching
+/// and surplus arguments, the command line read from the PEB, and the full expected
+/// argv via `std.debug.print`, then returns `error.ArgvMismatch`.
+fn testArgv(expected_args: []const [*:0]const u16) !void {
+    // Everything allocated below is released in one go when the arena is deinitialized.
+    var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
+    defer arena_state.deinit();
+    const allocator = arena_state.allocator();
+
+    const args = try std.process.argsAlloc(allocator);
+    // Scratch buffer reused for every WTF-16 -> WTF-8 conversion.
+    var wtf8_buf = std.ArrayList(u8).init(allocator);
+
+    var eql = true;
+    if (args.len != expected_args.len) eql = false;
+
+    // Compare the arguments both lists have in common, reporting each mismatch.
+    const min_len = @min(expected_args.len, args.len);
+    for (expected_args[0..min_len], args[0..min_len], 0..) |expected_arg, arg_wtf8, i| {
+        wtf8_buf.clearRetainingCapacity();
+        try std.unicode.wtf16LeToWtf8ArrayList(&wtf8_buf, std.mem.span(expected_arg));
+        if (!std.mem.eql(u8, wtf8_buf.items, arg_wtf8)) {
+            std.debug.print("{}: expected: \"{}\"\n", .{ i, std.zig.fmtEscapes(wtf8_buf.items) });
+            std.debug.print("{}:   actual: \"{}\"\n", .{ i, std.zig.fmtEscapes(arg_wtf8) });
+            eql = false;
+        }
+    }
+    if (!eql) {
+        // Report arguments that only exist in one of the two lists.
+        for (expected_args[min_len..], min_len..) |arg, i| {
+            wtf8_buf.clearRetainingCapacity();
+            try std.unicode.wtf16LeToWtf8ArrayList(&wtf8_buf, std.mem.span(arg));
+            std.debug.print("{}: expected: \"{}\"\n", .{ i, std.zig.fmtEscapes(wtf8_buf.items) });
+        }
+        for (args[min_len..], min_len..) |arg, i| {
+            std.debug.print("{}:   actual: \"{}\"\n", .{ i, std.zig.fmtEscapes(arg) });
+        }
+        // Print the command line and the expected argv.
+        const peb = std.os.windows.peb();
+        const lpCmdLine: [*:0]u16 = @ptrCast(peb.ProcessParameters.CommandLine.Buffer);
+        wtf8_buf.clearRetainingCapacity();
+        try std.unicode.wtf16LeToWtf8ArrayList(&wtf8_buf, std.mem.span(lpCmdLine));
+        std.debug.print("command line: \"{}\"\n", .{std.zig.fmtEscapes(wtf8_buf.items)});
+        std.debug.print("expected argv:\n", .{});
+        std.debug.print("&.{{\n", .{});
+        for (expected_args) |arg| {
+            wtf8_buf.clearRetainingCapacity();
+            try std.unicode.wtf16LeToWtf8ArrayList(&wtf8_buf, std.mem.span(arg));
+            std.debug.print("    \"{}\",\n", .{std.zig.fmtEscapes(wtf8_buf.items)});
+        }
+        std.debug.print("}}\n", .{});
+        return error.ArgvMismatch;
+    }
+}
--- a/test/standalone/windows_argv/verify.c
+++ b/test/standalone/windows_argv/verify.c
@ -0,0 +1,7 @@
+#include <windows.h>
+#include "lib.h"
+
+/* Passes argc/argv through to `verify` (declared in lib.h). A zero result
+ * from `verify` becomes exit code 1; any other result becomes exit code 0. */
+int wmain(int argc, wchar_t *argv[]) {
+	if (!verify(argc, argv)) return 1;
+	return 0;
+}