From 10914dc31000bc5c3b7164a4101808719d3da07f Mon Sep 17 00:00:00 2001
From: Ryan Liptak <squeek502@hotmail.com>
Date: Sat, 13 Jul 2024 15:03:55 -0700
Subject: [PATCH 1/4] ArgIteratorWindows: Clarify buffer length comment

---
 lib/std/process.zig | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/lib/std/process.zig b/lib/std/process.zig
index d40753f19d..542376f580 100644
--- a/lib/std/process.zig
+++ b/lib/std/process.zig
@@ -683,10 +683,12 @@ pub const ArgIteratorWindows = struct {
         const wtf8_len = unicode.calcWtf8Len(cmd_line);
 
         // This buffer must be large enough to contain contiguous NUL-terminated slices
-        // of each argument. For arguments past the first one, space for the NUL-terminator
-        // is guaranteed due to the necessary whitespace between arugments. However, we need
-        // one extra byte to guarantee enough room for the NUL terminator if the command line
-        // ends up being exactly 1 argument long with no quotes, etc.
+        // of each argument.
+        // - During parsing, the length of a parsed argument will always be equal to
+        //   to less than its unparsed length
+        // - The first argument needs one extra byte of space allocated for its NUL
+        //   terminator, but for each subsequent argument the necessary whitespace
+        //   between arguments guarantees room for their NUL terminator(s).
         const buffer = try allocator.alloc(u8, wtf8_len + 1);
         errdefer allocator.free(buffer);
 

From 1418c8a5d49ecc38a20df07151a7fe942992aea4 Mon Sep 17 00:00:00 2001
From: Ryan Liptak <squeek502@hotmail.com>
Date: Sat, 13 Jul 2024 16:54:00 -0700
Subject: [PATCH 2/4] ArgIteratorWindows: Store last emitted code unit instead
 of checking the last 6 emitted bytes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, to ensure args were encoded as well-formed WTF-8 (i.e. no encoded surrogate pairs), the code unit would be encoded and then the last 6 emitted bytes would be checked to see if they were a surrogate pair, and this was done for any emitted code unit (although this was not necessary, it should have only been done when emitting a low surrogate).

After this commit, we still want to ensure well-formed WTF-8, but, to do so, the last emitted code point is stored, meaning we can just directly check that the last code unit is a high surrogate and the current code unit is a low surrogate to determine if we have a surrogate pair.

This provides some performance benefit over and above a "use the same strategy as before but only check when we're emitting a low surrogate" implementation:

    Benchmark 1 (111 runs): benchargv-master.exe
      measurement          mean ± σ            min … max           outliers         delta
      wall_time          45.2ms ±  532us    44.5ms … 49.4ms          2 ( 2%)        0%
      peak_rss           6.49MB ± 3.94KB    6.46MB … 6.49MB         10 ( 9%)        0%
    Benchmark 2 (154 runs): benchargv-storelast.exe
      measurement          mean ± σ            min … max           outliers         delta
      wall_time          32.6ms ±  293us    32.2ms … 34.2ms          8 ( 5%)        ⚡- 27.8% ±  0.2%
      peak_rss           6.49MB ± 5.15KB    6.46MB … 6.49MB         15 (10%)          -  0.0% ±  0.0%
    Benchmark 3 (131 runs): benchargv-onlylow.exe
      measurement          mean ± σ            min … max           outliers         delta
      wall_time          38.4ms ±  257us    37.9ms … 39.6ms          5 ( 4%)        ⚡- 15.1% ±  0.2%
      peak_rss           6.49MB ± 5.70KB    6.46MB … 6.49MB          9 ( 7%)          -  0.0% ±  0.0%
---
 lib/std/process.zig | 92 ++++++++++++++++++++++++---------------------
 1 file changed, 49 insertions(+), 43 deletions(-)

diff --git a/lib/std/process.zig b/lib/std/process.zig
index 542376f580..be8576442d 100644
--- a/lib/std/process.zig
+++ b/lib/std/process.zig
@@ -717,14 +717,20 @@ pub const ArgIteratorWindows = struct {
 
         const eof = null;
 
-        fn emitBackslashes(self: *ArgIteratorWindows, count: usize) void {
-            for (0..count) |_| emitCharacter(self, '\\');
+        /// Returns '\' if any backslashes are emitted, otherwise returns `last_emitted_code_unit`.
+        fn emitBackslashes(self: *ArgIteratorWindows, count: usize, last_emitted_code_unit: ?u16) ?u16 {
+            for (0..count) |_| {
+                self.buffer[self.end] = '\\';
+                self.end += 1;
+            }
+            return if (count != 0) '\\' else last_emitted_code_unit;
         }
 
-        fn emitCharacter(self: *ArgIteratorWindows, code_unit: u16) void {
-            const wtf8_len = std.unicode.wtf8Encode(code_unit, self.buffer[self.end..]) catch unreachable;
-            self.end += wtf8_len;
-
+        /// If `last_emitted_code_unit` and `code_unit` form a surrogate pair, then
+        /// the previously emitted high surrogate is overwritten by the codepoint encoded
+        /// by the surrogate pair, and `null` is returned.
+        /// Otherwise, `code_unit` is emitted and returned.
+        fn emitCharacter(self: *ArgIteratorWindows, code_unit: u16, last_emitted_code_unit: ?u16) ?u16 {
             // Because we are emitting WTF-8, we need to
             // check to see if we've emitted two consecutive surrogate
             // codepoints that form a valid surrogate pair in order
@@ -745,28 +751,24 @@ pub const ArgIteratorWindows = struct {
             // and emit the codepoint it encodes, which in this
             // example is U+10437 (𐐷), which is encoded in UTF-8 as:
             // <0xF0><0x90><0x90><0xB7>
-            concatSurrogatePair(self);
-        }
+            if (last_emitted_code_unit != null and
+                std.unicode.utf16IsLowSurrogate(code_unit) and
+                std.unicode.utf16IsHighSurrogate(last_emitted_code_unit.?))
+            {
+                const codepoint = std.unicode.utf16DecodeSurrogatePair(&.{ last_emitted_code_unit.?, code_unit }) catch unreachable;
 
-        fn concatSurrogatePair(self: *ArgIteratorWindows) void {
-            // Surrogate codepoints are always encoded as 3 bytes, so there
-            // must be 6 bytes for a surrogate pair to exist.
-            if (self.end - self.start >= 6) {
-                const window = self.buffer[self.end - 6 .. self.end];
-                const view = unicode.Wtf8View.init(window) catch return;
-                var it = view.iterator();
-                var pair: [2]u16 = undefined;
-                pair[0] = std.mem.nativeToLittle(u16, std.math.cast(u16, it.nextCodepoint().?) orelse return);
-                if (!unicode.utf16IsHighSurrogate(std.mem.littleToNative(u16, pair[0]))) return;
-                pair[1] = std.mem.nativeToLittle(u16, std.math.cast(u16, it.nextCodepoint().?) orelse return);
-                if (!unicode.utf16IsLowSurrogate(std.mem.littleToNative(u16, pair[1]))) return;
-                // We know we have a valid surrogate pair, so convert
-                // it to UTF-8, overwriting the surrogate pair's bytes
-                // and then chop off the extra bytes.
-                const len = unicode.utf16LeToUtf8(window, &pair) catch unreachable;
-                const delta = 6 - len;
-                self.end -= delta;
+                // Unpaired surrogate is 3 bytes long
+                const dest = self.buffer[self.end - 3 ..];
+                const len = unicode.utf8Encode(codepoint, dest) catch unreachable;
+                // All codepoints that require a surrogate pair (> U+FFFF) are encoded as 4 bytes
+                assert(len == 4);
+                self.end += 1;
+                return null;
             }
+
+            const wtf8_len = std.unicode.wtf8Encode(code_unit, self.buffer[self.end..]) catch unreachable;
+            self.end += wtf8_len;
+            return code_unit;
         }
 
         fn yieldArg(self: *ArgIteratorWindows) [:0]const u8 {
@@ -783,9 +785,13 @@ pub const ArgIteratorWindows = struct {
 
         const eof = false;
 
-        fn emitBackslashes(_: *ArgIteratorWindows, _: usize) void {}
+        fn emitBackslashes(_: *ArgIteratorWindows, _: usize, last_emitted_code_unit: ?u16) ?u16 {
+            return last_emitted_code_unit;
+        }
 
-        fn emitCharacter(_: *ArgIteratorWindows, _: u16) void {}
+        fn emitCharacter(_: *ArgIteratorWindows, _: u16, last_emitted_code_unit: ?u16) ?u16 {
+            return last_emitted_code_unit;
+        }
 
         fn yieldArg(_: *ArgIteratorWindows) bool {
             return true;
@@ -793,6 +799,7 @@ pub const ArgIteratorWindows = struct {
     };
 
     fn nextWithStrategy(self: *ArgIteratorWindows, comptime strategy: type) strategy.T {
+        var last_emitted_code_unit: ?u16 = null;
         // The first argument (the executable name) uses different parsing rules.
         if (self.index == 0) {
             if (self.cmd_line.len == 0 or self.cmd_line[0] == 0) {
@@ -815,15 +822,15 @@ pub const ArgIteratorWindows = struct {
                         inside_quotes = !inside_quotes;
                     },
                     ' ', '\t' => {
-                        if (inside_quotes)
-                            strategy.emitCharacter(self, char)
-                        else {
+                        if (inside_quotes) {
+                            last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
+                        } else {
                             self.index += 1;
                             return strategy.yieldArg(self);
                         }
                     },
                     else => {
-                        strategy.emitCharacter(self, char);
+                        last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
                     },
                 }
             }
@@ -861,29 +868,28 @@ pub const ArgIteratorWindows = struct {
                 0;
             switch (char) {
                 0 => {
-                    strategy.emitBackslashes(self, backslash_count);
+                    last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count, last_emitted_code_unit);
                     return strategy.yieldArg(self);
                 },
                 ' ', '\t' => {
-                    strategy.emitBackslashes(self, backslash_count);
+                    last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count, last_emitted_code_unit);
                     backslash_count = 0;
-                    if (inside_quotes)
-                        strategy.emitCharacter(self, char)
-                    else
-                        return strategy.yieldArg(self);
+                    if (inside_quotes) {
+                        last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
+                    } else return strategy.yieldArg(self);
                 },
                 '"' => {
                     const char_is_escaped_quote = backslash_count % 2 != 0;
-                    strategy.emitBackslashes(self, backslash_count / 2);
+                    last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count / 2, last_emitted_code_unit);
                     backslash_count = 0;
                     if (char_is_escaped_quote) {
-                        strategy.emitCharacter(self, '"');
+                        last_emitted_code_unit = strategy.emitCharacter(self, '"', last_emitted_code_unit);
                     } else {
                         if (inside_quotes and
                             self.index + 1 != self.cmd_line.len and
                             mem.littleToNative(u16, self.cmd_line[self.index + 1]) == '"')
                         {
-                            strategy.emitCharacter(self, '"');
+                            last_emitted_code_unit = strategy.emitCharacter(self, '"', last_emitted_code_unit);
                             self.index += 1;
                         } else {
                             inside_quotes = !inside_quotes;
@@ -894,9 +900,9 @@ pub const ArgIteratorWindows = struct {
                     backslash_count += 1;
                 },
                 else => {
-                    strategy.emitBackslashes(self, backslash_count);
+                    last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count, last_emitted_code_unit);
                     backslash_count = 0;
-                    strategy.emitCharacter(self, char);
+                    last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
                 },
             }
         }

From 1a62cfffa79917d930b77f2598b1bfc93efeede5 Mon Sep 17 00:00:00 2001
From: Ryan Liptak <squeek502@hotmail.com>
Date: Sat, 13 Jul 2024 17:25:06 -0700
Subject: [PATCH 3/4] Replace GetCommandLineW with PEB access, delete
 GetCommandLine bindings

---
 lib/std/os/windows/kernel32.zig | 3 ---
 lib/std/process.zig             | 5 +++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/lib/std/os/windows/kernel32.zig b/lib/std/os/windows/kernel32.zig
index 3d72b6f1a2..d7c9625bda 100644
--- a/lib/std/os/windows/kernel32.zig
+++ b/lib/std/os/windows/kernel32.zig
@@ -161,9 +161,6 @@ pub extern "kernel32" fn FormatMessageW(dwFlags: DWORD, lpSource: ?LPVOID, dwMes
 
 pub extern "kernel32" fn FreeEnvironmentStringsW(penv: [*:0]u16) callconv(WINAPI) BOOL;
 
-pub extern "kernel32" fn GetCommandLineA() callconv(WINAPI) LPSTR;
-pub extern "kernel32" fn GetCommandLineW() callconv(WINAPI) LPWSTR;
-
 pub extern "kernel32" fn GetConsoleMode(in_hConsoleHandle: HANDLE, out_lpMode: *DWORD) callconv(WINAPI) BOOL;
 pub extern "kernel32" fn SetConsoleMode(in_hConsoleHandle: HANDLE, in_dwMode: DWORD) callconv(WINAPI) BOOL;
 
diff --git a/lib/std/process.zig b/lib/std/process.zig
index be8576442d..434e0fbbcc 100644
--- a/lib/std/process.zig
+++ b/lib/std/process.zig
@@ -1150,8 +1150,9 @@ pub const ArgIterator = struct {
             return ArgIterator{ .inner = try InnerType.init(allocator) };
         }
         if (native_os == .windows) {
-            const cmd_line_w = windows.kernel32.GetCommandLineW();
-            return ArgIterator{ .inner = try InnerType.init(allocator, cmd_line_w) };
+            const cmd_line = std.os.windows.peb().ProcessParameters.CommandLine;
+            const cmd_line_w = cmd_line.Buffer.?[0 .. cmd_line.Length / 2 :0];
+            return ArgIterator{ .inner = try InnerType.init(allocator, cmd_line_w.ptr) };
         }
 
         return ArgIterator{ .inner = InnerType.init() };

From d48251d0f0d2de2f618afe65db3ba425b332d5c4 Mon Sep 17 00:00:00 2001
From: Ryan Liptak <squeek502@hotmail.com>
Date: Sat, 13 Jul 2024 17:29:18 -0700
Subject: [PATCH 4/4] ArgIteratorWindows.init: Take `[]const u16` slice instead
 of multi-item pointer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that we use the PEB to get the precise length of the command line string, there's no need for a multi-item pointer/sliceTo call. This provides a minor speedup:

    Benchmark 1 (153 runs): benchargv-before.exe
      measurement          mean ± σ            min … max           outliers         delta
      wall_time          32.7ms ±  429us    32.1ms … 36.9ms          1 ( 1%)        0%
      peak_rss           6.49MB ± 5.62KB    6.46MB … 6.49MB         14 ( 9%)        0%
    Benchmark 2 (157 runs): benchargv-after.exe
      measurement          mean ± σ            min … max           outliers         delta
      wall_time          31.9ms ±  236us    31.4ms … 32.7ms          4 ( 3%)        ⚡-  2.4% ±  0.2%
      peak_rss           6.49MB ± 4.77KB    6.46MB … 6.49MB         14 ( 9%)          +  0.0% ±  0.0%
---
 lib/std/process.zig | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/lib/std/process.zig b/lib/std/process.zig
index 434e0fbbcc..f4b24a8013 100644
--- a/lib/std/process.zig
+++ b/lib/std/process.zig
@@ -664,7 +664,7 @@ pub const ArgIteratorWasi = struct {
 pub const ArgIteratorWindows = struct {
     allocator: Allocator,
     /// Encoded as WTF-16 LE.
-    cmd_line: [:0]const u16,
+    cmd_line: []const u16,
     index: usize = 0,
     /// Owned by the iterator. Long enough to hold contiguous NUL-terminated slices
     /// of each argument encoded as WTF-8.
@@ -678,9 +678,8 @@ pub const ArgIteratorWindows = struct {
     ///
     /// The iterator stores and uses `cmd_line_w`, so its memory must be valid for
     /// at least as long as the returned ArgIteratorWindows.
-    pub fn init(allocator: Allocator, cmd_line_w: [*:0]const u16) InitError!ArgIteratorWindows {
-        const cmd_line = mem.sliceTo(cmd_line_w, 0);
-        const wtf8_len = unicode.calcWtf8Len(cmd_line);
+    pub fn init(allocator: Allocator, cmd_line_w: []const u16) InitError!ArgIteratorWindows {
+        const wtf8_len = unicode.calcWtf8Len(cmd_line_w);
 
         // This buffer must be large enough to contain contiguous NUL-terminated slices
         // of each argument.
@@ -694,7 +693,7 @@ pub const ArgIteratorWindows = struct {
 
         return .{
             .allocator = allocator,
-            .cmd_line = cmd_line,
+            .cmd_line = cmd_line_w,
             .buffer = buffer,
         };
     }
@@ -1151,8 +1150,8 @@ pub const ArgIterator = struct {
         }
         if (native_os == .windows) {
             const cmd_line = std.os.windows.peb().ProcessParameters.CommandLine;
-            const cmd_line_w = cmd_line.Buffer.?[0 .. cmd_line.Length / 2 :0];
-            return ArgIterator{ .inner = try InnerType.init(allocator, cmd_line_w.ptr) };
+            const cmd_line_w = cmd_line.Buffer.?[0 .. cmd_line.Length / 2];
+            return ArgIterator{ .inner = try InnerType.init(allocator, cmd_line_w) };
         }
 
         return ArgIterator{ .inner = InnerType.init() };