mirror of
https://github.com/ziglang/zig.git
synced 2025-12-06 06:13:07 +00:00
ArgIteratorWindows: Store last emitted code unit instead of checking the last 6 emitted bytes
Previously, to ensure args were encoded as well-formed WTF-8 (i.e. no encoded surrogate pairs), the code unit would be encoded and then the last 6 emitted bytes would be checked to see if they were a surrogate pair, and this was done for any emitted code unit (although this was not necessary, it should have only been done when emitting a low surrogate).
After this commit, we still want to ensure well-formed WTF-8, but, to do so, the last emitted code point is stored, meaning we can just directly check that the last code unit is a high surrogate and the current code unit is a low surrogate to determine if we have a surrogate pair.
This provides some performance benefit over and above a "use the same strategy as before but only check when we're emitting a low surrogate" implementation:
Benchmark 1 (111 runs): benchargv-master.exe
measurement mean ± σ min … max outliers delta
wall_time 45.2ms ± 532us 44.5ms … 49.4ms 2 ( 2%) 0%
peak_rss 6.49MB ± 3.94KB 6.46MB … 6.49MB 10 ( 9%) 0%
Benchmark 2 (154 runs): benchargv-storelast.exe
measurement mean ± σ min … max outliers delta
wall_time 32.6ms ± 293us 32.2ms … 34.2ms 8 ( 5%) ⚡- 27.8% ± 0.2%
peak_rss 6.49MB ± 5.15KB 6.46MB … 6.49MB 15 (10%) - 0.0% ± 0.0%
Benchmark 3 (131 runs): benchargv-onlylow.exe
measurement mean ± σ min … max outliers delta
wall_time 38.4ms ± 257us 37.9ms … 39.6ms 5 ( 4%) ⚡- 15.1% ± 0.2%
peak_rss 6.49MB ± 5.70KB 6.46MB … 6.49MB 9 ( 7%) - 0.0% ± 0.0%
This commit is contained in:
parent
10914dc310
commit
1418c8a5d4
@ -717,14 +717,20 @@ pub const ArgIteratorWindows = struct {
|
|||||||
|
|
||||||
const eof = null;
|
const eof = null;
|
||||||
|
|
||||||
fn emitBackslashes(self: *ArgIteratorWindows, count: usize) void {
|
/// Returns '\' if any backslashes are emitted, otherwise returns `last_emitted_code_unit`.
|
||||||
for (0..count) |_| emitCharacter(self, '\\');
|
fn emitBackslashes(self: *ArgIteratorWindows, count: usize, last_emitted_code_unit: ?u16) ?u16 {
|
||||||
|
for (0..count) |_| {
|
||||||
|
self.buffer[self.end] = '\\';
|
||||||
|
self.end += 1;
|
||||||
|
}
|
||||||
|
return if (count != 0) '\\' else last_emitted_code_unit;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn emitCharacter(self: *ArgIteratorWindows, code_unit: u16) void {
|
/// If `last_emitted_code_unit` and `code_unit` form a surrogate pair, then
|
||||||
const wtf8_len = std.unicode.wtf8Encode(code_unit, self.buffer[self.end..]) catch unreachable;
|
/// the previously emitted high surrogate is overwritten by the codepoint encoded
|
||||||
self.end += wtf8_len;
|
/// by the surrogate pair, and `null` is returned.
|
||||||
|
/// Otherwise, `code_unit` is emitted and returned.
|
||||||
|
fn emitCharacter(self: *ArgIteratorWindows, code_unit: u16, last_emitted_code_unit: ?u16) ?u16 {
|
||||||
// Because we are emitting WTF-8, we need to
|
// Because we are emitting WTF-8, we need to
|
||||||
// check to see if we've emitted two consecutive surrogate
|
// check to see if we've emitted two consecutive surrogate
|
||||||
// codepoints that form a valid surrogate pair in order
|
// codepoints that form a valid surrogate pair in order
|
||||||
@ -745,28 +751,24 @@ pub const ArgIteratorWindows = struct {
|
|||||||
// and emit the codepoint it encodes, which in this
|
// and emit the codepoint it encodes, which in this
|
||||||
// example is U+10437 (𐐷), which is encoded in UTF-8 as:
|
// example is U+10437 (𐐷), which is encoded in UTF-8 as:
|
||||||
// <0xF0><0x90><0x90><0xB7>
|
// <0xF0><0x90><0x90><0xB7>
|
||||||
concatSurrogatePair(self);
|
if (last_emitted_code_unit != null and
|
||||||
}
|
std.unicode.utf16IsLowSurrogate(code_unit) and
|
||||||
|
std.unicode.utf16IsHighSurrogate(last_emitted_code_unit.?))
|
||||||
|
{
|
||||||
|
const codepoint = std.unicode.utf16DecodeSurrogatePair(&.{ last_emitted_code_unit.?, code_unit }) catch unreachable;
|
||||||
|
|
||||||
fn concatSurrogatePair(self: *ArgIteratorWindows) void {
|
// Unpaired surrogate is 3 bytes long
|
||||||
// Surrogate codepoints are always encoded as 3 bytes, so there
|
const dest = self.buffer[self.end - 3 ..];
|
||||||
// must be 6 bytes for a surrogate pair to exist.
|
const len = unicode.utf8Encode(codepoint, dest) catch unreachable;
|
||||||
if (self.end - self.start >= 6) {
|
// All codepoints that require a surrogate pair (> U+FFFF) are encoded as 4 bytes
|
||||||
const window = self.buffer[self.end - 6 .. self.end];
|
assert(len == 4);
|
||||||
const view = unicode.Wtf8View.init(window) catch return;
|
self.end += 1;
|
||||||
var it = view.iterator();
|
return null;
|
||||||
var pair: [2]u16 = undefined;
|
|
||||||
pair[0] = std.mem.nativeToLittle(u16, std.math.cast(u16, it.nextCodepoint().?) orelse return);
|
|
||||||
if (!unicode.utf16IsHighSurrogate(std.mem.littleToNative(u16, pair[0]))) return;
|
|
||||||
pair[1] = std.mem.nativeToLittle(u16, std.math.cast(u16, it.nextCodepoint().?) orelse return);
|
|
||||||
if (!unicode.utf16IsLowSurrogate(std.mem.littleToNative(u16, pair[1]))) return;
|
|
||||||
// We know we have a valid surrogate pair, so convert
|
|
||||||
// it to UTF-8, overwriting the surrogate pair's bytes
|
|
||||||
// and then chop off the extra bytes.
|
|
||||||
const len = unicode.utf16LeToUtf8(window, &pair) catch unreachable;
|
|
||||||
const delta = 6 - len;
|
|
||||||
self.end -= delta;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const wtf8_len = std.unicode.wtf8Encode(code_unit, self.buffer[self.end..]) catch unreachable;
|
||||||
|
self.end += wtf8_len;
|
||||||
|
return code_unit;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn yieldArg(self: *ArgIteratorWindows) [:0]const u8 {
|
fn yieldArg(self: *ArgIteratorWindows) [:0]const u8 {
|
||||||
@ -783,9 +785,13 @@ pub const ArgIteratorWindows = struct {
|
|||||||
|
|
||||||
const eof = false;
|
const eof = false;
|
||||||
|
|
||||||
fn emitBackslashes(_: *ArgIteratorWindows, _: usize) void {}
|
fn emitBackslashes(_: *ArgIteratorWindows, _: usize, last_emitted_code_unit: ?u16) ?u16 {
|
||||||
|
return last_emitted_code_unit;
|
||||||
|
}
|
||||||
|
|
||||||
fn emitCharacter(_: *ArgIteratorWindows, _: u16) void {}
|
fn emitCharacter(_: *ArgIteratorWindows, _: u16, last_emitted_code_unit: ?u16) ?u16 {
|
||||||
|
return last_emitted_code_unit;
|
||||||
|
}
|
||||||
|
|
||||||
fn yieldArg(_: *ArgIteratorWindows) bool {
|
fn yieldArg(_: *ArgIteratorWindows) bool {
|
||||||
return true;
|
return true;
|
||||||
@ -793,6 +799,7 @@ pub const ArgIteratorWindows = struct {
|
|||||||
};
|
};
|
||||||
|
|
||||||
fn nextWithStrategy(self: *ArgIteratorWindows, comptime strategy: type) strategy.T {
|
fn nextWithStrategy(self: *ArgIteratorWindows, comptime strategy: type) strategy.T {
|
||||||
|
var last_emitted_code_unit: ?u16 = null;
|
||||||
// The first argument (the executable name) uses different parsing rules.
|
// The first argument (the executable name) uses different parsing rules.
|
||||||
if (self.index == 0) {
|
if (self.index == 0) {
|
||||||
if (self.cmd_line.len == 0 or self.cmd_line[0] == 0) {
|
if (self.cmd_line.len == 0 or self.cmd_line[0] == 0) {
|
||||||
@ -815,15 +822,15 @@ pub const ArgIteratorWindows = struct {
|
|||||||
inside_quotes = !inside_quotes;
|
inside_quotes = !inside_quotes;
|
||||||
},
|
},
|
||||||
' ', '\t' => {
|
' ', '\t' => {
|
||||||
if (inside_quotes)
|
if (inside_quotes) {
|
||||||
strategy.emitCharacter(self, char)
|
last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
|
||||||
else {
|
} else {
|
||||||
self.index += 1;
|
self.index += 1;
|
||||||
return strategy.yieldArg(self);
|
return strategy.yieldArg(self);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
else => {
|
else => {
|
||||||
strategy.emitCharacter(self, char);
|
last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -861,29 +868,28 @@ pub const ArgIteratorWindows = struct {
|
|||||||
0;
|
0;
|
||||||
switch (char) {
|
switch (char) {
|
||||||
0 => {
|
0 => {
|
||||||
strategy.emitBackslashes(self, backslash_count);
|
last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count, last_emitted_code_unit);
|
||||||
return strategy.yieldArg(self);
|
return strategy.yieldArg(self);
|
||||||
},
|
},
|
||||||
' ', '\t' => {
|
' ', '\t' => {
|
||||||
strategy.emitBackslashes(self, backslash_count);
|
last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count, last_emitted_code_unit);
|
||||||
backslash_count = 0;
|
backslash_count = 0;
|
||||||
if (inside_quotes)
|
if (inside_quotes) {
|
||||||
strategy.emitCharacter(self, char)
|
last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
|
||||||
else
|
} else return strategy.yieldArg(self);
|
||||||
return strategy.yieldArg(self);
|
|
||||||
},
|
},
|
||||||
'"' => {
|
'"' => {
|
||||||
const char_is_escaped_quote = backslash_count % 2 != 0;
|
const char_is_escaped_quote = backslash_count % 2 != 0;
|
||||||
strategy.emitBackslashes(self, backslash_count / 2);
|
last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count / 2, last_emitted_code_unit);
|
||||||
backslash_count = 0;
|
backslash_count = 0;
|
||||||
if (char_is_escaped_quote) {
|
if (char_is_escaped_quote) {
|
||||||
strategy.emitCharacter(self, '"');
|
last_emitted_code_unit = strategy.emitCharacter(self, '"', last_emitted_code_unit);
|
||||||
} else {
|
} else {
|
||||||
if (inside_quotes and
|
if (inside_quotes and
|
||||||
self.index + 1 != self.cmd_line.len and
|
self.index + 1 != self.cmd_line.len and
|
||||||
mem.littleToNative(u16, self.cmd_line[self.index + 1]) == '"')
|
mem.littleToNative(u16, self.cmd_line[self.index + 1]) == '"')
|
||||||
{
|
{
|
||||||
strategy.emitCharacter(self, '"');
|
last_emitted_code_unit = strategy.emitCharacter(self, '"', last_emitted_code_unit);
|
||||||
self.index += 1;
|
self.index += 1;
|
||||||
} else {
|
} else {
|
||||||
inside_quotes = !inside_quotes;
|
inside_quotes = !inside_quotes;
|
||||||
@ -894,9 +900,9 @@ pub const ArgIteratorWindows = struct {
|
|||||||
backslash_count += 1;
|
backslash_count += 1;
|
||||||
},
|
},
|
||||||
else => {
|
else => {
|
||||||
strategy.emitBackslashes(self, backslash_count);
|
last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count, last_emitted_code_unit);
|
||||||
backslash_count = 0;
|
backslash_count = 0;
|
||||||
strategy.emitCharacter(self, char);
|
last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user