From 69f0a5587d0db07546e91968b21135fdef856136 Mon Sep 17 00:00:00 2001 From: Jonathan Marler Date: Fri, 4 Feb 2022 12:08:38 -0700 Subject: [PATCH] remove extra storage from EnvMap on windows --- lib/std/child_process.zig | 14 +- lib/std/process.zig | 443 ++++++++++---------------------------- 2 files changed, 122 insertions(+), 335 deletions(-) diff --git a/lib/std/child_process.zig b/lib/std/child_process.zig index 48f0776465..0bb737decb 100644 --- a/lib/std/child_process.zig +++ b/lib/std/child_process.zig @@ -1245,7 +1245,7 @@ pub fn createWindowsEnvBlock(allocator: mem.Allocator, env_map: *const EnvMap) ! while (it.next()) |pair| { // +1 for '=' // +1 for null byte - max_chars_needed += pair.name.len + pair.value.len + 2; + max_chars_needed += pair.key_ptr.len + pair.value_ptr.len + 2; } break :x max_chars_needed; }; @@ -1255,10 +1255,10 @@ pub fn createWindowsEnvBlock(allocator: mem.Allocator, env_map: *const EnvMap) ! var it = env_map.iterator(); var i: usize = 0; while (it.next()) |pair| { - i += try unicode.utf8ToUtf16Le(result[i..], pair.name); + i += try unicode.utf8ToUtf16Le(result[i..], pair.key_ptr.*); result[i] = '='; i += 1; - i += try unicode.utf8ToUtf16Le(result[i..], pair.value); + i += try unicode.utf8ToUtf16Le(result[i..], pair.value_ptr.*); result[i] = 0; i += 1; } @@ -1280,10 +1280,10 @@ pub fn createNullDelimitedEnvMap(arena: mem.Allocator, env_map: *const EnvMap) ! var it = env_map.iterator(); var i: usize = 0; while (it.next()) |pair| : (i += 1) { - const env_buf = try arena.allocSentinel(u8, pair.name.len + pair.value.len + 1, 0); - mem.copy(u8, env_buf, pair.name); - env_buf[pair.name.len] = '='; - mem.copy(u8, env_buf[pair.name.len + 1 ..], pair.value); + const env_buf = try arena.allocSentinel(u8, pair.key_ptr.len + pair.value_ptr.len + 1, 0); + mem.copy(u8, env_buf, pair.key_ptr.*); + env_buf[pair.key_ptr.len] = '='; + mem.copy(u8, env_buf[pair.key_ptr.len + 1 ..], pair.value_ptr.*); envp_buf[i] = env_buf.ptr; } assert(i == envp_count); diff --git a/lib/std/process.zig b/lib/std/process.zig index 712c8e9317..0b891bbdf5 100644 --- a/lib/std/process.zig +++ b/lib/std/process.zig @@ -52,341 +52,128 @@ test "getCwdAlloc" { testing.allocator.free(cwd); } -/// EnvMap for Windows that handles Unicode-aware case insensitivity for lookups, while also -/// providing the canonical environment variable names when iterating. -/// -/// Allows for zero-allocation lookups (even though it needs to do UTF-8 -> UTF-16 -> uppercase -/// conversions) by allocating a buffer large enough to fit the largest environment variable -/// name, and using that when doing lookups (i.e. anything that overflows the buffer can be treated -/// as the environment variable not being found). -pub const EnvMapWindows = struct { - allocator: Allocator, - /// Keys are UTF-16le stored as []const u8 - uppercased_map: std.StringHashMapUnmanaged(EnvValue), - /// Buffer for converting to uppercased UTF-16 on key lookups - /// Must call `reallocUppercaseBuf` before doing any lookups after a `put` call. - uppercase_buf_utf16: []u16 = &[_]u16{}, - max_name_utf16_length: usize = 0, - - pub const EnvValue = struct { - value: []const u8, - canonical_name: []const u8, - }; - - const Self = @This(); - - /// Deinitialize with `deinit`. - pub fn init(allocator: Allocator) Self { - return .{ - .allocator = allocator, - .uppercased_map = std.StringHashMapUnmanaged(EnvValue){}, - }; - } - - pub fn deinit(self: *Self) void { - var it = self.uppercased_map.iterator(); - while (it.next()) |entry| { - self.allocator.free(entry.key_ptr.*); - self.allocator.free(entry.value_ptr.value); - self.allocator.free(entry.value_ptr.canonical_name); - } - self.uppercased_map.deinit(self.allocator); - self.allocator.free(self.uppercase_buf_utf16); - } - - /// Increases the size of the uppercase buffer if the maximum name size has increased. - /// Must be called before any `get` calls after any number of `put` calls. - pub fn reallocUppercaseBuf(self: *Self) !void { - if (self.max_name_utf16_length > self.uppercase_buf_utf16.len) { - self.uppercase_buf_utf16 = try self.allocator.realloc(self.uppercase_buf_utf16, self.max_name_utf16_length); - } - } - - /// Converts `src` to uppercase using `RtlUpcaseUnicodeString` and puts the result in `dest`. - /// Returns the length of the converted UTF-16 string. `dest.len` must be >= `src.len`. - /// - /// Note: As of now, RtlUpcaseUnicodeString does not seem to handle codepoints above 0x10000 - /// (i.e. those that require a surrogate pair), so this function will always return a length - /// equal to `src.len`. However, if RtlUpcaseUnicodeString is updated to handle codepoints above - /// 0x10000, this property would still hold unless there are lowercase <-> uppercase conversions - /// that cross over the boundary between codepoints >= 0x10000 and < 0x10000. - /// TODO: Is it feasible that Unicode lowercase <-> uppercase conversions could cross that boundary? - fn uppercaseName(dest: []u16, src: []const u16) u16 { - assert(dest.len >= src.len); - - const dest_bytes = @intCast(u16, dest.len * 2); - var dest_string = os.windows.UNICODE_STRING{ - .Length = dest_bytes, - .MaximumLength = dest_bytes, - .Buffer = @intToPtr([*]u16, @ptrToInt(dest.ptr)), - }; - const src_bytes = @intCast(u16, src.len * 2); - const src_string = os.windows.UNICODE_STRING{ - .Length = src_bytes, - .MaximumLength = src_bytes, - .Buffer = @intToPtr([*]u16, @ptrToInt(src.ptr)), - }; - const rc = os.windows.ntdll.RtlUpcaseUnicodeString(&dest_string, &src_string, os.windows.FALSE); - switch (rc) { - .SUCCESS => return dest_string.Length / 2, - else => unreachable, // we are not allocating, so no errors should be possible - } - } - - /// Note: Does not realloc the uppercase buf to allow for calling put for many variables and - /// only allocating the uppercase buf afterwards. - pub fn putUtf8(self: *Self, name: []const u8, value: []const u8) !void { - const uppercased_len = len: { - const name_uppercased_utf16 = uppercased: { - var name_utf16_buf = try std.ArrayListAligned(u8, @alignOf(u16)).initCapacity(self.allocator, name.len); - errdefer name_utf16_buf.deinit(); - - const bytes_written = try std.unicode.utf8ToUtf16LeWriter(name_utf16_buf.writer(), name); - var name_utf16 = name_utf16_buf.items[0..bytes_written]; - - // uppercase in place - var name_uppercased_utf16 = std.mem.bytesAsSlice(u16, name_utf16); - const uppercased_len = uppercaseName(name_uppercased_utf16, name_uppercased_utf16); - assert(uppercased_len == name_uppercased_utf16.len); - - break :uppercased name_utf16_buf.toOwnedSlice(); - }; - errdefer self.allocator.free(name_uppercased_utf16); - - const name_canonical = try self.allocator.dupe(u8, name); - errdefer self.allocator.free(name_canonical); - - const value_dupe = try self.allocator.dupe(u8, value); - errdefer self.allocator.free(value_dupe); - - const get_or_put = try self.uppercased_map.getOrPut(self.allocator, name_uppercased_utf16); - if (get_or_put.found_existing) { - // note: this is only safe from UAF because the errdefer that frees this value above - // no longer has a possibility of being triggered after this point - self.allocator.free(name_uppercased_utf16); - self.allocator.free(get_or_put.value_ptr.value); - self.allocator.free(get_or_put.value_ptr.canonical_name); - } else { - get_or_put.key_ptr.* = name_uppercased_utf16; - } - get_or_put.value_ptr.value = value_dupe; - get_or_put.value_ptr.canonical_name = name_canonical; - - break :len name_uppercased_utf16.len; - }; - - // The buffer for case conversion for key lookups will need to be as big as the largest - // key stored in the hash map. - self.max_name_utf16_length = @maximum(self.max_name_utf16_length, uppercased_len); - } - - /// Asserts that the name does not already exist in the map. - /// Note: Does not realloc the uppercase buf to allow for calling put for many variables and - /// only allocating the uppercase buf afterwards. - pub fn putUtf16NoClobber(self: *Self, name_utf16: []const u16, value_utf16: []const u16) !void { - const uppercased_len = len: { - const name_canonical = try std.unicode.utf16leToUtf8Alloc(self.allocator, name_utf16); - errdefer self.allocator.free(name_canonical); - - const value = try std.unicode.utf16leToUtf8Alloc(self.allocator, value_utf16); - errdefer self.allocator.free(value); - - const name_uppercased_utf16 = try self.allocator.alloc(u16, name_utf16.len); - errdefer self.allocator.free(name_uppercased_utf16); - - const uppercased_len = uppercaseName(name_uppercased_utf16, name_utf16); - assert(uppercased_len == name_uppercased_utf16.len); - - try self.uppercased_map.putNoClobber(self.allocator, std.mem.sliceAsBytes(name_uppercased_utf16), EnvValue{ - .value = value, - .canonical_name = name_canonical, - }); - break :len name_uppercased_utf16.len; - }; - - // The buffer for case conversion for key lookups will need to be as big as the largest - // key stored in the hash map. - self.max_name_utf16_length = @maximum(self.max_name_utf16_length, uppercased_len); - } - - /// Attempts to convert a UTF-8 name into a uppercased UTF-16le name for a lookup. If the - /// name cannot be converted, this function will return `null`. - fn utf8ToUppercasedUtf16(self: Self, name: []const u8) ?[]u16 { - const name_utf16: []u16 = to_utf16: { - var utf16_buf_stream = std.io.fixedBufferStream(std.mem.sliceAsBytes(self.uppercase_buf_utf16)); - _ = std.unicode.utf8ToUtf16LeWriter(utf16_buf_stream.writer(), name) catch |err| switch (err) { - // If the buffer isn't large enough, we can treat that as 'env var not found', as we - // know anything too large for the buffer can't be found in the map. - error.NoSpaceLeft => return null, - // Anything with invalid UTF-8 will also not be found in the map, so treat that as - // 'env var not found' too - error.InvalidUtf8 => return null, - }; - break :to_utf16 std.mem.bytesAsSlice(u16, utf16_buf_stream.getWritten()); - }; - - // uppercase in place - const uppercased_len = uppercaseName(name_utf16, name_utf16); - assert(uppercased_len == name_utf16.len); - - return name_utf16; - } - - /// Returns true if an entry was found and deleted, false otherwise. - pub fn remove(self: *Self, name: []const u8) bool { - const name_utf16 = self.utf8ToUppercasedUtf16(name) orelse return false; - const kv = self.uppercased_map.fetchRemove(std.mem.sliceAsBytes(name_utf16)) orelse return false; - self.allocator.free(kv.key); - self.allocator.free(kv.value.value); - self.allocator.free(kv.value.canonical_name); - return true; - } - - pub fn get(self: Self, name: []const u8) ?EnvValue { - const name_utf16 = self.utf8ToUppercasedUtf16(name) orelse return null; - return self.uppercased_map.get(std.mem.sliceAsBytes(name_utf16)); - } - - pub fn count(self: Self) EnvMap.Size { - return self.uppercased_map.count(); - } - - pub fn iterator(self: *const Self) Iterator { - return .{ - .env_map = self, - .uppercased_map_iterator = self.uppercased_map.iterator(), - }; - } - - pub const Iterator = struct { - env_map: *const Self, - uppercased_map_iterator: std.StringHashMapUnmanaged(EnvValue).Iterator, - - pub fn next(it: *Iterator) ?EnvMap.Entry { - if (it.uppercased_map_iterator.next()) |uppercased_entry| { - return EnvMap.Entry{ - .name = uppercased_entry.value_ptr.canonical_name, - .value = uppercased_entry.value_ptr.value, - }; - } else { - return null; - } - } - }; -}; - -test "EnvMapWindows" { - if (builtin.os.tag != .windows) return error.SkipZigTest; - - var env_map = EnvMapWindows.init(testing.allocator); - defer env_map.deinit(); - - // both put methods - try env_map.putUtf16NoClobber(std.unicode.utf8ToUtf16LeStringLiteral("Path"), std.unicode.utf8ToUtf16LeStringLiteral("something")); - try env_map.putUtf8("КИРиллИЦА", "something else"); - try env_map.reallocUppercaseBuf(); - - try testing.expectEqual(@as(EnvMap.Size, 2), env_map.count()); - - // unicode-aware case-insensitive lookups - try testing.expectEqualStrings("something", env_map.get("PATH").?.value); - try testing.expectEqualStrings("something else", env_map.get("кириллица").?.value); - try testing.expect(env_map.get("missing") == null); - - // canonical names when iterating - var it = env_map.iterator(); - var count: EnvMap.Size = 0; - while (it.next()) |entry| { - const is_an_expected_name = std.mem.eql(u8, "Path", entry.name) or std.mem.eql(u8, "КИРиллИЦА", entry.name); - try testing.expect(is_an_expected_name); - count += 1; - } - try testing.expectEqual(@as(EnvMap.Size, 2), count); -} - pub const EnvMap = struct { - storage: StorageType, + hash_map: HashMap, - pub const StorageType = switch (builtin.os.tag) { - .windows => EnvMapWindows, - else => std.BufMap, - }; + const HashMap = std.HashMap( + []const u8, + []const u8, + EnvNameHashContext, + std.hash_map.default_max_load_percentage, + ); - pub const Size = std.BufMap.BufMapHashMap.Size; - - const Self = @This(); - - /// Deinitialize with `deinit`. - pub fn init(allocator: Allocator) Self { - return Self{ .storage = StorageType.init(allocator) }; - } - - pub fn deinit(self: *Self) void { - self.storage.deinit(); - } - - pub fn get(self: Self, name: []const u8) ?[]const u8 { - switch (builtin.os.tag) { - .windows => { - if (self.storage.get(name)) |entry| { - return entry.value; - } else { - return null; + pub const EnvNameHashContext = struct { + pub fn hash(self: @This(), s: []const u8) u64 { + _ = self; + if (builtin.os.tag == .windows) { + const h = std.hash.Wyhash.init(0); + // TODO: improve this, instead of iterating over ascii, + // iterate over with unicode + for (s) |c| { + var s_upper = [_]u8 { std.ascii.toLower(c) }; + h.update(s_upper); } - }, - else => return self.storage.get(name), - } - } - - pub fn count(self: Self) Size { - return self.storage.count(); - } - - pub fn iterator(self: *const Self) Iterator { - return .{ .storage_iterator = self.storage.iterator() }; - } - - pub fn put(self: *Self, name: []const u8, value: []const u8) !void { - switch (builtin.os.tag) { - .windows => { - try self.storage.putUtf8(name, value); - try self.storage.reallocUppercaseBuf(); - }, - else => return self.storage.put(name, value), - } - } - - pub fn remove(self: *Self, name: []const u8) void { - _ = self.storage.remove(name); - } - - pub const Entry = struct { - name: []const u8, - value: []const u8, - }; - - pub const Iterator = struct { - storage_iterator: switch (builtin.os.tag) { - .windows => EnvMapWindows.Iterator, - else => std.BufMap.BufMapHashMap.Iterator, - }, - - pub fn next(it: *Iterator) ?Entry { - switch (builtin.os.tag) { - .windows => return it.storage_iterator.next(), - else => { - if (it.storage_iterator.next()) |entry| { - return Entry{ - .name = entry.key_ptr.*, - .value = entry.value_ptr.*, - }; - } else { - return null; - } - }, + return h.final(); } + return std.hash_map.hashString(s); + } + pub fn eql(self: @This(), a: []const u8, b: []const u8) bool { + _ = self; + if (builtin.os.tag == .windows) { + // TODO: improve this, instead of comparing ascii + // compare with unicode + return std.ascii.eqlIgnoreCase(a, b); + } + return std.hash_map.eqlString(a, b); } }; + + /// Create a EnvMap backed by a specific allocator. + /// That allocator will be used for both backing allocations + /// and string deduplication. + pub fn init(allocator: Allocator) EnvMap { + return EnvMap{ .hash_map = HashMap.init(allocator) }; + } + + /// Free the backing storage of the map, as well as all + /// of the stored keys and values. + pub fn deinit(self: *EnvMap) void { + var it = self.hash_map.iterator(); + while (it.next()) |entry| { + self.free(entry.key_ptr.*); + self.free(entry.value_ptr.*); + } + + self.hash_map.deinit(); + } + + /// Same as `put` but the key and value become owned by the EnvMap rather + /// than being copied. + /// If `putMove` fails, the ownership of key and value does not transfer. + pub fn putMove(self: *EnvMap, key: []u8, value: []u8) !void { + const get_or_put = try self.hash_map.getOrPut(key); + if (get_or_put.found_existing) { + self.free(get_or_put.key_ptr.*); + self.free(get_or_put.value_ptr.*); + get_or_put.key_ptr.* = key; + } + get_or_put.value_ptr.* = value; + } + + /// `key` and `value` are copied into the EnvMap. + pub fn put(self: *EnvMap, key: []const u8, value: []const u8) !void { + const value_copy = try self.copy(value); + errdefer self.free(value_copy); + const get_or_put = try self.hash_map.getOrPut(key); + if (get_or_put.found_existing) { + self.free(get_or_put.value_ptr.*); + } else { + get_or_put.key_ptr.* = self.copy(key) catch |err| { + _ = self.hash_map.remove(key); + return err; + }; + } + get_or_put.value_ptr.* = value_copy; + } + + /// Find the address of the value associated with a key. + /// The returned pointer is invalidated if the map resizes. + pub fn getPtr(self: EnvMap, key: []const u8) ?*[]const u8 { + return self.hash_map.getPtr(key); + } + + /// Return the map's copy of the value associated with + /// a key. The returned string is invalidated if this + /// key is removed from the map. + pub fn get(self: EnvMap, key: []const u8) ?[]const u8 { + return self.hash_map.get(key); + } + + /// Removes the item from the map and frees its value. + /// This invalidates the value returned by get() for this key. + pub fn remove(self: *EnvMap, key: []const u8) void { + const kv = self.hash_map.fetchRemove(key) orelse return; + self.free(kv.key); + self.free(kv.value); + } + + /// Returns the number of KV pairs stored in the map. + pub fn count(self: EnvMap) HashMap.Size { + return self.hash_map.count(); + } + + /// Returns an iterator over entries in the map. + pub fn iterator(self: *const EnvMap) HashMap.Iterator { + return self.hash_map.iterator(); + } + + fn free(self: EnvMap, value: []const u8) void { + self.hash_map.allocator.free(value); + } + + fn copy(self: EnvMap, value: []const u8) ![]u8 { + return self.hash_map.allocator.dupe(u8, value); + } }; test "EnvMap" {