From cf88cf2657d721c68055a284e8c498a18639f74c Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Mon, 31 Jan 2022 01:10:50 -0700 Subject: [PATCH 1/2] std: make ArrayHashMap eql function accept an additional param which is the index of the key that already exists in the hash map. This enables the use case of using `AutoArrayHashMap(void, void)` which may seem surprising at first, but is actually pretty handy! This commit includes a proof-of-concept of how I want to use it, with a new InternArena abstraction for stage2 that provides a compact way to store values (and types) in an "internment arena", thus making types stored exactly once (per arena), representable with a single u32 as a reference to a type within an InternArena, and comparable with a simple u32 integer comparison. If both types are in the same InternArena, you can check if they are equal by seeing if their index is the same. What's neat about `AutoArrayHashMap(void, void)` is that it allows us to look up the indexes by key, *without actually storing the keys*. Instead, keys are treated as ephemeral values that are constructed as needed. As a result, we have an extremely efficient encoding of types and values, represented only by three arrays, which has no pointers, and can therefore be serialized and deserialized by a single writev/readv call. The `map` field is denormalized data and can be computed from the other two fields. This is in contrast to our current Type/Value system which makes extensive use of pointers. The test at the bottom of InternArena.zig passes in this commit. --- lib/std/array_hash_map.zig | 38 +++-- lib/std/builtin.zig | 3 + lib/std/hash/auto_hash.zig | 3 +- lib/std/hash_map.zig | 23 ++- src/InternArena.zig | 315 +++++++++++++++++++++++++++++++++++++ 5 files changed, 357 insertions(+), 25 deletions(-) create mode 100644 src/InternArena.zig diff --git a/lib/std/array_hash_map.zig b/lib/std/array_hash_map.zig index 7ebafc0a1b..4359c30083 100644 --- a/lib/std/array_hash_map.zig +++ b/lib/std/array_hash_map.zig @@ -37,8 +37,9 @@ pub const StringContext = struct { _ = self; return hashString(s); } - pub fn eql(self: @This(), a: []const u8, b: []const u8) bool { + pub fn eql(self: @This(), a: []const u8, b: []const u8, b_index: usize) bool { _ = self; + _ = b_index; return eqlString(a, b); } }; @@ -76,7 +77,7 @@ pub fn ArrayHashMap( comptime Context: type, comptime store_hash: bool, ) type { - comptime std.hash_map.verifyContext(Context, K, K, u32); + comptime std.hash_map.verifyContext(Context, K, K, u32, true); return struct { unmanaged: Unmanaged, allocator: Allocator, @@ -462,7 +463,7 @@ pub fn ArrayHashMapUnmanaged( comptime Context: type, comptime store_hash: bool, ) type { - comptime std.hash_map.verifyContext(Context, K, K, u32); + comptime std.hash_map.verifyContext(Context, K, K, u32, true); return struct { /// It is permitted to access this field directly. entries: DataList = .{}, @@ -700,7 +701,7 @@ pub fn ArrayHashMapUnmanaged( const hashes_array = slice.items(.hash); const keys_array = slice.items(.key); for (keys_array) |*item_key, i| { - if (hashes_array[i] == h and checkedEql(ctx, key, item_key.*)) { + if (hashes_array[i] == h and checkedEql(ctx, key, item_key.*, i)) { return GetOrPutResult{ .key_ptr = item_key, // workaround for #6974 @@ -933,7 +934,7 @@ pub fn ArrayHashMapUnmanaged( const hashes_array = slice.items(.hash); const keys_array = slice.items(.key); for (keys_array) |*item_key, i| { - if (hashes_array[i] == h and checkedEql(ctx, key, item_key.*)) { + if (hashes_array[i] == h and checkedEql(ctx, key, item_key.*, i)) { return i; } } @@ -1245,7 +1246,7 @@ pub fn ArrayHashMapUnmanaged( const keys_array = slice.items(.key); for (keys_array) |*item_key, i| { const hash_match = if (store_hash) hashes_array[i] == key_hash else true; - if (hash_match and key_ctx.eql(key, item_key.*)) { + if (hash_match and key_ctx.eql(key, item_key.*, i)) { const removed_entry: KV = .{ .key = keys_array[i], .value = slice.items(.value)[i], @@ -1286,7 +1287,7 @@ pub fn ArrayHashMapUnmanaged( const keys_array = slice.items(.key); for (keys_array) |*item_key, i| { const hash_match = if (store_hash) hashes_array[i] == key_hash else true; - if (hash_match and key_ctx.eql(key, item_key.*)) { + if (hash_match and key_ctx.eql(key, item_key.*, i)) { switch (removal_type) { .swap => self.entries.swapRemove(i), .ordered => self.entries.orderedRemove(i), @@ -1483,8 +1484,9 @@ pub fn ArrayHashMapUnmanaged( // This pointer survives the following append because we call // entries.ensureTotalCapacity before getOrPutInternal. - const hash_match = if (store_hash) h == hashes_array[slot_data.entry_index] else true; - if (hash_match and checkedEql(ctx, key, keys_array[slot_data.entry_index])) { + const i = slot_data.entry_index; + const hash_match = if (store_hash) h == hashes_array[i] else true; + if (hash_match and checkedEql(ctx, key, keys_array[i], i)) { return .{ .found_existing = true, .key_ptr = &keys_array[slot_data.entry_index], @@ -1571,8 +1573,9 @@ pub fn ArrayHashMapUnmanaged( if (slot_data.isEmpty() or slot_data.distance_from_start_index < distance_from_start_index) return null; - const hash_match = if (store_hash) h == hashes_array[slot_data.entry_index] else true; - if (hash_match and checkedEql(ctx, key, keys_array[slot_data.entry_index])) + const i = slot_data.entry_index; + const hash_match = if (store_hash) h == hashes_array[i] else true; + if (hash_match and checkedEql(ctx, key, keys_array[i], i)) return slot; } unreachable; @@ -1624,7 +1627,7 @@ pub fn ArrayHashMapUnmanaged( } inline fn checkedHash(ctx: anytype, key: anytype) u32 { - comptime std.hash_map.verifyContext(@TypeOf(ctx), @TypeOf(key), K, u32); + comptime std.hash_map.verifyContext(@TypeOf(ctx), @TypeOf(key), K, u32, true); // If you get a compile error on the next line, it means that const hash = ctx.hash(key); // your generic hash function doesn't accept your key if (@TypeOf(hash) != u32) { @@ -1633,10 +1636,10 @@ pub fn ArrayHashMapUnmanaged( } return hash; } - inline fn checkedEql(ctx: anytype, a: anytype, b: K) bool { - comptime std.hash_map.verifyContext(@TypeOf(ctx), @TypeOf(a), K, u32); + inline fn checkedEql(ctx: anytype, a: anytype, b: K, b_index: usize) bool { + comptime std.hash_map.verifyContext(@TypeOf(ctx), @TypeOf(a), K, u32, true); // If you get a compile error on the next line, it means that - const eql = ctx.eql(a, b); // your generic eql function doesn't accept (self, adapt key, K) + const eql = ctx.eql(a, b, b_index); // your generic eql function doesn't accept (self, adapt key, K, index) if (@TypeOf(eql) != bool) { @compileError("Context " ++ @typeName(@TypeOf(ctx)) ++ " has a generic eql function that returns the wrong type!\n" ++ @typeName(bool) ++ " was expected, but found " ++ @typeName(@TypeOf(eql))); @@ -2255,9 +2258,10 @@ pub fn getAutoHashFn(comptime K: type, comptime Context: type) (fn (Context, K) }.hash; } -pub fn getAutoEqlFn(comptime K: type, comptime Context: type) (fn (Context, K, K) bool) { +pub fn getAutoEqlFn(comptime K: type, comptime Context: type) (fn (Context, K, K, usize) bool) { return struct { - fn eql(ctx: Context, a: K, b: K) bool { + fn eql(ctx: Context, a: K, b: K, b_index: usize) bool { + _ = b_index; _ = ctx; return meta.eql(a, b); } diff --git a/lib/std/builtin.zig b/lib/std/builtin.zig index 3675a90257..faaa0d8272 100644 --- a/lib/std/builtin.zig +++ b/lib/std/builtin.zig @@ -202,12 +202,14 @@ pub const TypeInfo = union(enum) { /// therefore must be kept in sync with the compiler implementation. pub const Int = struct { signedness: Signedness, + /// TODO make this u16 instead of comptime_int bits: comptime_int, }; /// This data structure is used by the Zig language code generation and /// therefore must be kept in sync with the compiler implementation. pub const Float = struct { + /// TODO make this u16 instead of comptime_int bits: comptime_int, }; @@ -217,6 +219,7 @@ pub const TypeInfo = union(enum) { size: Size, is_const: bool, is_volatile: bool, + /// TODO make this u16 instead of comptime_int alignment: comptime_int, address_space: AddressSpace, child: type, diff --git a/lib/std/hash/auto_hash.zig b/lib/std/hash/auto_hash.zig index 22fd6526f4..089d8f4df2 100644 --- a/lib/std/hash/auto_hash.zig +++ b/lib/std/hash/auto_hash.zig @@ -81,7 +81,6 @@ pub fn hash(hasher: anytype, key: anytype, comptime strat: HashStrategy) void { .NoReturn, .Opaque, .Undefined, - .Void, .Null, .ComptimeFloat, .ComptimeInt, @@ -91,6 +90,8 @@ pub fn hash(hasher: anytype, key: anytype, comptime strat: HashStrategy) void { .Float, => @compileError("unable to hash type " ++ @typeName(Key)), + .Void => return, + // Help the optimizer see that hashing an int is easy by inlining! // TODO Check if the situation is better after #561 is resolved. .Int => { diff --git a/lib/std/hash_map.zig b/lib/std/hash_map.zig index 2c0010211a..53d3f7004d 100644 --- a/lib/std/hash_map.zig +++ b/lib/std/hash_map.zig @@ -131,7 +131,13 @@ pub const default_max_load_percentage = 80; /// If you are passing a context to a *Adapted function, PseudoKey is the type /// of the key parameter. Otherwise, when creating a HashMap or HashMapUnmanaged /// type, PseudoKey = Key = K. -pub fn verifyContext(comptime RawContext: type, comptime PseudoKey: type, comptime Key: type, comptime Hash: type) void { +pub fn verifyContext( + comptime RawContext: type, + comptime PseudoKey: type, + comptime Key: type, + comptime Hash: type, + comptime is_array: bool, +) void { comptime { var allow_const_ptr = false; var allow_mutable_ptr = false; @@ -166,7 +172,9 @@ pub fn verifyContext(comptime RawContext: type, comptime PseudoKey: type, compti const prefix = "\n "; const deep_prefix = prefix ++ " "; const hash_signature = "fn (self, " ++ @typeName(PseudoKey) ++ ") " ++ @typeName(Hash); - const eql_signature = "fn (self, " ++ @typeName(PseudoKey) ++ ", " ++ @typeName(Key) ++ ") bool"; + const index_param = if (is_array) ", b_index: usize" else ""; + const eql_signature = "fn (self, " ++ @typeName(PseudoKey) ++ ", " ++ + @typeName(Key) ++ index_param ++ ") bool"; const err_invalid_hash_signature = prefix ++ @typeName(Context) ++ ".hash must be " ++ hash_signature ++ deep_prefix ++ "but is actually " ++ @typeName(@TypeOf(Context.hash)); const err_invalid_eql_signature = prefix ++ @typeName(Context) ++ ".eql must be " ++ eql_signature ++ @@ -255,7 +263,8 @@ pub fn verifyContext(comptime RawContext: type, comptime PseudoKey: type, compti const info = @typeInfo(@TypeOf(eql)); if (info == .Fn) { const func = info.Fn; - if (func.args.len != 3) { + const args_len = if (is_array) 4 else 3; + if (func.args.len != args_len) { errors = errors ++ lazy.err_invalid_eql_signature; } else { var emitted_signature = false; @@ -360,7 +369,7 @@ pub fn HashMap( comptime Context: type, comptime max_load_percentage: u64, ) type { - comptime verifyContext(Context, K, K, u64); + comptime verifyContext(Context, K, K, u64, false); return struct { unmanaged: Unmanaged, allocator: Allocator, @@ -683,7 +692,7 @@ pub fn HashMapUnmanaged( ) type { if (max_load_percentage <= 0 or max_load_percentage >= 100) @compileError("max_load_percentage must be between 0 and 100."); - comptime verifyContext(Context, K, K, u64); + comptime verifyContext(Context, K, K, u64, false); return struct { const Self = @This(); @@ -1108,7 +1117,7 @@ pub fn HashMapUnmanaged( /// from this function. To encourage that, this function is /// marked as inline. inline fn getIndex(self: Self, key: anytype, ctx: anytype) ?usize { - comptime verifyContext(@TypeOf(ctx), @TypeOf(key), K, Hash); + comptime verifyContext(@TypeOf(ctx), @TypeOf(key), K, Hash, false); if (self.size == 0) { return null; @@ -1291,7 +1300,7 @@ pub fn HashMapUnmanaged( return result; } pub fn getOrPutAssumeCapacityAdapted(self: *Self, key: anytype, ctx: anytype) GetOrPutResult { - comptime verifyContext(@TypeOf(ctx), @TypeOf(key), K, Hash); + comptime verifyContext(@TypeOf(ctx), @TypeOf(key), K, Hash, false); // If you get a compile error on this line, it means that your generic hash // function is invalid for these parameters. diff --git a/src/InternArena.zig b/src/InternArena.zig new file mode 100644 index 0000000000..a8e9c75e50 --- /dev/null +++ b/src/InternArena.zig @@ -0,0 +1,315 @@ +map: std.AutoArrayHashMapUnmanaged(void, void) = .{}, +items: std.MultiArrayList(Item) = .{}, +extra: std.ArrayListUnmanaged(u32) = .{}, + +const InternArena = @This(); +const std = @import("std"); +const Allocator = std.mem.Allocator; +const assert = std.debug.assert; + +const KeyAdapter = struct { + intern_arena: *const InternArena, + + pub fn eql(ctx: @This(), a: Key, b_void: void, b_map_index: usize) bool { + _ = b_void; + return ctx.intern_arena.indexToKey(@intToEnum(Index, b_map_index)).eql(a); + } + + pub fn hash(ctx: @This(), a: Key) u32 { + _ = ctx; + return a.hash(); + } +}; + +pub const Key = union(enum) { + int_type: struct { + signedness: std.builtin.Signedness, + bits: u16, + }, + ptr_type: struct { + elem_type: Index, + sentinel: Index, + alignment: u16, + size: std.builtin.TypeInfo.Pointer.Size, + is_const: bool, + is_volatile: bool, + is_allowzero: bool, + address_space: std.builtin.AddressSpace, + }, + array_type: struct { + len: u64, + child: Index, + sentinel: Index, + }, + vector_type: struct { + len: u32, + child: Index, + }, + optional_type: struct { + payload_type: Index, + }, + error_union_type: struct { + error_set_type: Index, + payload_type: Index, + }, + simple: Simple, + + pub fn hash(key: Key) u32 { + var hasher = std.hash.Wyhash.init(0); + switch (key) { + .int_type => |int_type| { + std.hash.autoHash(&hasher, int_type); + }, + .array_type => |array_type| { + std.hash.autoHash(&hasher, array_type); + }, + else => @panic("TODO"), + } + return @truncate(u32, hasher.final()); + } + + pub fn eql(a: Key, b: Key) bool { + const KeyTag = std.meta.Tag(Key); + const a_tag: KeyTag = a; + const b_tag: KeyTag = b; + if (a_tag != b_tag) return false; + switch (a) { + .int_type => |a_info| { + const b_info = b.int_type; + return std.meta.eql(a_info, b_info); + }, + .array_type => |a_info| { + const b_info = b.array_type; + return std.meta.eql(a_info, b_info); + }, + else => @panic("TODO"), + } + } +}; + +pub const Item = struct { + tag: Tag, + /// The doc comments on the respective Tag explain how to interpret this. + data: u32, +}; + +/// Represents an index into `map`. It represents the canonical index +/// of a `Value` within this `InternArena`. The values are typed. +/// Two values which have the same type can be equality compared simply +/// by checking if their indexes are equal, provided they are both in +/// the same `InternArena`. +pub const Index = enum(u32) { + none = std.math.maxInt(u32), + _, +}; + +pub const Tag = enum(u8) { + /// An integer type. + /// data is number of bits + type_int_signed, + /// An integer type. + /// data is number of bits + type_int_unsigned, + /// An array type. + /// data is payload to Array. + type_array, + /// A type or value that can be represented with only an enum tag. + /// data is Simple enum value + simple, + /// An unsigned integer value that can be represented by u32. + /// data is integer value + int_u32, + /// An unsigned integer value that can be represented by i32. + /// data is integer value bitcasted to u32. + int_i32, + /// A positive integer value that does not fit in 32 bits. + /// data is a extra index to BigInt. + int_big_positive, + /// A negative integer value that does not fit in 32 bits. + /// data is a extra index to BigInt. + int_big_negative, + /// A float value that can be represented by f32. + /// data is float value bitcasted to u32. + float_f32, + /// A float value that can be represented by f64. + /// data is payload index to Float64. + float_f64, + /// A float value that can be represented by f128. + /// data is payload index to Float128. + float_f128, +}; + +pub const Simple = enum(u32) { + f16, + f32, + f64, + f80, + f128, + usize, + isize, + c_short, + c_ushort, + c_int, + c_uint, + c_long, + c_ulong, + c_longlong, + c_ulonglong, + c_longdouble, + anyopaque, + bool, + void, + type, + anyerror, + comptime_int, + comptime_float, + noreturn, + @"anyframe", + null_type, + undefined_type, + enum_literal_type, + @"undefined", + void_value, + @"null", + bool_true, + bool_false, +}; + +pub const Array = struct { + len: u32, + child: Index, +}; + +pub fn deinit(ia: *InternArena, gpa: Allocator) void { + ia.map.deinit(gpa); + ia.items.deinit(gpa); + ia.extra.deinit(gpa); +} + +pub fn indexToKey(ia: InternArena, index: Index) Key { + const data = ia.items.items(.data)[@enumToInt(index)]; + return switch (ia.items.items(.tag)[@enumToInt(index)]) { + .type_int_signed => .{ + .int_type = .{ + .signedness = .signed, + .bits = @intCast(u16, data), + }, + }, + .type_int_unsigned => .{ + .int_type = .{ + .signedness = .unsigned, + .bits = @intCast(u16, data), + }, + }, + .type_array => { + const array_info = ia.extraData(Array, data); + return .{ .array_type = .{ + .len = array_info.len, + .child = array_info.child, + .sentinel = .none, + } }; + }, + .simple => .{ .simple = @intToEnum(Simple, data) }, + + else => @panic("TODO"), + }; +} + +pub fn get(ia: *InternArena, gpa: Allocator, key: Key) Allocator.Error!Index { + const adapter: KeyAdapter = .{ .intern_arena = ia }; + const gop = try ia.map.getOrPutAdapted(gpa, key, adapter); + if (gop.found_existing) { + return @intToEnum(Index, gop.index); + } + switch (key) { + .int_type => |int_type| { + const tag: Tag = switch (int_type.signedness) { + .signed => .type_int_signed, + .unsigned => .type_int_unsigned, + }; + try ia.items.append(gpa, .{ + .tag = tag, + .data = int_type.bits, + }); + }, + .array_type => |array_type| { + const len = @intCast(u32, array_type.len); // TODO have a big_array encoding + assert(array_type.sentinel == .none); // TODO have a sentinel_array encoding + try ia.items.append(gpa, .{ + .tag = .type_array, + .data = try ia.addExtra(gpa, Array{ + .len = len, + .child = array_type.child, + }), + }); + }, + else => @panic("TODO"), + } + return @intToEnum(Index, ia.items.len - 1); +} + +fn addExtra(ia: *InternArena, gpa: Allocator, extra: anytype) Allocator.Error!u32 { + const fields = std.meta.fields(@TypeOf(extra)); + try ia.extra.ensureUnusedCapacity(gpa, fields.len); + return ia.addExtraAssumeCapacity(extra); +} + +fn addExtraAssumeCapacity(ia: *InternArena, extra: anytype) u32 { + const fields = std.meta.fields(@TypeOf(extra)); + const result = @intCast(u32, ia.extra.items.len); + inline for (fields) |field| { + ia.extra.appendAssumeCapacity(switch (field.field_type) { + u32 => @field(extra, field.name), + Index => @enumToInt(@field(extra, field.name)), + i32 => @bitCast(u32, @field(extra, field.name)), + else => @compileError("bad field type"), + }); + } + return result; +} + +fn extraData(ia: InternArena, comptime T: type, index: usize) T { + const fields = std.meta.fields(T); + var i: usize = index; + var result: T = undefined; + inline for (fields) |field| { + @field(result, field.name) = switch (field.field_type) { + u32 => ia.extra.items[i], + Index => @intToEnum(Index, ia.extra.items[i]), + i32 => @bitCast(i32, ia.extra.items[i]), + else => @compileError("bad field type"), + }; + i += 1; + } + return result; +} + +test "basic usage" { + const gpa = std.testing.allocator; + + var ia: InternArena = .{}; + defer ia.deinit(gpa); + + const i32_type = try ia.get(gpa, .{ .int_type = .{ + .signedness = .signed, + .bits = 32, + } }); + const array_i32 = try ia.get(gpa, .{ .array_type = .{ + .len = 10, + .child = i32_type, + .sentinel = .none, + } }); + + const another_i32_type = try ia.get(gpa, .{ .int_type = .{ + .signedness = .signed, + .bits = 32, + } }); + try std.testing.expect(another_i32_type == i32_type); + + const another_array_i32 = try ia.get(gpa, .{ .array_type = .{ + .len = 10, + .child = i32_type, + .sentinel = .none, + } }); + try std.testing.expect(another_array_i32 == array_i32); +} From 39983d7ff524a3e1e25dbd6904e28a6dd11120e6 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Mon, 31 Jan 2022 21:12:37 -0700 Subject: [PATCH 2/2] stage2: update to new ArrayHashMap API --- src/InternArena.zig | 5 +++-- src/codegen/spirv/type.zig | 3 ++- src/type.zig | 3 ++- src/value.zig | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/InternArena.zig b/src/InternArena.zig index a8e9c75e50..5dba21e11c 100644 --- a/src/InternArena.zig +++ b/src/InternArena.zig @@ -187,8 +187,9 @@ pub fn deinit(ia: *InternArena, gpa: Allocator) void { } pub fn indexToKey(ia: InternArena, index: Index) Key { - const data = ia.items.items(.data)[@enumToInt(index)]; - return switch (ia.items.items(.tag)[@enumToInt(index)]) { + const item = ia.items.get(@enumToInt(index)); + const data = item.data; + return switch (item.tag) { .type_int_signed => .{ .int_type = .{ .signedness = .signed, diff --git a/src/codegen/spirv/type.zig b/src/codegen/spirv/type.zig index 3eef2d8040..d9504f4d23 100644 --- a/src/codegen/spirv/type.zig +++ b/src/codegen/spirv/type.zig @@ -157,8 +157,9 @@ pub const Type = extern union { _ = self; return @truncate(u32, t.hashShallow()); } - pub fn eql(self: @This(), a: Type, b: Type) bool { + pub fn eql(self: @This(), a: Type, b: Type, b_index: usize) bool { _ = self; + _ = b_index; return a.eqlShallow(b); } }; diff --git a/src/type.zig b/src/type.zig index 23a741eed0..5632629bff 100644 --- a/src/type.zig +++ b/src/type.zig @@ -798,8 +798,9 @@ pub const Type = extern union { _ = self; return @truncate(u32, t.hash()); } - pub fn eql(self: @This(), a: Type, b: Type) bool { + pub fn eql(self: @This(), a: Type, b: Type, b_index: usize) bool { _ = self; + _ = b_index; return a.eql(b); } }; diff --git a/src/value.zig b/src/value.zig index df1531533b..75c3aac6e7 100644 --- a/src/value.zig +++ b/src/value.zig @@ -1765,7 +1765,8 @@ pub const Value = extern union { const other_context: HashContext = .{ .ty = self.ty }; return @truncate(u32, other_context.hash(val)); } - pub fn eql(self: @This(), a: Value, b: Value) bool { + pub fn eql(self: @This(), a: Value, b: Value, b_index: usize) bool { + _ = b_index; return a.eql(b, self.ty); } };