From 818fbd9c567b907031c44961e4299ce1f9059be6 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Tue, 24 May 2022 00:10:56 -0700 Subject: [PATCH 1/2] stage2: string literal interning This is a temporary addition to stage2 in order to match stage1 behavior, however the end-game once the lang spec is settled will be to use a global InternPool for comptime memoized objects, making this behavior consistent across all types, not only string literals. Or, we might decide to not guarantee string literals to have equal comptime pointers, in which case this commit can be reverted. --- src/Module.zig | 67 +++++++++++++++++++++++++++--- src/Sema.zig | 92 ++++++++++++++++++++++++++++++++++++------ src/TypedValue.zig | 5 +++ src/codegen.zig | 16 +++++++- src/codegen/llvm.zig | 32 ++++++++++++++- src/value.zig | 41 +++++++++++++++++++ test/behavior/eval.zig | 13 +++++- 7 files changed, 242 insertions(+), 24 deletions(-) diff --git a/src/Module.zig b/src/Module.zig index 11549ccda6..24495d8591 100644 --- a/src/Module.zig +++ b/src/Module.zig @@ -70,6 +70,17 @@ import_table: std.StringArrayHashMapUnmanaged(*File) = .{}, /// Keys are fully resolved file paths. This table owns the keys and values. embed_table: std.StringHashMapUnmanaged(*EmbedFile) = .{}, +/// This is a temporary addition to stage2 in order to match stage1 behavior, +/// however the end-game once the lang spec is settled will be to use a global +/// InternPool for comptime memoized objects, making this behavior consistent across all types, +/// not only string literals. Or, we might decide to not guarantee string literals +/// to have equal comptime pointers, in which case this field can be deleted (perhaps +/// the commit that introduced it can simply be reverted). +/// This table uses an optional index so that when a Decl is destroyed, the string literal +/// is still reclaimable by a future Decl. +string_literal_table: std.HashMapUnmanaged(StringLiteralContext.Key, Decl.OptionalIndex, StringLiteralContext, std.hash_map.default_max_load_percentage) = .{}, +string_literal_bytes: std.ArrayListUnmanaged(u8) = .{}, + /// The set of all the generic function instantiations. This is used so that when a generic /// function is called twice with the same comptime parameter arguments, both calls dispatch /// to the same function. 
@@ -157,6 +168,39 @@ decls_free_list: std.ArrayListUnmanaged(Decl.Index) = .{}, global_assembly: std.AutoHashMapUnmanaged(Decl.Index, []u8) = .{}, +pub const StringLiteralContext = struct { + bytes: *std.ArrayListUnmanaged(u8), + + pub const Key = struct { + index: u32, + len: u32, + }; + + pub fn eql(self: @This(), a: Key, b: Key) bool { + _ = self; + return a.index == b.index and a.len == b.len; + } + + pub fn hash(self: @This(), x: Key) u64 { + const x_slice = self.bytes.items[x.index..][0..x.len]; + return std.hash_map.hashString(x_slice); + } +}; + +pub const StringLiteralAdapter = struct { + bytes: *std.ArrayListUnmanaged(u8), + + pub fn eql(self: @This(), a_slice: []const u8, b: StringLiteralContext.Key) bool { + const b_slice = self.bytes.items[b.index..][0..b.len]; + return mem.eql(u8, a_slice, b_slice); + } + + pub fn hash(self: @This(), adapted_key: []const u8) u64 { + _ = self; + return std.hash_map.hashString(adapted_key); + } +}; + const MonomorphedFuncsSet = std.HashMapUnmanaged( *Fn, void, @@ -507,7 +551,8 @@ pub const Decl = struct { decl.name = undefined; } - pub fn clearValues(decl: *Decl, gpa: Allocator) void { + pub fn clearValues(decl: *Decl, mod: *Module) void { + const gpa = mod.gpa; if (decl.getExternFn()) |extern_fn| { extern_fn.deinit(gpa); gpa.destroy(extern_fn); @@ -521,6 +566,13 @@ pub const Decl = struct { gpa.destroy(variable); } if (decl.value_arena) |arena_state| { + if (decl.owns_tv) { + if (decl.val.castTag(.str_lit)) |str_lit| { + mod.string_literal_table.getPtrContext(str_lit.data, .{ + .bytes = &mod.string_literal_bytes, + }).?.* = .none; + } + } arena_state.promote(gpa).deinit(); decl.value_arena = null; decl.has_tv = false; @@ -2839,6 +2891,9 @@ pub fn deinit(mod: *Module) void { mod.decls_free_list.deinit(gpa); mod.allocated_decls.deinit(gpa); mod.global_assembly.deinit(gpa); + + mod.string_literal_table.deinit(gpa); + mod.string_literal_bytes.deinit(gpa); } pub fn destroyDecl(mod: *Module, decl_index: Decl.Index) void { @@ -2857,7 +2912,7 @@ pub fn destroyDecl(mod: *Module, decl_index: Decl.Index) void { if (decl.getInnerNamespace()) |namespace| { namespace.destroyDecls(mod); } - decl.clearValues(gpa); + decl.clearValues(mod); } decl.dependants.deinit(gpa); decl.dependencies.deinit(gpa); @@ -4034,7 +4089,7 @@ fn semaDecl(mod: *Module, decl_index: Decl.Index) !bool { if (decl.getFunction()) |prev_func| { prev_is_inline = prev_func.state == .inline_only; } - decl.clearValues(gpa); + decl.clearValues(mod); } decl.ty = try decl_tv.ty.copy(decl_arena_allocator); @@ -4080,7 +4135,7 @@ fn semaDecl(mod: *Module, decl_index: Decl.Index) !bool { var type_changed = true; if (decl.has_tv) { type_changed = !decl.ty.eql(decl_tv.ty, mod); - decl.clearValues(gpa); + decl.clearValues(mod); } decl.owns_tv = false; @@ -4694,7 +4749,7 @@ pub fn clearDecl( if (decl.getInnerNamespace()) |namespace| { try namespace.deleteAllDecls(mod, outdated_decls); } - decl.clearValues(gpa); + decl.clearValues(mod); } if (decl.deletion_flag) { @@ -5623,7 +5678,7 @@ pub fn populateTestFunctions(mod: *Module) !void { // Since we are replacing the Decl's value we must perform cleanup on the // previous value. 
- decl.clearValues(gpa); + decl.clearValues(mod); decl.ty = new_ty; decl.val = new_val; decl.has_tv = true; diff --git a/src/Sema.zig b/src/Sema.zig index cb34fd158f..d3fca6d2b2 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -3842,20 +3842,44 @@ fn zirStr(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Ins fn addStrLit(sema: *Sema, block: *Block, zir_bytes: []const u8) CompileError!Air.Inst.Ref { // `zir_bytes` references memory inside the ZIR module, which can get deallocated // after semantic analysis is complete, for example in the case of the initialization - // expression of a variable declaration. We need the memory to be in the new - // anonymous Decl's arena. - var anon_decl = try block.startAnonDecl(LazySrcLoc.unneeded); - defer anon_decl.deinit(); + // expression of a variable declaration. + const mod = sema.mod; + const gpa = sema.gpa; + const string_bytes = &mod.string_literal_bytes; + const StringLiteralAdapter = Module.StringLiteralAdapter; + const StringLiteralContext = Module.StringLiteralContext; + try string_bytes.ensureUnusedCapacity(gpa, zir_bytes.len); + const gop = try mod.string_literal_table.getOrPutContextAdapted(gpa, zir_bytes, StringLiteralAdapter{ + .bytes = string_bytes, + }, StringLiteralContext{ + .bytes = string_bytes, + }); + if (!gop.found_existing) { + gop.key_ptr.* = .{ + .index = @intCast(u32, string_bytes.items.len), + .len = @intCast(u32, zir_bytes.len), + }; + string_bytes.appendSliceAssumeCapacity(zir_bytes); + gop.value_ptr.* = .none; + } + const decl_index = gop.value_ptr.unwrap() orelse di: { + var anon_decl = try block.startAnonDecl(LazySrcLoc.unneeded); + defer anon_decl.deinit(); - const bytes = try anon_decl.arena().dupeZ(u8, zir_bytes); + const decl_index = try anon_decl.finish( + try Type.Tag.array_u8_sentinel_0.create(anon_decl.arena(), gop.key_ptr.len), + try Value.Tag.str_lit.create(anon_decl.arena(), gop.key_ptr.*), + 0, // default alignment + ); - const new_decl = try anon_decl.finish( - try Type.Tag.array_u8_sentinel_0.create(anon_decl.arena(), bytes.len), - try Value.Tag.bytes.create(anon_decl.arena(), bytes[0 .. bytes.len + 1]), - 0, // default alignment - ); + // Needed so that `Decl.clearValues` will additionally set the corresponding + // string literal table value back to `Decl.OptionalIndex.none`. + mod.declPtr(decl_index).owns_tv = true; - return sema.analyzeDeclRef(new_decl); + gop.value_ptr.* = decl_index.toOptional(); + break :di decl_index; + }; + return sema.analyzeDeclRef(decl_index); } fn zirInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref { @@ -19762,6 +19786,35 @@ fn beginComptimePtrMutation( .ty = elem_ty, }; }, + .str_lit => { + // An array is memory-optimized to store a slice of bytes, but we are about + // to modify an individual field and the representation has to change. + // If we wanted to avoid this, there would need to be special detection + // elsewhere to identify when writing a value to an array element that is stored + // using the `str_lit` tag, and handle it without making a call to this function. 
+ const arena = parent.beginArena(sema.mod); + defer parent.finishArena(sema.mod); + + const str_lit = parent.val.castTag(.str_lit).?.data; + const dest_len = parent.ty.arrayLenIncludingSentinel(); + const bytes = sema.mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len]; + const elems = try arena.alloc(Value, @intCast(usize, dest_len)); + for (bytes) |byte, i| { + elems[i] = try Value.Tag.int_u64.create(arena, byte); + } + if (parent.ty.sentinel()) |sent_val| { + assert(elems.len == bytes.len + 1); + elems[bytes.len] = sent_val; + } + + parent.val.* = try Value.Tag.aggregate.create(arena, elems); + + return ComptimePtrMutationKit{ + .decl_ref_mut = parent.decl_ref_mut, + .val = &elems[elem_ptr.index], + .ty = elem_ty, + }; + }, .repeated => { // An array is memory-optimized to store only a single element value, and // that value is understood to be the same for the entire length of the array. @@ -20097,10 +20150,23 @@ fn beginComptimePtrLoad( } } - deref.pointee = if (elem_ptr.index < check_len) TypedValue{ + if (elem_ptr.index >= check_len) { + deref.pointee = null; + break :blk deref; + } + if (elem_ptr.index == check_len - 1) { + if (array_tv.ty.sentinel()) |sent| { + deref.pointee = TypedValue{ + .ty = elem_ty, + .val = sent, + }; + break :blk deref; + } + } + deref.pointee = TypedValue{ .ty = elem_ty, .val = try array_tv.val.elemValue(sema.mod, sema.arena, elem_ptr.index), - } else null; + }; break :blk deref; }, diff --git a/src/TypedValue.zig b/src/TypedValue.zig index b6aee29a4b..9f69e4c8bd 100644 --- a/src/TypedValue.zig +++ b/src/TypedValue.zig @@ -295,6 +295,11 @@ pub fn print( return writer.print(".{s}", .{ty.enumFieldName(val.castTag(.enum_field_index).?.data)}); }, .bytes => return writer.print("\"{}\"", .{std.zig.fmtEscapes(val.castTag(.bytes).?.data)}), + .str_lit => { + const str_lit = val.castTag(.str_lit).?.data; + const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len]; + return writer.print("\"{}\"", .{std.zig.fmtEscapes(bytes)}); + }, .repeated => { if (level == 0) { return writer.writeAll(".{ ... 
}"); diff --git a/src/codegen.zig b/src/codegen.zig index def69d952f..bd556baa5e 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -203,11 +203,23 @@ pub fn generateSymbol( }, .Array => switch (typed_value.val.tag()) { .bytes => { - const payload = typed_value.val.castTag(.bytes).?; + const bytes = typed_value.val.castTag(.bytes).?.data; const len = @intCast(usize, typed_value.ty.arrayLenIncludingSentinel()); // The bytes payload already includes the sentinel, if any try code.ensureUnusedCapacity(len); - code.appendSliceAssumeCapacity(payload.data[0..len]); + code.appendSliceAssumeCapacity(bytes[0..len]); + return Result{ .appended = {} }; + }, + .str_lit => { + const str_lit = typed_value.val.castTag(.str_lit).?.data; + const mod = bin_file.options.module.?; + const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len]; + try code.ensureUnusedCapacity(bytes.len + 1); + code.appendSliceAssumeCapacity(bytes); + if (typed_value.ty.sentinel()) |sent_val| { + const byte = @intCast(u8, sent_val.toUnsignedInt(target)); + code.appendAssumeCapacity(byte); + } return Result{ .appended = {} }; }, .aggregate => { diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 6de001e5fd..ef33f39f55 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -2936,9 +2936,39 @@ pub const DeclGen = struct { return dg.context.constString( bytes.ptr, @intCast(c_uint, tv.ty.arrayLenIncludingSentinel()), - .True, // don't null terminate. bytes has the sentinel, if any. + .True, // Don't null terminate. Bytes has the sentinel, if any. ); }, + .str_lit => { + const str_lit = tv.val.castTag(.str_lit).?.data; + const bytes = dg.module.string_literal_bytes.items[str_lit.index..][0..str_lit.len]; + if (tv.ty.sentinel()) |sent_val| { + const byte = @intCast(u8, sent_val.toUnsignedInt(target)); + if (byte == 0 and bytes.len > 0) { + return dg.context.constString( + bytes.ptr, + @intCast(c_uint, bytes.len), + .False, // Yes, null terminate. + ); + } + var array = std.ArrayList(u8).init(dg.gpa); + defer array.deinit(); + try array.ensureUnusedCapacity(bytes.len + 1); + array.appendSliceAssumeCapacity(bytes); + array.appendAssumeCapacity(byte); + return dg.context.constString( + array.items.ptr, + @intCast(c_uint, array.items.len), + .True, // Don't null terminate. + ); + } else { + return dg.context.constString( + bytes.ptr, + @intCast(c_uint, bytes.len), + .True, // Don't null terminate. `bytes` has the sentinel, if any. + ); + } + }, .aggregate => { const elem_vals = tv.val.castTag(.aggregate).?.data; const elem_ty = tv.ty.elemType(); diff --git a/src/value.zig b/src/value.zig index 1280adf1e0..310384c3c3 100644 --- a/src/value.zig +++ b/src/value.zig @@ -126,6 +126,8 @@ pub const Value = extern union { field_ptr, /// A slice of u8 whose memory is managed externally. bytes, + /// Similar to bytes however it stores an index relative to `Module.string_literal_bytes`. + str_lit, /// This value is repeated some number of times. The amount of times to repeat /// is stored externally. 
repeated, @@ -285,6 +287,7 @@ pub const Value = extern union { .enum_literal, => Payload.Bytes, + .str_lit => Payload.StrLit, .slice => Payload.Slice, .enum_field_index => Payload.U32, @@ -538,6 +541,7 @@ pub const Value = extern union { }; return Value{ .ptr_otherwise = &new_payload.base }; }, + .str_lit => return self.copyPayloadShallow(arena, Payload.StrLit), .repeated, .eu_payload, .opt_payload, @@ -764,6 +768,12 @@ pub const Value = extern union { .enum_literal => return out_stream.print(".{}", .{std.zig.fmtId(val.castTag(.enum_literal).?.data)}), .enum_field_index => return out_stream.print("(enum field {d})", .{val.castTag(.enum_field_index).?.data}), .bytes => return out_stream.print("\"{}\"", .{std.zig.fmtEscapes(val.castTag(.bytes).?.data)}), + .str_lit => { + const str_lit = val.castTag(.str_lit).?.data; + return out_stream.print("(.str_lit index={d} len={d})", .{ + str_lit.index, str_lit.len, + }); + }, .repeated => { try out_stream.writeAll("(repeated) "); val = val.castTag(.repeated).?.data; @@ -824,6 +834,11 @@ pub const Value = extern union { const adjusted_bytes = bytes[0..adjusted_len]; return allocator.dupe(u8, adjusted_bytes); }, + .str_lit => { + const str_lit = val.castTag(.str_lit).?.data; + const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len]; + return allocator.dupe(u8, bytes); + }, .enum_literal => return allocator.dupe(u8, val.castTag(.enum_literal).?.data), .repeated => { const byte = @intCast(u8, val.castTag(.repeated).?.data.toUnsignedInt(target)); @@ -2537,6 +2552,20 @@ pub const Value = extern union { return initPayload(&buffer.base); } }, + .str_lit => { + const str_lit = val.castTag(.str_lit).?.data; + const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len]; + const byte = bytes[index]; + if (arena) |a| { + return Tag.int_u64.create(a, byte); + } else { + buffer.* = .{ + .base = .{ .tag = .int_u64 }, + .data = byte, + }; + return initPayload(&buffer.base); + } + }, // No matter the index; all the elements are the same! .repeated => return val.castTag(.repeated).?.data, @@ -2570,6 +2599,13 @@ pub const Value = extern union { return switch (val.tag()) { .empty_array_sentinel => if (start == 0 and end == 1) val else Value.initTag(.empty_array), .bytes => Tag.bytes.create(arena, val.castTag(.bytes).?.data[start..end]), + .str_lit => { + const str_lit = val.castTag(.str_lit).?.data; + return Tag.str_lit.create(arena, .{ + .index = @intCast(u32, str_lit.index + start), + .len = @intCast(u32, end - start), + }); + }, .aggregate => Tag.aggregate.create(arena, val.castTag(.aggregate).?.data[start..end]), .slice => sliceArray(val.castTag(.slice).?.data.ptr, mod, arena, start, end), @@ -4721,6 +4757,11 @@ pub const Value = extern union { data: []const u8, }; + pub const StrLit = struct { + base: Payload, + data: Module.StringLiteralContext.Key, + }; + pub const Aggregate = struct { base: Payload, /// Field values. The types are according to the struct or array type. diff --git a/test/behavior/eval.zig b/test/behavior/eval.zig index 88be755b61..e0e787509a 100644 --- a/test/behavior/eval.zig +++ b/test/behavior/eval.zig @@ -643,9 +643,18 @@ fn assertEqualPtrs(ptr1: *const u8, ptr2: *const u8) !void { try expect(ptr1 == ptr2); } +// This one is still up for debate in the language specification. +// Application code should not rely on this behavior until it is solidified. 
+// Currently, stage1 has special case code to make this pass for string literals +// but it does not work if the values are constructed with comptime code, or if +// arrays of non-u8 elements are used instead. +// The official language specification might not make this guarantee. However, if +// it does make this guarantee, it will make it consistently for all types, not +// only string literals. This is why stage2 currently has a string table for +// string literals, to match stage1 and pass this test, however the end-game once +// the lang spec issue is settled would be to use a global InternPool for comptime +// memoized objects, making this behavior consistent across all types. test "string literal used as comptime slice is memoized" { - if (builtin.zig_backend != .stage1) return error.SkipZigTest; // TODO - const a = "link"; const b = "link"; comptime try expect(TypeWithCompTimeSlice(a).Node == TypeWithCompTimeSlice(b).Node); From 953e2778d4402cf85b99061ef9bdb89aa3e5f2db Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Tue, 24 May 2022 01:20:18 -0700 Subject: [PATCH 2/2] clean up a compile error test case These should be build-obj, not build-exe, where possible. --- test/cases/llvm/pointer_with_different_address_spaces.zig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/cases/llvm/pointer_with_different_address_spaces.zig b/test/cases/llvm/pointer_with_different_address_spaces.zig index 8fbd8b07cf..26540862ff 100644 --- a/test/cases/llvm/pointer_with_different_address_spaces.zig +++ b/test/cases/llvm/pointer_with_different_address_spaces.zig @@ -1,12 +1,12 @@ fn entry(a: *addrspace(.gs) i32) *addrspace(.fs) i32 { return a; } -pub fn main() void { +export fn entry2() void { _ = entry; } // error -// output_mode=Exe +// output_mode=Obj // backend=stage2,llvm // target=x86_64-linux,x86_64-macos //
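
Appendix (not part of the patch series): a minimal standalone sketch of the adapted-hashmap interning pattern that patch 1/2 introduces in Module.zig and uses from Sema.addStrLit. The names Key, Context, and Adapter below are illustrative stand-ins for StringLiteralContext.Key, StringLiteralContext, and StringLiteralAdapter; the std.HashMapUnmanaged calls and the two-argument @intCast builtin match the 2022-era Zig used in the patch. Keys store only (index, len) into a shared byte buffer, while lookups are done directly with a slice via the adapter, so each distinct string is stored exactly once.

const std = @import("std");

// Interned keys point into a shared byte buffer instead of owning memory.
const Key = struct { index: u32, len: u32 };

// Context used by the table itself: hashes the referenced bytes, compares keys by position.
const Context = struct {
    bytes: *std.ArrayListUnmanaged(u8),
    pub fn hash(self: @This(), k: Key) u64 {
        return std.hash_map.hashString(self.bytes.items[k.index..][0..k.len]);
    }
    pub fn eql(self: @This(), a: Key, b: Key) bool {
        _ = self;
        return a.index == b.index and a.len == b.len;
    }
};

// Adapter used for lookups with a plain slice, before the string has been interned.
const Adapter = struct {
    bytes: *std.ArrayListUnmanaged(u8),
    pub fn hash(self: @This(), s: []const u8) u64 {
        _ = self;
        return std.hash_map.hashString(s);
    }
    pub fn eql(self: @This(), s: []const u8, k: Key) bool {
        return std.mem.eql(u8, s, self.bytes.items[k.index..][0..k.len]);
    }
};

pub fn main() !void {
    var gpa_state = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa_state.deinit();
    const gpa = gpa_state.allocator();

    var bytes: std.ArrayListUnmanaged(u8) = .{};
    defer bytes.deinit(gpa);
    var table: std.HashMapUnmanaged(Key, void, Context, std.hash_map.default_max_load_percentage) = .{};
    defer table.deinit(gpa);

    const inputs = [_][]const u8{ "link", "link", "node" };
    for (inputs) |s| {
        // Reserve buffer space up front (as the patch does) so the append after
        // getOrPut cannot fail and leave a table entry pointing at missing bytes.
        try bytes.ensureUnusedCapacity(gpa, s.len);
        const gop = try table.getOrPutContextAdapted(gpa, s, Adapter{ .bytes = &bytes }, Context{ .bytes = &bytes });
        if (!gop.found_existing) {
            gop.key_ptr.* = .{
                .index = @intCast(u32, bytes.items.len),
                .len = @intCast(u32, s.len),
            };
            bytes.appendSliceAssumeCapacity(s);
        }
        // The second "link" resolves to the same (index, len) pair as the first.
        std.debug.print("\"{s}\" -> bytes[{d}..{d}]\n", .{ s, gop.key_ptr.index, gop.key_ptr.index + gop.key_ptr.len });
    }
}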