Merge pull request #11706 from ziglang/string-literal-interning
stage2: string literal interning
commit 60f0acd9b9
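The core of the change lives in src/Module.zig: all string literal bytes are appended once to string_literal_bytes, and string_literal_table maps (index, len) keys into that buffer to the anonymous Decl that owns the literal. The snippet below is a minimal, standalone sketch of that hash-map pattern, not compiler code: the names Key, Context, Adapter and the example strings are invented, the value type is void instead of Decl.OptionalIndex, and it targets the mid-2022 standard library this commit was written against (two-argument @intCast, unmanaged containers taking an allocator per call).

const std = @import("std");

const Key = struct { index: u32, len: u32 };

// Hashes and compares keys through the shared byte buffer.
const Context = struct {
    bytes: *std.ArrayListUnmanaged(u8),

    pub fn hash(self: @This(), k: Key) u64 {
        return std.hash_map.hashString(self.bytes.items[k.index..][0..k.len]);
    }

    pub fn eql(self: @This(), a: Key, b: Key) bool {
        _ = self;
        return a.index == b.index and a.len == b.len;
    }
};

// Lets a plain slice be used for lookups without copying it into the buffer first.
const Adapter = struct {
    bytes: *std.ArrayListUnmanaged(u8),

    pub fn hash(self: @This(), slice: []const u8) u64 {
        _ = self;
        return std.hash_map.hashString(slice);
    }

    pub fn eql(self: @This(), slice: []const u8, k: Key) bool {
        return std.mem.eql(u8, slice, self.bytes.items[k.index..][0..k.len]);
    }
};

pub fn main() !void {
    var gpa_state = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa_state.deinit();
    const gpa = gpa_state.allocator();

    // All interned string bytes live in one flat buffer.
    var bytes: std.ArrayListUnmanaged(u8) = .{};
    defer bytes.deinit(gpa);

    // Keyed by (index, len) into `bytes`; the compiler stores Decl.OptionalIndex
    // as the value, but `void` keeps the sketch small.
    var table: std.HashMapUnmanaged(Key, void, Context, std.hash_map.default_max_load_percentage) = .{};
    defer table.deinit(gpa);

    for ([_][]const u8{ "hello", "world", "hello" }) |s| {
        // Reserve space up front so the append below cannot fail after the
        // table entry has already been created.
        try bytes.ensureUnusedCapacity(gpa, s.len);
        const gop = try table.getOrPutContextAdapted(
            gpa,
            s,
            Adapter{ .bytes = &bytes },
            Context{ .bytes = &bytes },
        );
        if (!gop.found_existing) {
            gop.key_ptr.* = .{
                .index = @intCast(u32, bytes.items.len),
                .len = @intCast(u32, s.len),
            };
            bytes.appendSliceAssumeCapacity(s);
        }
        // The second "hello" finds the existing entry and reuses its key.
        std.debug.print("{s}: index={d} len={d} existing={}\n", .{
            s, gop.key_ptr.index, gop.key_ptr.len, gop.found_existing,
        });
    }
}

Reserving capacity in the byte buffer before calling getOrPutContextAdapted mirrors addStrLit below: once the table slot exists, appending the bytes cannot fail, so a stored key can never point at bytes that were never written.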
@@ -70,6 +70,17 @@ import_table: std.StringArrayHashMapUnmanaged(*File) = .{},
/// Keys are fully resolved file paths. This table owns the keys and values.
embed_table: std.StringHashMapUnmanaged(*EmbedFile) = .{},

/// This is a temporary addition to stage2 in order to match stage1 behavior,
/// however the end-game once the lang spec is settled will be to use a global
/// InternPool for comptime memoized objects, making this behavior consistent across all types,
/// not only string literals. Or, we might decide to not guarantee string literals
/// to have equal comptime pointers, in which case this field can be deleted (perhaps
/// the commit that introduced it can simply be reverted).
/// This table uses an optional index so that when a Decl is destroyed, the string literal
/// is still reclaimable by a future Decl.
string_literal_table: std.HashMapUnmanaged(StringLiteralContext.Key, Decl.OptionalIndex, StringLiteralContext, std.hash_map.default_max_load_percentage) = .{},
string_literal_bytes: std.ArrayListUnmanaged(u8) = .{},

/// The set of all the generic function instantiations. This is used so that when a generic
/// function is called twice with the same comptime parameter arguments, both calls dispatch
/// to the same function.
@@ -157,6 +168,39 @@ decls_free_list: std.ArrayListUnmanaged(Decl.Index) = .{},

global_assembly: std.AutoHashMapUnmanaged(Decl.Index, []u8) = .{},

pub const StringLiteralContext = struct {
bytes: *std.ArrayListUnmanaged(u8),

pub const Key = struct {
index: u32,
len: u32,
};

pub fn eql(self: @This(), a: Key, b: Key) bool {
_ = self;
return a.index == b.index and a.len == b.len;
}

pub fn hash(self: @This(), x: Key) u64 {
const x_slice = self.bytes.items[x.index..][0..x.len];
return std.hash_map.hashString(x_slice);
}
};

pub const StringLiteralAdapter = struct {
bytes: *std.ArrayListUnmanaged(u8),

pub fn eql(self: @This(), a_slice: []const u8, b: StringLiteralContext.Key) bool {
const b_slice = self.bytes.items[b.index..][0..b.len];
return mem.eql(u8, a_slice, b_slice);
}

pub fn hash(self: @This(), adapted_key: []const u8) u64 {
_ = self;
return std.hash_map.hashString(adapted_key);
}
};

const MonomorphedFuncsSet = std.HashMapUnmanaged(
*Fn,
void,
@@ -507,7 +551,8 @@ pub const Decl = struct {
decl.name = undefined;
}

pub fn clearValues(decl: *Decl, gpa: Allocator) void {
pub fn clearValues(decl: *Decl, mod: *Module) void {
const gpa = mod.gpa;
if (decl.getExternFn()) |extern_fn| {
extern_fn.deinit(gpa);
gpa.destroy(extern_fn);
@@ -521,6 +566,13 @@ pub const Decl = struct {
gpa.destroy(variable);
}
if (decl.value_arena) |arena_state| {
if (decl.owns_tv) {
if (decl.val.castTag(.str_lit)) |str_lit| {
mod.string_literal_table.getPtrContext(str_lit.data, .{
.bytes = &mod.string_literal_bytes,
}).?.* = .none;
}
}
arena_state.promote(gpa).deinit();
decl.value_arena = null;
decl.has_tv = false;
@@ -2839,6 +2891,9 @@ pub fn deinit(mod: *Module) void {
mod.decls_free_list.deinit(gpa);
mod.allocated_decls.deinit(gpa);
mod.global_assembly.deinit(gpa);

mod.string_literal_table.deinit(gpa);
mod.string_literal_bytes.deinit(gpa);
}

pub fn destroyDecl(mod: *Module, decl_index: Decl.Index) void {
@@ -2857,7 +2912,7 @@ pub fn destroyDecl(mod: *Module, decl_index: Decl.Index) void {
if (decl.getInnerNamespace()) |namespace| {
namespace.destroyDecls(mod);
}
decl.clearValues(gpa);
decl.clearValues(mod);
}
decl.dependants.deinit(gpa);
decl.dependencies.deinit(gpa);
@@ -4034,7 +4089,7 @@ fn semaDecl(mod: *Module, decl_index: Decl.Index) !bool {
if (decl.getFunction()) |prev_func| {
prev_is_inline = prev_func.state == .inline_only;
}
decl.clearValues(gpa);
decl.clearValues(mod);
}

decl.ty = try decl_tv.ty.copy(decl_arena_allocator);
@@ -4080,7 +4135,7 @@ fn semaDecl(mod: *Module, decl_index: Decl.Index) !bool {
var type_changed = true;
if (decl.has_tv) {
type_changed = !decl.ty.eql(decl_tv.ty, mod);
decl.clearValues(gpa);
decl.clearValues(mod);
}

decl.owns_tv = false;
@@ -4694,7 +4749,7 @@ pub fn clearDecl(
if (decl.getInnerNamespace()) |namespace| {
try namespace.deleteAllDecls(mod, outdated_decls);
}
decl.clearValues(gpa);
decl.clearValues(mod);
}

if (decl.deletion_flag) {
@@ -5623,7 +5678,7 @@ pub fn populateTestFunctions(mod: *Module) !void {

// Since we are replacing the Decl's value we must perform cleanup on the
// previous value.
decl.clearValues(gpa);
decl.clearValues(mod);
decl.ty = new_ty;
decl.val = new_val;
decl.has_tv = true;
src/Sema.zig
@@ -3842,20 +3842,44 @@ fn zirStr(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Ins
fn addStrLit(sema: *Sema, block: *Block, zir_bytes: []const u8) CompileError!Air.Inst.Ref {
// `zir_bytes` references memory inside the ZIR module, which can get deallocated
// after semantic analysis is complete, for example in the case of the initialization
// expression of a variable declaration. We need the memory to be in the new
// anonymous Decl's arena.
var anon_decl = try block.startAnonDecl(LazySrcLoc.unneeded);
defer anon_decl.deinit();
// expression of a variable declaration.
const mod = sema.mod;
const gpa = sema.gpa;
const string_bytes = &mod.string_literal_bytes;
const StringLiteralAdapter = Module.StringLiteralAdapter;
const StringLiteralContext = Module.StringLiteralContext;
try string_bytes.ensureUnusedCapacity(gpa, zir_bytes.len);
const gop = try mod.string_literal_table.getOrPutContextAdapted(gpa, zir_bytes, StringLiteralAdapter{
.bytes = string_bytes,
}, StringLiteralContext{
.bytes = string_bytes,
});
if (!gop.found_existing) {
gop.key_ptr.* = .{
.index = @intCast(u32, string_bytes.items.len),
.len = @intCast(u32, zir_bytes.len),
};
string_bytes.appendSliceAssumeCapacity(zir_bytes);
gop.value_ptr.* = .none;
}
const decl_index = gop.value_ptr.unwrap() orelse di: {
var anon_decl = try block.startAnonDecl(LazySrcLoc.unneeded);
defer anon_decl.deinit();

const bytes = try anon_decl.arena().dupeZ(u8, zir_bytes);
const decl_index = try anon_decl.finish(
try Type.Tag.array_u8_sentinel_0.create(anon_decl.arena(), gop.key_ptr.len),
try Value.Tag.str_lit.create(anon_decl.arena(), gop.key_ptr.*),
0, // default alignment
);

const new_decl = try anon_decl.finish(
try Type.Tag.array_u8_sentinel_0.create(anon_decl.arena(), bytes.len),
try Value.Tag.bytes.create(anon_decl.arena(), bytes[0 .. bytes.len + 1]),
0, // default alignment
);
// Needed so that `Decl.clearValues` will additionally set the corresponding
// string literal table value back to `Decl.OptionalIndex.none`.
mod.declPtr(decl_index).owns_tv = true;

return sema.analyzeDeclRef(new_decl);
gop.value_ptr.* = decl_index.toOptional();
break :di decl_index;
};
return sema.analyzeDeclRef(decl_index);
}

fn zirInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
@@ -19762,6 +19786,35 @@ fn beginComptimePtrMutation(
.ty = elem_ty,
};
},
.str_lit => {
// An array is memory-optimized to store a slice of bytes, but we are about
// to modify an individual field and the representation has to change.
// If we wanted to avoid this, there would need to be special detection
// elsewhere to identify when writing a value to an array element that is stored
// using the `str_lit` tag, and handle it without making a call to this function.
const arena = parent.beginArena(sema.mod);
defer parent.finishArena(sema.mod);

const str_lit = parent.val.castTag(.str_lit).?.data;
const dest_len = parent.ty.arrayLenIncludingSentinel();
const bytes = sema.mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len];
const elems = try arena.alloc(Value, @intCast(usize, dest_len));
for (bytes) |byte, i| {
elems[i] = try Value.Tag.int_u64.create(arena, byte);
}
if (parent.ty.sentinel()) |sent_val| {
assert(elems.len == bytes.len + 1);
elems[bytes.len] = sent_val;
}

parent.val.* = try Value.Tag.aggregate.create(arena, elems);

return ComptimePtrMutationKit{
.decl_ref_mut = parent.decl_ref_mut,
.val = &elems[elem_ptr.index],
.ty = elem_ty,
};
},
.repeated => {
// An array is memory-optimized to store only a single element value, and
// that value is understood to be the same for the entire length of the array.
@@ -20097,10 +20150,23 @@ fn beginComptimePtrLoad(
}
}

deref.pointee = if (elem_ptr.index < check_len) TypedValue{
if (elem_ptr.index >= check_len) {
deref.pointee = null;
break :blk deref;
}
if (elem_ptr.index == check_len - 1) {
if (array_tv.ty.sentinel()) |sent| {
deref.pointee = TypedValue{
.ty = elem_ty,
.val = sent,
};
break :blk deref;
}
}
deref.pointee = TypedValue{
.ty = elem_ty,
.val = try array_tv.val.elemValue(sema.mod, sema.arena, elem_ptr.index),
} else null;
};
break :blk deref;
},
@@ -295,6 +295,11 @@ pub fn print(
return writer.print(".{s}", .{ty.enumFieldName(val.castTag(.enum_field_index).?.data)});
},
.bytes => return writer.print("\"{}\"", .{std.zig.fmtEscapes(val.castTag(.bytes).?.data)}),
.str_lit => {
const str_lit = val.castTag(.str_lit).?.data;
const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len];
return writer.print("\"{}\"", .{std.zig.fmtEscapes(bytes)});
},
.repeated => {
if (level == 0) {
return writer.writeAll(".{ ... }");
@@ -203,11 +203,23 @@ pub fn generateSymbol(
},
.Array => switch (typed_value.val.tag()) {
.bytes => {
const payload = typed_value.val.castTag(.bytes).?;
const bytes = typed_value.val.castTag(.bytes).?.data;
const len = @intCast(usize, typed_value.ty.arrayLenIncludingSentinel());
// The bytes payload already includes the sentinel, if any
try code.ensureUnusedCapacity(len);
code.appendSliceAssumeCapacity(payload.data[0..len]);
code.appendSliceAssumeCapacity(bytes[0..len]);
return Result{ .appended = {} };
},
.str_lit => {
const str_lit = typed_value.val.castTag(.str_lit).?.data;
const mod = bin_file.options.module.?;
const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len];
try code.ensureUnusedCapacity(bytes.len + 1);
code.appendSliceAssumeCapacity(bytes);
if (typed_value.ty.sentinel()) |sent_val| {
const byte = @intCast(u8, sent_val.toUnsignedInt(target));
code.appendAssumeCapacity(byte);
}
return Result{ .appended = {} };
},
.aggregate => {
@@ -2936,9 +2936,39 @@ pub const DeclGen = struct {
return dg.context.constString(
bytes.ptr,
@intCast(c_uint, tv.ty.arrayLenIncludingSentinel()),
.True, // don't null terminate. bytes has the sentinel, if any.
.True, // Don't null terminate. Bytes has the sentinel, if any.
);
},
.str_lit => {
const str_lit = tv.val.castTag(.str_lit).?.data;
const bytes = dg.module.string_literal_bytes.items[str_lit.index..][0..str_lit.len];
if (tv.ty.sentinel()) |sent_val| {
const byte = @intCast(u8, sent_val.toUnsignedInt(target));
if (byte == 0 and bytes.len > 0) {
return dg.context.constString(
bytes.ptr,
@intCast(c_uint, bytes.len),
.False, // Yes, null terminate.
);
}
var array = std.ArrayList(u8).init(dg.gpa);
defer array.deinit();
try array.ensureUnusedCapacity(bytes.len + 1);
array.appendSliceAssumeCapacity(bytes);
array.appendAssumeCapacity(byte);
return dg.context.constString(
array.items.ptr,
@intCast(c_uint, array.items.len),
.True, // Don't null terminate.
);
} else {
return dg.context.constString(
bytes.ptr,
@intCast(c_uint, bytes.len),
.True, // Don't null terminate. `bytes` has the sentinel, if any.
);
}
},
.aggregate => {
const elem_vals = tv.val.castTag(.aggregate).?.data;
const elem_ty = tv.ty.elemType();
@@ -126,6 +126,8 @@ pub const Value = extern union {
field_ptr,
/// A slice of u8 whose memory is managed externally.
bytes,
/// Similar to bytes however it stores an index relative to `Module.string_literal_bytes`.
str_lit,
/// This value is repeated some number of times. The amount of times to repeat
/// is stored externally.
repeated,
@@ -285,6 +287,7 @@ pub const Value = extern union {
.enum_literal,
=> Payload.Bytes,

.str_lit => Payload.StrLit,
.slice => Payload.Slice,

.enum_field_index => Payload.U32,
@@ -538,6 +541,7 @@ pub const Value = extern union {
};
return Value{ .ptr_otherwise = &new_payload.base };
},
.str_lit => return self.copyPayloadShallow(arena, Payload.StrLit),
.repeated,
.eu_payload,
.opt_payload,
@@ -764,6 +768,12 @@ pub const Value = extern union {
.enum_literal => return out_stream.print(".{}", .{std.zig.fmtId(val.castTag(.enum_literal).?.data)}),
.enum_field_index => return out_stream.print("(enum field {d})", .{val.castTag(.enum_field_index).?.data}),
.bytes => return out_stream.print("\"{}\"", .{std.zig.fmtEscapes(val.castTag(.bytes).?.data)}),
.str_lit => {
const str_lit = val.castTag(.str_lit).?.data;
return out_stream.print("(.str_lit index={d} len={d})", .{
str_lit.index, str_lit.len,
});
},
.repeated => {
try out_stream.writeAll("(repeated) ");
val = val.castTag(.repeated).?.data;
@@ -824,6 +834,11 @@ pub const Value = extern union {
const adjusted_bytes = bytes[0..adjusted_len];
return allocator.dupe(u8, adjusted_bytes);
},
.str_lit => {
const str_lit = val.castTag(.str_lit).?.data;
const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len];
return allocator.dupe(u8, bytes);
},
.enum_literal => return allocator.dupe(u8, val.castTag(.enum_literal).?.data),
.repeated => {
const byte = @intCast(u8, val.castTag(.repeated).?.data.toUnsignedInt(target));
@@ -2537,6 +2552,20 @@ pub const Value = extern union {
return initPayload(&buffer.base);
}
},
.str_lit => {
const str_lit = val.castTag(.str_lit).?.data;
const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len];
const byte = bytes[index];
if (arena) |a| {
return Tag.int_u64.create(a, byte);
} else {
buffer.* = .{
.base = .{ .tag = .int_u64 },
.data = byte,
};
return initPayload(&buffer.base);
}
},

// No matter the index; all the elements are the same!
.repeated => return val.castTag(.repeated).?.data,
@@ -2570,6 +2599,13 @@ pub const Value = extern union {
return switch (val.tag()) {
.empty_array_sentinel => if (start == 0 and end == 1) val else Value.initTag(.empty_array),
.bytes => Tag.bytes.create(arena, val.castTag(.bytes).?.data[start..end]),
.str_lit => {
const str_lit = val.castTag(.str_lit).?.data;
return Tag.str_lit.create(arena, .{
.index = @intCast(u32, str_lit.index + start),
.len = @intCast(u32, end - start),
});
},
.aggregate => Tag.aggregate.create(arena, val.castTag(.aggregate).?.data[start..end]),
.slice => sliceArray(val.castTag(.slice).?.data.ptr, mod, arena, start, end),
@@ -4721,6 +4757,11 @@ pub const Value = extern union {
data: []const u8,
};

pub const StrLit = struct {
base: Payload,
data: Module.StringLiteralContext.Key,
};

pub const Aggregate = struct {
base: Payload,
/// Field values. The types are according to the struct or array type.
@@ -643,9 +643,18 @@ fn assertEqualPtrs(ptr1: *const u8, ptr2: *const u8) !void {
try expect(ptr1 == ptr2);
}

// This one is still up for debate in the language specification.
// Application code should not rely on this behavior until it is solidified.
// Currently, stage1 has special case code to make this pass for string literals
// but it does not work if the values are constructed with comptime code, or if
// arrays of non-u8 elements are used instead.
// The official language specification might not make this guarantee. However, if
// it does make this guarantee, it will make it consistently for all types, not
// only string literals. This is why stage2 currently has a string table for
// string literals, to match stage1 and pass this test, however the end-game once
// the lang spec issue is settled would be to use a global InternPool for comptime
// memoized objects, making this behavior consistent across all types.
test "string literal used as comptime slice is memoized" {
if (builtin.zig_backend != .stage1) return error.SkipZigTest; // TODO

const a = "link";
const b = "link";
comptime try expect(TypeWithCompTimeSlice(a).Node == TypeWithCompTimeSlice(b).Node);
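TypeWithCompTimeSlice is defined elsewhere in the test file and is not part of this hunk; the snippet below is a hypothetical stand-in for it, included only to make the comment above concrete. Generic instantiations are memoized on their comptime arguments, so the two calls return the same type only if the two "link" literals resolve to one comptime value, which is what the new string literal table provides in stage2.

const std = @import("std");
const expect = std.testing.expect;

// Hypothetical helper, not the one from the test file: a generic function
// whose comptime parameter is a string slice.
fn TypeWithCompTimeSlice(comptime name: []const u8) type {
    return struct {
        pub const Node = struct {
            tag: []const u8 = name,
        };
    };
}

test "identical string literals yield one instantiation (illustrative)" {
    const a = "link";
    const b = "link";
    // With interning, `a` and `b` refer to one memoized literal, so both
    // instantiations dispatch to the same type.
    comptime try expect(TypeWithCompTimeSlice(a).Node == TypeWithCompTimeSlice(b).Node);
}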
@@ -1,12 +1,12 @@
fn entry(a: *addrspace(.gs) i32) *addrspace(.fs) i32 {
return a;
}
pub fn main() void {
export fn entry2() void {
_ = entry;
}

// error
// output_mode=Exe
// output_mode=Obj
// backend=stage2,llvm
// target=x86_64-linux,x86_64-macos
//