LLVM backend: optimize memset with comptime-known element

When the element is comptime-known, we can check whether its in-memory
representation is a single byte value repeated. If it is, `@memset` can be
lowered to the LLVM memset intrinsic rather than to a loop.
This commit is contained in:
Andrew Kelley 2023-04-26 13:41:02 -07:00
parent 51adbf472b
commit 9295355985
4 changed files with 83 additions and 19 deletions

View File

@ -26953,9 +26953,11 @@ fn storePtrVal(
defer sema.gpa.free(buffer);
reinterpret.val_ptr.*.writeToMemory(mut_kit.ty, sema.mod, buffer) catch |err| switch (err) {
error.ReinterpretDeclRef => unreachable,
error.IllDefinedMemoryLayout => unreachable, // Sema was supposed to emit a compile error already
};
operand_val.writeToMemory(operand_ty, sema.mod, buffer[reinterpret.byte_offset..]) catch |err| switch (err) {
error.ReinterpretDeclRef => unreachable,
error.IllDefinedMemoryLayout => unreachable, // Sema was supposed to emit a compile error already
};
const arena = mut_kit.beginArena(sema.mod);
@ -27905,6 +27907,7 @@ fn bitCastVal(
defer sema.gpa.free(buffer);
val.writeToMemory(old_ty, sema.mod, buffer) catch |err| switch (err) {
error.ReinterpretDeclRef => return null,
error.IllDefinedMemoryLayout => unreachable, // Sema was supposed to emit a compile error already
};
return try Value.readFromMemory(new_ty, sema.mod, buffer[buffer_offset..], sema.arena);
}

View File

@ -8424,28 +8424,45 @@ pub const FuncGen = struct {
const dest_slice = try self.resolveInst(bin_op.lhs);
const ptr_ty = self.air.typeOf(bin_op.lhs);
const elem_ty = self.air.typeOf(bin_op.rhs);
const target = self.dg.module.getTarget();
const val_is_undef = if (self.air.value(bin_op.rhs)) |val| val.isUndefDeep() else false;
const module = self.dg.module;
const target = module.getTarget();
const dest_ptr_align = ptr_ty.ptrAlignment(target);
const u8_llvm_ty = self.context.intType(8);
const dest_ptr = self.sliceOrArrayPtr(dest_slice, ptr_ty);
const is_volatile = ptr_ty.isVolatilePtr();
if (val_is_undef) {
// Even if safety is disabled, we still emit a memset to undefined since it conveys
// extra information to LLVM. However, safety makes the difference between using
// 0xaa or actual undefined for the fill byte.
const fill_byte = if (safety)
u8_llvm_ty.constInt(0xaa, .False)
else
u8_llvm_ty.getUndef();
const len = self.sliceOrArrayLenInBytes(dest_slice, ptr_ty);
_ = self.builder.buildMemSet(dest_ptr, fill_byte, len, dest_ptr_align, is_volatile);
if (self.air.value(bin_op.rhs)) |elem_val| {
if (elem_val.isUndefDeep()) {
// Even if safety is disabled, we still emit a memset to undefined since it conveys
// extra information to LLVM. However, safety makes the difference between using
// 0xaa or actual undefined for the fill byte.
const fill_byte = if (safety)
u8_llvm_ty.constInt(0xaa, .False)
else
u8_llvm_ty.getUndef();
const len = self.sliceOrArrayLenInBytes(dest_slice, ptr_ty);
_ = self.builder.buildMemSet(dest_ptr, fill_byte, len, dest_ptr_align, is_volatile);
if (safety and self.dg.module.comp.bin_file.options.valgrind) {
self.valgrindMarkUndef(dest_ptr, len);
if (safety and module.comp.bin_file.options.valgrind) {
self.valgrindMarkUndef(dest_ptr, len);
}
return null;
}
// Test if the element value is compile-time known to be a
// repeating byte pattern, for example, `@as(u64, 0)` has a
// repeating byte pattern of 0 bytes. In such case, the memset
// intrinsic can be used.
var value_buffer: Value.Payload.U64 = undefined;
if (try elem_val.hasRepeatedByteRepr(elem_ty, module, &value_buffer)) |byte_val| {
const fill_byte = try self.resolveValue(.{
.ty = Type.u8,
.val = byte_val,
});
const len = self.sliceOrArrayLenInBytes(dest_slice, ptr_ty);
_ = self.builder.buildMemSet(dest_ptr, fill_byte, len, dest_ptr_align, is_volatile);
return null;
}
return null;
}
const value = try self.resolveInst(bin_op.rhs);

View File

@ -1278,7 +1278,10 @@ pub const Value = extern union {
///
/// Asserts that buffer.len >= ty.abiSize(). The buffer is allowed to extend past
/// the end of the value in memory.
pub fn writeToMemory(val: Value, ty: Type, mod: *Module, buffer: []u8) error{ReinterpretDeclRef}!void {
pub fn writeToMemory(val: Value, ty: Type, mod: *Module, buffer: []u8) error{
ReinterpretDeclRef,
IllDefinedMemoryLayout,
}!void {
const target = mod.getTarget();
const endian = target.cpu.arch.endian();
if (val.isUndef()) {
@ -1345,7 +1348,7 @@ pub const Value = extern union {
return writeToPackedMemory(val, ty, mod, buffer[0..byte_count], 0);
},
.Struct => switch (ty.containerLayout()) {
.Auto => unreachable, // Sema is supposed to have emitted a compile error already
.Auto => return error.IllDefinedMemoryLayout,
.Extern => {
const fields = ty.structFields().values();
const field_vals = val.castTag(.aggregate).?.data;
@ -1366,7 +1369,7 @@ pub const Value = extern union {
std.mem.writeInt(Int, buffer[0..@sizeOf(Int)], @intCast(Int, int), endian);
},
.Union => switch (ty.containerLayout()) {
.Auto => unreachable,
.Auto => return error.IllDefinedMemoryLayout,
.Extern => @panic("TODO implement writeToMemory for extern unions"),
.Packed => {
const byte_count = (@intCast(usize, ty.bitSize(target)) + 7) / 8;
@ -5381,6 +5384,35 @@ pub const Value = extern union {
}
}
/// If the value is represented in-memory as a series of bytes that all
/// have the same value, return that byte value, otherwise null.
///
/// The value is serialized with `writeToMemory` into a temporary buffer
/// allocated from `mod.gpa`; the buffer is freed before returning.
///
/// Returns null (rather than an error) when:
///  * the ABI size of `ty` does not fit in the host's `usize`,
///  * `writeToMemory` reports `error.ReinterpretDeclRef` or
///    `error.IllDefinedMemoryLayout`,
///  * the serialized bytes are not all equal.
///
/// Allocation failure is propagated to the caller.
///
/// The returned `Value` is built from a pointer into `value_buffer`, so
/// `value_buffer` must remain valid for as long as the result is used.
///
/// Asserts that the ABI size of `ty` is at least 1.
pub fn hasRepeatedByteRepr(val: Value, ty: Type, mod: *Module, value_buffer: *Payload.U64) !?Value {
    const target = mod.getTarget();
    // The ABI size describes the *target*, so it is converted to the host's
    // `usize` with a checked cast before being used as an allocation length.
    // A type too large to buffer on the host simply does not get the
    // optimization.
    const abi_size = std.math.cast(usize, ty.abiSize(target)) orelse return null;
    assert(abi_size >= 1);

    const byte_buffer = try mod.gpa.alloc(u8, abi_size);
    defer mod.gpa.free(byte_buffer);

    writeToMemory(val, ty, mod, byte_buffer) catch |err| switch (err) {
        error.ReinterpretDeclRef => return null,
        // TODO: The writeToMemory function was originally created for the purpose
        // of comptime pointer casting. However, it is now additionally being used
        // for checking the actual memory layout that will be generated by machine
        // code late in compilation. So, this error handling is too aggressive and
        // causes some false negatives, causing less-than-ideal code generation.
        error.IllDefinedMemoryLayout => return null,
    };

    // Every remaining byte must match the first one; `abi_size >= 1` makes
    // index 0 valid, and a one-byte value trivially qualifies.
    const first_byte = byte_buffer[0];
    if (!std.mem.allEqual(u8, byte_buffer[1..], first_byte)) return null;

    value_buffer.* = .{
        .base = .{ .tag = .int_u64 },
        .data = first_byte,
    };
    return initPayload(&value_buffer.base);
}
/// This type is not copyable since it may contain pointers to its inner data.
pub const Payload = struct {
tag: Tag,

View File

@ -94,7 +94,7 @@ test "memset with 1-byte array element" {
try expect(buf[4][0]);
}
test "memset with large array element" {
test "memset with large array element, runtime known" {
const A = [128]u64;
var buf: [5]A = undefined;
var runtime_known_element = [_]u64{0} ** 128;
@ -106,6 +106,18 @@ test "memset with large array element" {
for (buf[4]) |elem| try expect(elem == 0);
}
// Fills an array of large (128 x u64) elements using a comptime-known,
// all-zero element, then verifies that every word of every element is zero.
test "memset with large array element, comptime known" {
    const Row = [128]u64;
    const zero_row = [_]u64{0} ** 128;

    var rows: [5]Row = undefined;
    @memset(&rows, zero_row);

    for (rows) |row| {
        for (row) |word| try expect(word == 0);
    }
}
test "memcpy and memset intrinsics" {
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;