From bf89482f23aae3524c92d385d6d8241d065d1031 Mon Sep 17 00:00:00 2001 From: dweiller <4678790+dweiller@users.noreply.github.com> Date: Mon, 20 Jan 2025 17:16:24 +1100 Subject: [PATCH 1/3] compiler-rt: move memmove back to memmove.zig --- lib/compiler_rt.zig | 1 + lib/compiler_rt/memcpy.zig | 140 +---------------------------------- lib/compiler_rt/memmove.zig | 144 ++++++++++++++++++++++++++++++++++++ 3 files changed, 147 insertions(+), 138 deletions(-) create mode 100644 lib/compiler_rt/memmove.zig diff --git a/lib/compiler_rt.zig b/lib/compiler_rt.zig index 1369e0a7e5..82aeb7f88e 100644 --- a/lib/compiler_rt.zig +++ b/lib/compiler_rt.zig @@ -233,6 +233,7 @@ comptime { _ = @import("compiler_rt/memcpy.zig"); _ = @import("compiler_rt/memset.zig"); + _ = @import("compiler_rt/memmove.zig"); _ = @import("compiler_rt/memcmp.zig"); _ = @import("compiler_rt/bcmp.zig"); _ = @import("compiler_rt/ssp.zig"); diff --git a/lib/compiler_rt/memcpy.zig b/lib/compiler_rt/memcpy.zig index 614bcc7182..82c41c101e 100644 --- a/lib/compiler_rt/memcpy.zig +++ b/lib/compiler_rt/memcpy.zig @@ -5,145 +5,9 @@ const builtin = @import("builtin"); comptime { if (builtin.object_format != .c) { @export(&memcpy, .{ .name = "memcpy", .linkage = common.linkage, .visibility = common.visibility }); - @export(&memmove, .{ .name = "memmove", .linkage = common.linkage, .visibility = common.visibility }); } } -fn memcpy(noalias opt_dest: ?[*]u8, noalias opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { - return memmove(opt_dest, opt_src, len); -} - -fn memmove(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { - // a port of https://github.com/facebook/folly/blob/1c8bc50e88804e2a7361a57cd9b551dd10f6c5fd/folly/memcpy.S - if (len == 0) { - @branchHint(.unlikely); - return opt_dest; - } - - const dest = opt_dest.?; - const src = opt_src.?; - - if (len < 8) { - @branchHint(.unlikely); - if (len == 1) { - @branchHint(.unlikely); - dest[0] = src[0]; - } else if (len >= 4) { - @branchHint(.unlikely); - blockCopy(dest, src, 4, len); - } else { - blockCopy(dest, src, 2, len); - } - return dest; - } - - if (len > 32) { - @branchHint(.unlikely); - if (len > 256) { - @branchHint(.unlikely); - copyMove(dest, src, len); - return dest; - } - copyLong(dest, src, len); - return dest; - } - - if (len > 16) { - @branchHint(.unlikely); - blockCopy(dest, src, 16, len); - return dest; - } - - blockCopy(dest, src, 8, len); - - return dest; -} - -inline fn blockCopy(dest: [*]u8, src: [*]const u8, block_size: comptime_int, len: usize) void { - const first = @as(*align(1) const @Vector(block_size, u8), src[0..block_size]).*; - const second = @as(*align(1) const @Vector(block_size, u8), src[len - block_size ..][0..block_size]).*; - dest[0..block_size].* = first; - dest[len - block_size ..][0..block_size].* = second; -} - -inline fn copyLong(dest: [*]u8, src: [*]const u8, len: usize) void { - var array: [8]@Vector(32, u8) = undefined; - - inline for (.{ 64, 128, 192, 256 }, 0..) |N, i| { - array[i * 2] = src[(N / 2) - 32 ..][0..32].*; - array[(i * 2) + 1] = src[len - N / 2 ..][0..32].*; - - if (len <= N) { - @branchHint(.unlikely); - for (0..i + 1) |j| { - dest[j * 32 ..][0..32].* = array[j * 2]; - dest[len - ((j * 32) + 32) ..][0..32].* = array[(j * 2) + 1]; - } - return; - } - } -} - -inline fn copyMove(dest: [*]u8, src: [*]const u8, len: usize) void { - if (@intFromPtr(src) >= @intFromPtr(dest)) { - @branchHint(.unlikely); - copyForward(dest, src, len); - } else if (@intFromPtr(src) + len > @intFromPtr(dest)) { - @branchHint(.unlikely); - overlapBwd(dest, src, len); - } else { - copyForward(dest, src, len); - } -} - -inline fn copyForward(dest: [*]u8, src: [*]const u8, len: usize) void { - const tail: @Vector(32, u8) = src[len - 32 ..][0..32].*; - - const N: usize = len & ~@as(usize, 127); - var i: usize = 0; - - while (i < N) : (i += 128) { - dest[i..][0..32].* = src[i..][0..32].*; - dest[i + 32 ..][0..32].* = src[i + 32 ..][0..32].*; - dest[i + 64 ..][0..32].* = src[i + 64 ..][0..32].*; - dest[i + 96 ..][0..32].* = src[i + 96 ..][0..32].*; - } - - if (len - i <= 32) { - @branchHint(.unlikely); - dest[len - 32 ..][0..32].* = tail; - } else { - copyLong(dest[i..], src[i..], len - i); - } -} - -inline fn overlapBwd(dest: [*]u8, src: [*]const u8, len: usize) void { - var array: [5]@Vector(32, u8) = undefined; - array[0] = src[len - 32 ..][0..32].*; - inline for (1..5) |i| array[i] = src[(i - 1) << 5 ..][0..32].*; - - const end: usize = (@intFromPtr(dest) + len - 32) & 31; - const range = len - end; - var s = src + range; - var d = dest + range; - - while (@intFromPtr(s) > @intFromPtr(src + 128)) { - // zig fmt: off - const first = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 32)).*; - const second = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 64)).*; - const third = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 96)).*; - const fourth = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 128)).*; - - @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 32))).* = first; - @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 64))).* = second; - @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 96))).* = third; - @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 128))).* = fourth; - // zig fmt: on - - s -= 128; - d -= 128; - } - - inline for (array[1..], 0..) |vec, i| dest[i * 32 ..][0..32].* = vec; - dest[len - 32 ..][0..32].* = array[0]; +fn memcpy(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { + return @call(.always_inline, @import("memmove.zig").memmove, .{ opt_dest, opt_src, len }); } diff --git a/lib/compiler_rt/memmove.zig b/lib/compiler_rt/memmove.zig new file mode 100644 index 0000000000..f16abc0e7b --- /dev/null +++ b/lib/compiler_rt/memmove.zig @@ -0,0 +1,144 @@ +const std = @import("std"); +const common = @import("./common.zig"); +const builtin = @import("builtin"); + +comptime { + if (builtin.object_format != .c) { + @export(&memmove, .{ .name = "memmove", .linkage = common.linkage, .visibility = common.visibility }); + } +} + +pub fn memmove(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { + // a port of https://github.com/facebook/folly/blob/1c8bc50e88804e2a7361a57cd9b551dd10f6c5fd/folly/memcpy.S + if (len == 0) { + @branchHint(.unlikely); + return opt_dest; + } + + const dest = opt_dest.?; + const src = opt_src.?; + + if (len < 8) { + @branchHint(.unlikely); + if (len == 1) { + @branchHint(.unlikely); + dest[0] = src[0]; + } else if (len >= 4) { + @branchHint(.unlikely); + blockCopy(dest, src, 4, len); + } else { + blockCopy(dest, src, 2, len); + } + return dest; + } + + if (len > 32) { + @branchHint(.unlikely); + if (len > 256) { + @branchHint(.unlikely); + copyMove(dest, src, len); + return dest; + } + copyLong(dest, src, len); + return dest; + } + + if (len > 16) { + @branchHint(.unlikely); + blockCopy(dest, src, 16, len); + return dest; + } + + blockCopy(dest, src, 8, len); + + return dest; +} + +inline fn blockCopy(dest: [*]u8, src: [*]const u8, block_size: comptime_int, len: usize) void { + const first = @as(*align(1) const @Vector(block_size, u8), src[0..block_size]).*; + const second = @as(*align(1) const @Vector(block_size, u8), src[len - block_size ..][0..block_size]).*; + dest[0..block_size].* = first; + dest[len - block_size ..][0..block_size].* = second; +} + +inline fn copyLong(dest: [*]u8, src: [*]const u8, len: usize) void { + var array: [8]@Vector(32, u8) = undefined; + + inline for (.{ 64, 128, 192, 256 }, 0..) |N, i| { + array[i * 2] = src[(N / 2) - 32 ..][0..32].*; + array[(i * 2) + 1] = src[len - N / 2 ..][0..32].*; + + if (len <= N) { + @branchHint(.unlikely); + for (0..i + 1) |j| { + dest[j * 32 ..][0..32].* = array[j * 2]; + dest[len - ((j * 32) + 32) ..][0..32].* = array[(j * 2) + 1]; + } + return; + } + } +} + +inline fn copyMove(dest: [*]u8, src: [*]const u8, len: usize) void { + if (@intFromPtr(src) >= @intFromPtr(dest)) { + @branchHint(.unlikely); + copyForward(dest, src, len); + } else if (@intFromPtr(src) + len > @intFromPtr(dest)) { + @branchHint(.unlikely); + overlapBwd(dest, src, len); + } else { + copyForward(dest, src, len); + } +} + +inline fn copyForward(dest: [*]u8, src: [*]const u8, len: usize) void { + const tail: @Vector(32, u8) = src[len - 32 ..][0..32].*; + + const N: usize = len & ~@as(usize, 127); + var i: usize = 0; + + while (i < N) : (i += 128) { + dest[i..][0..32].* = src[i..][0..32].*; + dest[i + 32 ..][0..32].* = src[i + 32 ..][0..32].*; + dest[i + 64 ..][0..32].* = src[i + 64 ..][0..32].*; + dest[i + 96 ..][0..32].* = src[i + 96 ..][0..32].*; + } + + if (len - i <= 32) { + @branchHint(.unlikely); + dest[len - 32 ..][0..32].* = tail; + } else { + copyLong(dest[i..], src[i..], len - i); + } +} + +inline fn overlapBwd(dest: [*]u8, src: [*]const u8, len: usize) void { + var array: [5]@Vector(32, u8) = undefined; + array[0] = src[len - 32 ..][0..32].*; + inline for (1..5) |i| array[i] = src[(i - 1) << 5 ..][0..32].*; + + const end: usize = (@intFromPtr(dest) + len - 32) & 31; + const range = len - end; + var s = src + range; + var d = dest + range; + + while (@intFromPtr(s) > @intFromPtr(src + 128)) { + // zig fmt: off + const first = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 32)).*; + const second = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 64)).*; + const third = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 96)).*; + const fourth = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 128)).*; + + @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 32))).* = first; + @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 64))).* = second; + @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 96))).* = third; + @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 128))).* = fourth; + // zig fmt: on + + s -= 128; + d -= 128; + } + + inline for (array[1..], 0..) |vec, i| dest[i * 32 ..][0..32].* = vec; + dest[len - 32 ..][0..32].* = array[0]; +} From d6e1166f1a93e8ed5dd0a74cc0b0daeae5c050a0 Mon Sep 17 00:00:00 2001 From: dweiller <4678790+dweiller@users.noreply.github.com> Date: Mon, 20 Jan 2025 17:27:50 +1100 Subject: [PATCH 2/3] compiler-rt: reduce memmove and memcpy size for ReleaseSmall --- lib/compiler_rt/memcpy.zig | 23 +++++++++++++++++++++-- lib/compiler_rt/memmove.zig | 30 ++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/lib/compiler_rt/memcpy.zig b/lib/compiler_rt/memcpy.zig index 82c41c101e..97188ac65f 100644 --- a/lib/compiler_rt/memcpy.zig +++ b/lib/compiler_rt/memcpy.zig @@ -4,10 +4,29 @@ const builtin = @import("builtin"); comptime { if (builtin.object_format != .c) { - @export(&memcpy, .{ .name = "memcpy", .linkage = common.linkage, .visibility = common.visibility }); + const export_options: std.builtin.ExportOptions = .{ + .name = "memcpy", + .linkage = common.linkage, + .visibility = common.visibility, + }; + + if (builtin.mode == .ReleaseSmall) + @export(&memcpySmall, export_options) + else + @export(&memcpyFast, export_options); } } -fn memcpy(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { +fn memcpySmall(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { + @setRuntimeSafety(builtin.is_test); + + for (0..len) |i| { + dest.?[i] = src.?[i]; + } + + return dest; +} + +fn memcpyFast(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { return @call(.always_inline, @import("memmove.zig").memmove, .{ opt_dest, opt_src, len }); } diff --git a/lib/compiler_rt/memmove.zig b/lib/compiler_rt/memmove.zig index f16abc0e7b..408151e2a3 100644 --- a/lib/compiler_rt/memmove.zig +++ b/lib/compiler_rt/memmove.zig @@ -4,11 +4,37 @@ const builtin = @import("builtin"); comptime { if (builtin.object_format != .c) { - @export(&memmove, .{ .name = "memmove", .linkage = common.linkage, .visibility = common.visibility }); + const export_options: std.builtin.ExportOptions = .{ + .name = "memmove", + .linkage = common.linkage, + .visibility = common.visibility, + }; + + if (builtin.mode == .ReleaseSmall) + @export(&memmoveSmall, export_options) + else + @export(&memmoveFast, export_options); } } -pub fn memmove(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { +fn memmoveSmall(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { + const dest = opt_dest.?; + const src = opt_src.?; + + if (@intFromPtr(dest) < @intFromPtr(src)) { + for (0..len) |i| { + dest[i] = src[i]; + } + } else { + for (0..len) |i| { + dest[len - 1 - i] = src[len - 1 - i]; + } + } + + return dest; +} + +pub fn memmoveFast(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { // a port of https://github.com/facebook/folly/blob/1c8bc50e88804e2a7361a57cd9b551dd10f6c5fd/folly/memcpy.S if (len == 0) { @branchHint(.unlikely); From b7a887f0fb7d166ad93eae62ecd93d1ed173297e Mon Sep 17 00:00:00 2001 From: dweiller <4678790+dweiller@users.noreply.github.com> Date: Mon, 20 Jan 2025 17:43:42 +1100 Subject: [PATCH 3/3] compiler-rt: optimize memcpy The new memcpy function aims to be more generic than the previous implementation which was adapted from an implementation optimized for x86_64 avx2 machines. Even on x86_64 avx2 machines this implementation should be generally be faster due to fewer branches in the small length cases and generating less machine code. Note that the new memcpy function no longer acts as a memmove. --- lib/compiler_rt/memcpy.zig | 185 ++++++++++++++++++++++++++++++++++++- 1 file changed, 183 insertions(+), 2 deletions(-) diff --git a/lib/compiler_rt/memcpy.zig b/lib/compiler_rt/memcpy.zig index 97188ac65f..d67a6e766a 100644 --- a/lib/compiler_rt/memcpy.zig +++ b/lib/compiler_rt/memcpy.zig @@ -1,4 +1,5 @@ const std = @import("std"); +const assert = std.debug.assert; const common = @import("./common.zig"); const builtin = @import("builtin"); @@ -17,6 +18,19 @@ comptime { } } +const Element = if (std.simd.suggestVectorLength(u8)) |vec_size| + @Type(.{ .vector = .{ + .child = u8, + .len = vec_size, + } }) +else + usize; + +comptime { + assert(@sizeOf(Element) >= @alignOf(Element)); + assert(std.math.isPowerOfTwo(@sizeOf(Element))); +} + fn memcpySmall(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { @setRuntimeSafety(builtin.is_test); @@ -27,6 +41,173 @@ fn memcpySmall(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) call return dest; } -fn memcpyFast(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { - return @call(.always_inline, @import("memmove.zig").memmove, .{ opt_dest, opt_src, len }); +fn memcpyFast(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 { + @setRuntimeSafety(builtin.is_test); + + const small_limit = 2 * @sizeOf(Element); + + if (copySmallLength(small_limit, dest.?, src.?, len)) return dest; + + copyForwards(dest.?, src.?, len); + + return dest; +} + +inline fn copySmallLength( + comptime small_limit: comptime_int, + dest: [*]u8, + src: [*]const u8, + len: usize, +) bool { + if (len < 16) { + copyLessThan16(dest, src, len); + return true; + } + + if (comptime 2 < (std.math.log2(small_limit) + 1) / 2) { + if (copy16ToSmallLimit(small_limit, dest, src, len)) return true; + } + + return false; +} + +inline fn copyLessThan16( + dest: [*]u8, + src: [*]const u8, + len: usize, +) void { + @setRuntimeSafety(builtin.is_test); + if (len < 4) { + if (len == 0) return; + dest[0] = src[0]; + dest[len / 2] = src[len / 2]; + dest[len - 1] = src[len - 1]; + return; + } + copyRange4(4, dest, src, len); +} + +inline fn copy16ToSmallLimit( + comptime small_limit: comptime_int, + dest: [*]u8, + src: [*]const u8, + len: usize, +) bool { + @setRuntimeSafety(builtin.is_test); + inline for (2..(std.math.log2(small_limit) + 1) / 2 + 1) |p| { + const limit = 1 << (2 * p); + if (len < limit) { + copyRange4(limit / 4, dest, src, len); + return true; + } + } + return false; +} + +inline fn copyForwards( + noalias dest: [*]u8, + noalias src: [*]const u8, + len: usize, +) void { + @setRuntimeSafety(builtin.is_test); + assert(len >= 2 * @sizeOf(Element)); + + dest[0..@sizeOf(Element)].* = src[0..@sizeOf(Element)].*; + const alignment_offset = @alignOf(Element) - @intFromPtr(src) % @alignOf(Element); + const n = len - alignment_offset; + const d = dest + alignment_offset; + const s = src + alignment_offset; + + copyBlocksAlignedSource(@ptrCast(d), @alignCast(@ptrCast(s)), n); + + // copy last `@sizeOf(Element)` bytes unconditionally, since block copy + // methods only copy a multiple of `@sizeOf(Element)` bytes. + const offset = len - @sizeOf(Element); + dest[offset..][0..@sizeOf(Element)].* = src[offset..][0..@sizeOf(Element)].*; +} + +inline fn copyBlocksAlignedSource( + noalias dest: [*]align(1) Element, + noalias src: [*]const Element, + max_bytes: usize, +) void { + copyBlocks(dest, src, max_bytes); +} + +/// Copies the largest multiple of `@sizeOf(T)` bytes from `src` to `dest`, +/// that is less than `max_bytes` where `T` is the child type of `src` and +/// `dest`. +inline fn copyBlocks( + noalias dest: anytype, + noalias src: anytype, + max_bytes: usize, +) void { + @setRuntimeSafety(builtin.is_test); + + const T = @typeInfo(@TypeOf(dest)).pointer.child; + comptime assert(T == @typeInfo(@TypeOf(src)).pointer.child); + + const loop_count = max_bytes / @sizeOf(T); + + for (dest[0..loop_count], src[0..loop_count]) |*d, s| { + d.* = s; + } +} + +/// copy `len` bytes from `src` to `dest`; `len` must be in the range +/// `[copy_len, 4 * copy_len)`. +inline fn copyRange4( + comptime copy_len: comptime_int, + noalias dest: [*]u8, + noalias src: [*]const u8, + len: usize, +) void { + @setRuntimeSafety(builtin.is_test); + comptime assert(std.math.isPowerOfTwo(copy_len)); + assert(len >= copy_len); + assert(len < 4 * copy_len); + + const a = len & (copy_len * 2); + const b = a / 2; + + const last = len - copy_len; + const pen = last - b; + + dest[0..copy_len].* = src[0..copy_len].*; + dest[b..][0..copy_len].* = src[b..][0..copy_len].*; + dest[pen..][0..copy_len].* = src[pen..][0..copy_len].*; + dest[last..][0..copy_len].* = src[last..][0..copy_len].*; +} + +test { + const S = struct { + fn testFunc(comptime copy_func: anytype) !void { + const max_len = 1024; + var buffer: [max_len + @alignOf(Element) - 1]u8 align(@alignOf(Element)) = undefined; + for (&buffer, 0..) |*b, i| { + b.* = @intCast(i % 97); + } + var dest: [max_len + @alignOf(Element) - 1]u8 align(@alignOf(Element)) = undefined; + + for (0..max_len) |copy_len| { + for (0..@alignOf(Element)) |s_offset| { + for (0..@alignOf(Element)) |d_offset| { + @memset(&dest, 0xff); + const s = buffer[s_offset..][0..copy_len]; + const d = dest[d_offset..][0..copy_len]; + _ = copy_func(@ptrCast(d.ptr), @ptrCast(s.ptr), s.len); + std.testing.expectEqualSlices(u8, s, d) catch |e| { + std.debug.print("error encountered for length={d}, s_offset={d}, d_offset={d}\n", .{ + copy_len, s_offset, d_offset, + }); + return e; + }; + } + } + } + } + }; + + try S.testFunc(memcpySmall); + try S.testFunc(memcpyFast); }