From eafdc5562f5053ecc193041e83d3661ef0744ebb Mon Sep 17 00:00:00 2001
From: Luuk de Gram
Date: Wed, 16 Mar 2022 21:06:02 +0100
Subject: [PATCH 1/2] wasm: Implement 'memcpy' instruction

This implements the `memcpy` instruction and also updates the inline
memcpy calls to use the same implementation. We emit an unrolled loop
when the length is comptime-known, and a runtime loop when the length
is only known at runtime. We also perform feature detection to emit a
single wasm memory.copy instruction when the 'bulk-memory' feature is
enabled (off by default).
---
 src/arch/wasm/CodeGen.zig | 160 ++++++++++++++++++++++++++++++++------
 test/behavior/basic.zig   |   1 -
 2 files changed, 138 insertions(+), 23 deletions(-)

diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig
index 33fb35163a..0835828356 100644
--- a/src/arch/wasm/CodeGen.zig
+++ b/src/arch/wasm/CodeGen.zig
@@ -895,7 +895,7 @@ fn genFunc(self: *Self) InnerError!void {
     try prologue.append(.{ .tag = .i32_sub, .data = .{ .tag = {} } });
     // Get negative stack aligment
     try prologue.append(.{ .tag = .i32_const, .data = .{ .imm32 = @intCast(i32, self.stack_alignment) * -1 } });
-    // Bit and the value to get the new stack pointer to ensure the pointers are aligned with the abi alignment
+    // Bitwise-and the value to get the new stack pointer to ensure the pointers are aligned with the abi alignment
     try prologue.append(.{ .tag = .i32_and, .data = .{ .tag = {} } });
     // store the current stack pointer as the bottom, which will be used to calculate all stack pointer offsets
     try prologue.append(.{ .tag = .local_tee, .data = .{ .label = self.bottom_stack_value.local } });
@@ -1074,22 +1074,123 @@ fn toWasmBits(bits: u16) ?u16 {
 
 /// Performs a copy of bytes for a given type. Copying all bytes
 /// from rhs to lhs.
-///
-/// TODO: Perform feature detection and when bulk_memory is available,
-/// use wasm's mem.copy instruction.
-fn memCopy(self: *Self, ty: Type, lhs: WValue, rhs: WValue) !void {
-    const abi_size = ty.abiSize(self.target);
-    var offset: u32 = 0;
-    const lhs_base = lhs.offset();
-    const rhs_base = rhs.offset();
-    while (offset < abi_size) : (offset += 1) {
-        // get lhs' address to store the result
-        try self.emitWValue(lhs);
-        // load byte from rhs' adress
-        try self.emitWValue(rhs);
-        try self.addMemArg(.i32_load8_u, .{ .offset = rhs_base + offset, .alignment = 1 });
-        // store the result in lhs (we already have its address on the stack)
-        try self.addMemArg(.i32_store8, .{ .offset = lhs_base + offset, .alignment = 1 });
+fn memcpy(self: *Self, dst: WValue, src: WValue, len: WValue) !void {
+    // When bulk_memory is enabled, we lower this to wasm's memory.copy instruction.
+    // If not, we lower it ourselves manually.
+    if (std.Target.wasm.featureSetHas(self.target.cpu.features, .bulk_memory)) {
+        switch (dst) {
+            .stack_offset => try self.emitWValue(try self.buildPointerOffset(dst, 0, .new)),
+            else => try self.emitWValue(dst),
+        }
+        switch (src) {
+            .stack_offset => try self.emitWValue(try self.buildPointerOffset(src, 0, .new)),
+            else => try self.emitWValue(src),
+        }
+        try self.emitWValue(len);
+        try self.addExtended(.memory_copy);
+        return;
+    }
+
+    // When the length is comptime-known, rather than a runtime value, we can optimize the
+    // generated code by unrolling the loop during codegen, rather than emitting a runtime
+    // loop into the binary.
+    switch (len) {
+        .imm32, .imm64 => {
+            const length = switch (len) {
+                .imm32 => |val| val,
+                .imm64 => |val| val,
+                else => unreachable,
+            };
+            var offset: u32 = 0;
+            const lhs_base = dst.offset();
+            const rhs_base = src.offset();
+            while (offset < length) : (offset += 1) {
+                // get dst's address to store the result
+                try self.emitWValue(dst);
+                // load byte from src's address
+                try self.emitWValue(src);
+                switch (self.arch()) {
+                    .wasm32 => {
+                        try self.addMemArg(.i32_load8_u, .{ .offset = rhs_base + offset, .alignment = 1 });
+                        try self.addMemArg(.i32_store8, .{ .offset = lhs_base + offset, .alignment = 1 });
+                    },
+                    .wasm64 => {
+                        try self.addMemArg(.i64_load8_u, .{ .offset = rhs_base + offset, .alignment = 1 });
+                        try self.addMemArg(.i64_store8, .{ .offset = lhs_base + offset, .alignment = 1 });
+                    },
+                    else => unreachable,
+                }
+            }
+        },
+        else => {
+            // TODO: We should probably lower this to a call to compiler_rt
+            // But for now, we implement it manually
+            const offset = try self.allocLocal(Type.usize); // local for counter
+            // outer block to jump to when loop is done
+            try self.startBlock(.block, wasm.block_empty);
+            try self.startBlock(.loop, wasm.block_empty);
+
+            // loop condition (offset == length -> break)
+            {
+                try self.emitWValue(offset);
+                try self.emitWValue(len);
+                switch (self.arch()) {
+                    .wasm32 => try self.addTag(.i32_eq),
+                    .wasm64 => try self.addTag(.i64_eq),
+                    else => unreachable,
+                }
+                try self.addLabel(.br_if, 1); // jump out of loop into outer block (finished)
+            }
+
+            // get dst ptr
+            {
+                try self.emitWValue(dst);
+                try self.emitWValue(offset);
+                switch (self.arch()) {
+                    .wasm32 => try self.addTag(.i32_add),
+                    .wasm64 => try self.addTag(.i64_add),
+                    else => unreachable,
+                }
+            }
+
+            // get src value and also store in dst
+            {
+                try self.emitWValue(src);
+                try self.emitWValue(offset);
+                switch (self.arch()) {
+                    .wasm32 => {
+                        try self.addTag(.i32_add);
+                        try self.addMemArg(.i32_load8_u, .{ .offset = src.offset(), .alignment = 1 });
+                        try self.addMemArg(.i32_store8, .{ .offset = dst.offset(), .alignment = 1 });
+                    },
+                    .wasm64 => {
+                        try self.addTag(.i64_add);
+                        try self.addMemArg(.i64_load8_u, .{ .offset = src.offset(), .alignment = 1 });
+                        try self.addMemArg(.i64_store8, .{ .offset = dst.offset(), .alignment = 1 });
+                    },
+                    else => unreachable,
+                }
+            }
+
+            // increment loop counter
+            {
+                try self.emitWValue(offset);
+                switch (self.arch()) {
+                    .wasm32 => {
+                        try self.addImm32(1);
+                        try self.addTag(.i32_add);
+                    },
+                    .wasm64 => {
+                        try self.addImm64(1);
+                        try self.addTag(.i64_add);
+                    },
+                    else => unreachable,
+                }
+                try self.addLabel(.local_set, offset.local);
+                try self.addLabel(.br, 0); // jump to start of loop
+            }
+            try self.endBlock(); // close off loop block
+            try self.endBlock(); // close off outer block
+        },
     }
 }
 
@@ -1297,6 +1398,8 @@ fn genInst(self: *Self, inst: Air.Inst.Index) !WValue {
         .wasm_memory_size => self.airWasmMemorySize(inst),
         .wasm_memory_grow => self.airWasmMemoryGrow(inst),
 
+        .memcpy => self.airMemcpy(inst),
+
         .add_sat,
         .sub_sat,
         .mul_sat,
@@ -1337,7 +1440,6 @@ fn genInst(self: *Self, inst: Air.Inst.Index) !WValue {
         .ptr_slice_len_ptr,
         .ptr_slice_ptr_ptr,
         .int_to_float,
-        .memcpy,
         .cmpxchg_weak,
         .cmpxchg_strong,
         .fence,
@@ -1519,7 +1621,8 @@ fn store(self: *Self, lhs: WValue, rhs: WValue, ty: Type, offset: u32) InnerErro
                 return self.store(lhs, rhs, err_ty, 0);
             }
 
-            return self.memCopy(ty, lhs, rhs);
+            const len = @intCast(u32, ty.abiSize(self.target));
+            return self.memcpy(lhs, rhs, .{ .imm32 = len });
         },
         .Optional => {
             if (ty.isPtrLikeOptional()) {
@@ -1531,10 +1634,12 @@ fn store(self: *Self, lhs: WValue, rhs: WValue, ty: Type, offset: u32) InnerErro
                 return self.store(lhs, rhs, Type.u8, 0);
             }
 
-            return self.memCopy(ty, lhs, rhs);
+            const len = @intCast(u32, ty.abiSize(self.target));
+            return self.memcpy(lhs, rhs, .{ .imm32 = len });
         },
         .Struct, .Array, .Union, .Vector => {
-            return self.memCopy(ty, lhs, rhs);
+            const len = @intCast(u32, ty.abiSize(self.target));
+            return self.memcpy(lhs, rhs, .{ .imm32 = len });
         },
         .Pointer => {
             if (ty.isSlice()) {
@@ -1549,7 +1654,8 @@ fn store(self: *Self, lhs: WValue, rhs: WValue, ty: Type, offset: u32) InnerErro
             }
         },
         .Int => if (ty.intInfo(self.target).bits > 64) {
-            return self.memCopy(ty, lhs, rhs);
+            const len = @intCast(u32, ty.abiSize(self.target));
+            return self.memcpy(lhs, rhs, .{ .imm32 = len });
         },
         else => {},
     }
@@ -3300,3 +3406,13 @@ fn airFieldParentPtr(self: *Self, inst: Air.Inst.Index) InnerError!WValue {
     try self.addLabel(.local_set, base.local);
     return base;
 }
+
+fn airMemcpy(self: *Self, inst: Air.Inst.Index) InnerError!WValue {
+    const pl_op = self.air.instructions.items(.data)[inst].pl_op;
+    const bin_op = self.air.extraData(Air.Bin, pl_op.payload).data;
+    const dst = try self.resolveInst(pl_op.operand);
+    const src = try self.resolveInst(bin_op.lhs);
+    const len = try self.resolveInst(bin_op.rhs);
+    try self.memcpy(dst, src, len);
+    return WValue{ .none = {} };
+}
diff --git a/test/behavior/basic.zig b/test/behavior/basic.zig
index f22e93008c..bb3232c01c 100644
--- a/test/behavior/basic.zig
+++ b/test/behavior/basic.zig
@@ -340,7 +340,6 @@ fn f2(x: bool) []const u8 {
 test "memcpy and memset intrinsics" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
 
     try testMemcpyMemset();
     // TODO add comptime test coverage

From 215a22541c2a5b9886173dca86e2990ae5c649d1 Mon Sep 17 00:00:00 2001
From: Luuk de Gram
Date: Wed, 16 Mar 2022 22:08:25 +0100
Subject: [PATCH 2/2] wasm: Improve memset implementation

When the length is comptime-known, we emit the byte stores directly
during codegen (an unrolled loop) instead of emitting a runtime loop
into the binary.

This also allows us to easily write 'undefined' to aggregate types. We
now do this for the payload of an error union when setting its error
tag.
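
As a rough sketch, the new dispatch looks like this (simplified; the
helper names `memsetUnrolled` and `memsetRuntimeLoop` are hypothetical
stand-ins for the two branches of the switch in the diff below):

    fn memset(self: *Self, ptr: WValue, len: WValue, value: WValue) InnerError!void {
        switch (len) {
            // length known during codegen: emit one byte store per iteration
            .imm32, .imm64 => try self.memsetUnrolled(ptr, len, value),
            // length only known at runtime: emit a wasm loop into the binary
            else => try self.memsetRuntimeLoop(ptr, len, value),
        }
    }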
---
 src/arch/wasm/CodeGen.zig | 119 +++++++++++++++++++++++++-------------
 1 file changed, 78 insertions(+), 41 deletions(-)

diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig
index 0835828356..141acaea63 100644
--- a/src/arch/wasm/CodeGen.zig
+++ b/src/arch/wasm/CodeGen.zig
@@ -2519,8 +2519,16 @@ fn airWrapErrUnionErr(self: *Self, inst: Air.Inst.Index) InnerError!WValue {
     if (!err_ty.errorUnionPayload().hasRuntimeBits()) return operand;
 
     const err_union = try self.allocStack(err_ty);
-    // TODO: Also write 'undefined' to the payload
     try self.store(err_union, operand, err_ty.errorUnionSet(), 0);
+
+    // write 'undefined' to the payload
+    const err_align = err_ty.abiAlignment(self.target);
+    const set_size = err_ty.errorUnionSet().abiSize(self.target);
+    const offset = mem.alignForwardGeneric(u64, set_size, err_align);
+    const payload_ptr = try self.buildPointerOffset(err_union, offset, .new);
+    const len = @intCast(u32, err_ty.errorUnionPayload().abiSize(self.target));
+    try self.memset(payload_ptr, .{ .imm32 = len }, .{ .imm32 = 0xaaaaaaaa });
+
     return err_union;
 }
 
@@ -2972,7 +2980,7 @@ fn airMemset(self: *Self, inst: Air.Inst.Index) InnerError!WValue {
     const ptr = try self.resolveInst(pl_op.operand);
     const value = try self.resolveInst(bin_op.lhs);
     const len = try self.resolveInst(bin_op.rhs);
-    try self.memSet(ptr, len, value);
+    try self.memset(ptr, len, value);
 
     return WValue{ .none = {} };
 }
@@ -2981,7 +2989,7 @@
 /// When the user has enabled the bulk_memory feature, we lower
 /// this to wasm's memset instruction. When the feature is not present,
 /// we implement it manually.
-fn memSet(self: *Self, ptr: WValue, len: WValue, value: WValue) InnerError!void {
+fn memset(self: *Self, ptr: WValue, len: WValue, value: WValue) InnerError!void {
     // When bulk_memory is enabled, we lower it to wasm's memset instruction.
     // If not, we lower it ourselves
     if (std.Target.wasm.featureSetHas(self.target.cpu.features, .bulk_memory)) {
@@ -2995,45 +3003,74 @@ fn memSet(self: *Self, ptr: WValue, len: WValue, value: WValue) InnerError!void
         return;
     }
 
-    // TODO: We should probably lower this to a call to compiler_rt
-    // But for now, we implement it manually
-    const offset = try self.allocLocal(Type.usize); // local for counter
-    // outer block to jump to when loop is done
-    try self.startBlock(.block, wasm.block_empty);
-    try self.startBlock(.loop, wasm.block_empty);
-    try self.emitWValue(offset);
-    try self.emitWValue(len);
-    switch (self.ptrSize()) {
-        4 => try self.addTag(.i32_eq),
-        8 => try self.addTag(.i64_eq),
-        else => unreachable,
+    // When the length is comptime-known, we unroll the loop at codegen,
+    // rather than emitting a runtime loop into the binary.
+    switch (len) {
+        .imm32, .imm64 => {
+            const length = switch (len) {
+                .imm32 => |val| val,
+                .imm64 => |val| val,
+                else => unreachable,
+            };
+
+            var offset: u32 = 0;
+            const base = ptr.offset();
+            while (offset < length) : (offset += 1) {
+                try self.emitWValue(ptr);
+                try self.emitWValue(value);
+                switch (self.arch()) {
+                    .wasm32 => {
+                        try self.addMemArg(.i32_store8, .{ .offset = base + offset, .alignment = 1 });
+                    },
+                    .wasm64 => {
+                        try self.addMemArg(.i64_store8, .{ .offset = base + offset, .alignment = 1 });
+                    },
+                    else => unreachable,
+                }
+            }
+        },
+        else => {
+            // TODO: We should probably lower this to a call to compiler_rt
+            // But for now, we implement it manually
+            const offset = try self.allocLocal(Type.usize); // local for counter
+            // outer block to jump to when loop is done
+            try self.startBlock(.block, wasm.block_empty);
+            try self.startBlock(.loop, wasm.block_empty);
+            try self.emitWValue(offset);
+            try self.emitWValue(len);
+            switch (self.arch()) {
+                .wasm32 => try self.addTag(.i32_eq),
+                .wasm64 => try self.addTag(.i64_eq),
+                else => unreachable,
+            }
+            try self.addLabel(.br_if, 1); // jump out of loop into outer block (finished)
+            try self.emitWValue(ptr);
+            try self.emitWValue(offset);
+            switch (self.arch()) {
+                .wasm32 => try self.addTag(.i32_add),
+                .wasm64 => try self.addTag(.i64_add),
+                else => unreachable,
+            }
+            try self.emitWValue(value);
+            const mem_store_op: Mir.Inst.Tag = switch (self.arch()) {
+                .wasm32 => .i32_store8,
+                .wasm64 => .i64_store8,
+                else => unreachable,
+            };
+            try self.addMemArg(mem_store_op, .{ .offset = ptr.offset(), .alignment = 1 });
+            try self.emitWValue(offset);
+            try self.addImm32(1);
+            switch (self.arch()) {
+                .wasm32 => try self.addTag(.i32_add),
+                .wasm64 => try self.addTag(.i64_add),
+                else => unreachable,
+            }
+            try self.addLabel(.local_set, offset.local);
+            try self.addLabel(.br, 0); // jump to start of loop
+            try self.endBlock();
+            try self.endBlock();
+        },
     }
-    try self.addLabel(.br_if, 1); // jump out of loop into outer block (finished)
-    try self.emitWValue(ptr);
-    try self.emitWValue(offset);
-    switch (self.arch()) {
-        .wasm32 => try self.addTag(.i32_add),
-        .wasm64 => try self.addTag(.i64_add),
-        else => unreachable,
-    }
-    try self.emitWValue(value);
-    const mem_store_op: Mir.Inst.Tag = switch (self.arch()) {
-        .wasm32 => .i32_store8,
-        .wasm64 => .i64_store8,
-        else => unreachable,
-    };
-    try self.addMemArg(mem_store_op, .{ .offset = ptr.offset(), .alignment = 1 });
-    try self.emitWValue(offset);
-    try self.addImm32(1);
-    switch (self.ptrSize()) {
-        4 => try self.addTag(.i32_add),
-        8 => try self.addTag(.i64_add),
-        else => unreachable,
-    }
-    try self.addLabel(.local_set, offset.local);
-    try self.addLabel(.br, 0); // jump to start of loop
-    try self.endBlock();
-    try self.endBlock();
 }
 
 fn airArrayElemVal(self: *Self, inst: Air.Inst.Index) InnerError!WValue {
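
For context, the behavior test re-enabled by the first patch exercises
code like the following (an illustrative sketch, not the actual test
body; it assumes the three-argument `@memcpy`/`@memset` builtin
signatures of the Zig version these patches target, and the real
coverage lives in `testMemcpyMemset` in test/behavior/basic.zig):

    const std = @import("std");

    test "memcpy and memset lowering" {
        const src = [_]u8{ 1, 2, 3, 4 };
        var dst: [4]u8 = undefined;
        // comptime-known length: the backend unrolls the copy during codegen
        // (or emits a single memory.copy when bulk-memory is enabled)
        @memcpy(&dst, &src, src.len);
        try std.testing.expectEqualSlices(u8, &src, &dst);

        // runtime-known length: the backend emits a wasm loop
        var n: usize = dst.len;
        @memset(&dst, 0, n);
        try std.testing.expectEqual(@as(u8, 0), dst[0]);
    }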