From eafdc5562f5053ecc193041e83d3661ef0744ebb Mon Sep 17 00:00:00 2001
From: Luuk de Gram
Date: Wed, 16 Mar 2022 21:06:02 +0100
Subject: [PATCH 1/2] wasm: Implement 'memcpy' instruction

This implements the `memcpy` instruction and also updates the inline
memcpy calls to use the same implementation. We emit an unrolled loop
when the length is comptime-known, and a runtime loop when the length
is only known at runtime. We also perform feature detection to emit a
single wasm memory.copy instruction when the 'bulk-memory' feature is
enabled (off by default).
---
 src/arch/wasm/CodeGen.zig | 160 ++++++++++++++++++++++++++++++++------
 test/behavior/basic.zig   |   1 -
 2 files changed, 138 insertions(+), 23 deletions(-)

diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig
index 33fb35163a..0835828356 100644
--- a/src/arch/wasm/CodeGen.zig
+++ b/src/arch/wasm/CodeGen.zig
@@ -895,7 +895,7 @@ fn genFunc(self: *Self) InnerError!void {
     try prologue.append(.{ .tag = .i32_sub, .data = .{ .tag = {} } });
     // Get negative stack aligment
     try prologue.append(.{ .tag = .i32_const, .data = .{ .imm32 = @intCast(i32, self.stack_alignment) * -1 } });
-    // Bit and the value to get the new stack pointer to ensure the pointers are aligned with the abi alignment
+    // Bitwise-and the value to get the new stack pointer to ensure the pointers are aligned with the abi alignment
     try prologue.append(.{ .tag = .i32_and, .data = .{ .tag = {} } });
     // store the current stack pointer as the bottom, which will be used to calculate all stack pointer offsets
     try prologue.append(.{ .tag = .local_tee, .data = .{ .label = self.bottom_stack_value.local } });
@@ -1074,22 +1074,123 @@ fn toWasmBits(bits: u16) ?u16 {
 
 /// Performs a copy of bytes for a given type. Copying all bytes
 /// from rhs to lhs.
-///
-/// TODO: Perform feature detection and when bulk_memory is available,
-/// use wasm's mem.copy instruction.
-fn memCopy(self: *Self, ty: Type, lhs: WValue, rhs: WValue) !void {
-    const abi_size = ty.abiSize(self.target);
-    var offset: u32 = 0;
-    const lhs_base = lhs.offset();
-    const rhs_base = rhs.offset();
-    while (offset < abi_size) : (offset += 1) {
-        // get lhs' address to store the result
-        try self.emitWValue(lhs);
-        // load byte from rhs' adress
-        try self.emitWValue(rhs);
-        try self.addMemArg(.i32_load8_u, .{ .offset = rhs_base + offset, .alignment = 1 });
-        // store the result in lhs (we already have its address on the stack)
-        try self.addMemArg(.i32_store8, .{ .offset = lhs_base + offset, .alignment = 1 });
+fn memcpy(self: *Self, dst: WValue, src: WValue, len: WValue) !void {
+    // When bulk_memory is enabled, we lower this to wasm's memory.copy instruction.
+    // If not, we lower it ourselves manually.
+    if (std.Target.wasm.featureSetHas(self.target.cpu.features, .bulk_memory)) {
+        switch (dst) {
+            .stack_offset => try self.emitWValue(try self.buildPointerOffset(dst, 0, .new)),
+            else => try self.emitWValue(dst),
+        }
+        switch (src) {
+            .stack_offset => try self.emitWValue(try self.buildPointerOffset(src, 0, .new)),
+            else => try self.emitWValue(src),
+        }
+        try self.emitWValue(len);
+        try self.addExtended(.memory_copy);
+        return;
+    }
+
+    // When the length is comptime-known, rather than a runtime value, we can optimize the
+    // generated code by unrolling the loop during codegen, rather than emitting a runtime
+    // loop into the binary.
+    switch (len) {
+        .imm32, .imm64 => {
+            const length = switch (len) {
+                .imm32 => |val| val,
+                .imm64 => |val| val,
+                else => unreachable,
+            };
+            var offset: u32 = 0;
+            const lhs_base = dst.offset();
+            const rhs_base = src.offset();
+            while (offset < length) : (offset += 1) {
+                // get dst's address to store the result
+                try self.emitWValue(dst);
+                // load byte from src's address
+                try self.emitWValue(src);
+                switch (self.arch()) {
+                    .wasm32 => {
+                        try self.addMemArg(.i32_load8_u, .{ .offset = rhs_base + offset, .alignment = 1 });
+                        try self.addMemArg(.i32_store8, .{ .offset = lhs_base + offset, .alignment = 1 });
+                    },
+                    .wasm64 => {
+                        try self.addMemArg(.i64_load8_u, .{ .offset = rhs_base + offset, .alignment = 1 });
+                        try self.addMemArg(.i64_store8, .{ .offset = lhs_base + offset, .alignment = 1 });
+                    },
+                    else => unreachable,
+                }
+            }
+        },
+        else => {
+            // TODO: We should probably lower this to a call to compiler_rt
+            // But for now, we implement it manually
+            const offset = try self.allocLocal(Type.usize); // local for counter
+            // outer block to jump to when loop is done
+            try self.startBlock(.block, wasm.block_empty);
+            try self.startBlock(.loop, wasm.block_empty);
+
+            // loop condition (offset == length -> break)
+            {
+                try self.emitWValue(offset);
+                try self.emitWValue(len);
+                switch (self.arch()) {
+                    .wasm32 => try self.addTag(.i32_eq),
+                    .wasm64 => try self.addTag(.i64_eq),
+                    else => unreachable,
+                }
+                try self.addLabel(.br_if, 1); // jump out of loop into outer block (finished)
+            }
+
+            // get dst ptr
+            {
+                try self.emitWValue(dst);
+                try self.emitWValue(offset);
+                switch (self.arch()) {
+                    .wasm32 => try self.addTag(.i32_add),
+                    .wasm64 => try self.addTag(.i64_add),
+                    else => unreachable,
+                }
+            }
+
+            // get src value and also store in dst
+            {
+                try self.emitWValue(src);
+                try self.emitWValue(offset);
+                switch (self.arch()) {
+                    .wasm32 => {
+                        try self.addTag(.i32_add);
+                        try self.addMemArg(.i32_load8_u, .{ .offset = src.offset(), .alignment = 1 });
+                        try self.addMemArg(.i32_store8, .{ .offset = dst.offset(), .alignment = 1 });
+                    },
+                    .wasm64 => {
+                        try self.addTag(.i64_add);
+                        try self.addMemArg(.i64_load8_u, .{ .offset = src.offset(), .alignment = 1 });
+                        try self.addMemArg(.i64_store8, .{ .offset = dst.offset(), .alignment = 1 });
+                    },
+                    else => unreachable,
+                }
+            }
+
+            // increment loop counter
+            {
+                try self.emitWValue(offset);
+                switch (self.arch()) {
+                    .wasm32 => {
+                        try self.addImm32(1);
+                        try self.addTag(.i32_add);
+                    },
+                    .wasm64 => {
+                        try self.addImm64(1);
+                        try self.addTag(.i64_add);
+                    },
+                    else => unreachable,
+                }
+                try self.addLabel(.local_set, offset.local);
+                try self.addLabel(.br, 0); // jump to start of loop
+            }
+            try self.endBlock(); // close off loop block
+            try self.endBlock(); // close off outer block
+        },
     }
 }
 
@@ -1297,6 +1398,8 @@ fn genInst(self: *Self, inst: Air.Inst.Index) !WValue {
         .wasm_memory_size => self.airWasmMemorySize(inst),
         .wasm_memory_grow => self.airWasmMemoryGrow(inst),
 
+        .memcpy => self.airMemcpy(inst),
+
         .add_sat,
         .sub_sat,
         .mul_sat,
@@ -1337,7 +1440,6 @@ fn genInst(self: *Self, inst: Air.Inst.Index) !WValue {
         .ptr_slice_len_ptr,
         .ptr_slice_ptr_ptr,
         .int_to_float,
-        .memcpy,
         .cmpxchg_weak,
         .cmpxchg_strong,
         .fence,
@@ -1519,7 +1621,8 @@ fn store(self: *Self, lhs: WValue, rhs: WValue, ty: Type, offset: u32) InnerErro
                 return self.store(lhs, rhs, err_ty, 0);
             }
 
-            return self.memCopy(ty, lhs, rhs);
+            const len = @intCast(u32, ty.abiSize(self.target));
+            return self.memcpy(lhs, rhs, .{ .imm32 = len });
         },
         .Optional => {
             if (ty.isPtrLikeOptional()) {
@@ -1531,10 +1634,12 @@ fn store(self: *Self, lhs: WValue, rhs: WValue, ty: Type, offset: u32) InnerErro
                 return self.store(lhs, rhs, Type.u8, 0);
             }
 
-            return self.memCopy(ty, lhs, rhs);
+            const len = @intCast(u32, ty.abiSize(self.target));
+            return self.memcpy(lhs, rhs, .{ .imm32 = len });
         },
         .Struct, .Array, .Union, .Vector => {
-            return self.memCopy(ty, lhs, rhs);
+            const len = @intCast(u32, ty.abiSize(self.target));
+            return self.memcpy(lhs, rhs, .{ .imm32 = len });
         },
         .Pointer => {
             if (ty.isSlice()) {
@@ -1549,7 +1654,8 @@ fn store(self: *Self, lhs: WValue, rhs: WValue, ty: Type, offset: u32) InnerErro
             }
         },
         .Int => if (ty.intInfo(self.target).bits > 64) {
-            return self.memCopy(ty, lhs, rhs);
+            const len = @intCast(u32, ty.abiSize(self.target));
+            return self.memcpy(lhs, rhs, .{ .imm32 = len });
         },
         else => {},
     }
@@ -3300,3 +3406,13 @@ fn airFieldParentPtr(self: *Self, inst: Air.Inst.Index) InnerError!WValue {
     try self.addLabel(.local_set, base.local);
     return base;
 }
+
+fn airMemcpy(self: *Self, inst: Air.Inst.Index) InnerError!WValue {
+    const pl_op = self.air.instructions.items(.data)[inst].pl_op;
+    const bin_op = self.air.extraData(Air.Bin, pl_op.payload).data;
+    const dst = try self.resolveInst(pl_op.operand);
+    const src = try self.resolveInst(bin_op.lhs);
+    const len = try self.resolveInst(bin_op.rhs);
+    try self.memcpy(dst, src, len);
+    return WValue{ .none = {} };
+}
diff --git a/test/behavior/basic.zig b/test/behavior/basic.zig
index f22e93008c..bb3232c01c 100644
--- a/test/behavior/basic.zig
+++ b/test/behavior/basic.zig
@@ -340,7 +340,6 @@ fn f2(x: bool) []const u8 {
 test "memcpy and memset intrinsics" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
 
     try testMemcpyMemset();
     // TODO add comptime test coverage

From 215a22541c2a5b9886173dca86e2990ae5c649d1 Mon Sep 17 00:00:00 2001
From: Luuk de Gram
Date: Wed, 16 Mar 2022 22:08:25 +0100
Subject: [PATCH 2/2] wasm: Improve memset implementation

When the length is comptime-known, we emit the byte stores directly
during codegen (an unrolled loop) instead of emitting a runtime loop
into the binary.

This also allows us to easily write 'undefined' to aggregate types. We
now do this for the payload of an error union when setting its error
tag.
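
As a rough sketch, the new dispatch looks like this (simplified; the
helper names `memsetUnrolled` and `memsetRuntimeLoop` are hypothetical
stand-ins for the two branches of the switch in the diff below):

    fn memset(self: *Self, ptr: WValue, len: WValue, value: WValue) InnerError!void {
        switch (len) {
            // length known during codegen: emit one byte store per iteration
            .imm32, .imm64 => try self.memsetUnrolled(ptr, len, value),
            // length only known at runtime: emit a wasm loop into the binary
            else => try self.memsetRuntimeLoop(ptr, len, value),
        }
    }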
---
 src/arch/wasm/CodeGen.zig | 119 +++++++++++++++++++++++++-------------
 1 file changed, 78 insertions(+), 41 deletions(-)

diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig
index 0835828356..141acaea63 100644
--- a/src/arch/wasm/CodeGen.zig
+++ b/src/arch/wasm/CodeGen.zig
@@ -2519,8 +2519,16 @@ fn airWrapErrUnionErr(self: *Self, inst: Air.Inst.Index) InnerError!WValue {
     if (!err_ty.errorUnionPayload().hasRuntimeBits()) return operand;
 
     const err_union = try self.allocStack(err_ty);
-    // TODO: Also write 'undefined' to the payload
     try self.store(err_union, operand, err_ty.errorUnionSet(), 0);
+
+    // write 'undefined' to the payload
+    const err_align = err_ty.abiAlignment(self.target);
+    const set_size = err_ty.errorUnionSet().abiSize(self.target);
+    const offset = mem.alignForwardGeneric(u64, set_size, err_align);
+    const payload_ptr = try self.buildPointerOffset(err_union, offset, .new);
+    const len = @intCast(u32, err_ty.errorUnionPayload().abiSize(self.target));
+    try self.memset(payload_ptr, .{ .imm32 = len }, .{ .imm32 = 0xaaaaaaaa });
+
     return err_union;
 }
 
@@ -2972,7 +2980,7 @@ fn airMemset(self: *Self, inst: Air.Inst.Index) InnerError!WValue {
     const ptr = try self.resolveInst(pl_op.operand);
     const value = try self.resolveInst(bin_op.lhs);
     const len = try self.resolveInst(bin_op.rhs);
-    try self.memSet(ptr, len, value);
+    try self.memset(ptr, len, value);
 
     return WValue{ .none = {} };
 }
@@ -2981,7 +2989,7 @@
 /// When the user has enabled the bulk_memory feature, we lower
 /// this to wasm's memset instruction. When the feature is not present,
 /// we implement it manually.
-fn memSet(self: *Self, ptr: WValue, len: WValue, value: WValue) InnerError!void {
+fn memset(self: *Self, ptr: WValue, len: WValue, value: WValue) InnerError!void {
     // When bulk_memory is enabled, we lower it to wasm's memset instruction.
     // If not, we lower it ourselves
     if (std.Target.wasm.featureSetHas(self.target.cpu.features, .bulk_memory)) {
@@ -2995,45 +3003,74 @@ fn memSet(self: *Self, ptr: WValue, len: WValue, value: WValue) InnerError!void
         return;
     }
 
-    // TODO: We should probably lower this to a call to compiler_rt
-    // But for now, we implement it manually
-    const offset = try self.allocLocal(Type.usize); // local for counter
-    // outer block to jump to when loop is done
-    try self.startBlock(.block, wasm.block_empty);
-    try self.startBlock(.loop, wasm.block_empty);
-    try self.emitWValue(offset);
-    try self.emitWValue(len);
-    switch (self.ptrSize()) {
-        4 => try self.addTag(.i32_eq),
-        8 => try self.addTag(.i64_eq),
-        else => unreachable,
+    // When the length is comptime-known, we unroll the loop at codegen,
+    // rather than emitting a runtime loop into the binary.
+    switch (len) {
+        .imm32, .imm64 => {
+            const length = switch (len) {
+                .imm32 => |val| val,
+                .imm64 => |val| val,
+                else => unreachable,
+            };
+
+            var offset: u32 = 0;
+            const base = ptr.offset();
+            while (offset < length) : (offset += 1) {
+                try self.emitWValue(ptr);
+                try self.emitWValue(value);
+                switch (self.arch()) {
+                    .wasm32 => {
+                        try self.addMemArg(.i32_store8, .{ .offset = base + offset, .alignment = 1 });
+                    },
+                    .wasm64 => {
+                        try self.addMemArg(.i64_store8, .{ .offset = base + offset, .alignment = 1 });
+                    },
+                    else => unreachable,
+                }
+            }
+        },
+        else => {
+            // TODO: We should probably lower this to a call to compiler_rt
+            // But for now, we implement it manually
+            const offset = try self.allocLocal(Type.usize); // local for counter
+            // outer block to jump to when loop is done
+            try self.startBlock(.block, wasm.block_empty);
+            try self.startBlock(.loop, wasm.block_empty);
+            try self.emitWValue(offset);
+            try self.emitWValue(len);
+            switch (self.arch()) {
+                .wasm32 => try self.addTag(.i32_eq),
+                .wasm64 => try self.addTag(.i64_eq),
+                else => unreachable,
+            }
+            try self.addLabel(.br_if, 1); // jump out of loop into outer block (finished)
+            try self.emitWValue(ptr);
+            try self.emitWValue(offset);
+            switch (self.arch()) {
+                .wasm32 => try self.addTag(.i32_add),
+                .wasm64 => try self.addTag(.i64_add),
+                else => unreachable,
+            }
+            try self.emitWValue(value);
+            const mem_store_op: Mir.Inst.Tag = switch (self.arch()) {
+                .wasm32 => .i32_store8,
+                .wasm64 => .i64_store8,
+                else => unreachable,
+            };
+            try self.addMemArg(mem_store_op, .{ .offset = ptr.offset(), .alignment = 1 });
+            try self.emitWValue(offset);
+            try self.addImm32(1);
+            switch (self.arch()) {
+                .wasm32 => try self.addTag(.i32_add),
+                .wasm64 => try self.addTag(.i64_add),
+                else => unreachable,
+            }
+            try self.addLabel(.local_set, offset.local);
+            try self.addLabel(.br, 0); // jump to start of loop
+            try self.endBlock();
+            try self.endBlock();
+        },
     }
-    try self.addLabel(.br_if, 1); // jump out of loop into outer block (finished)
-    try self.emitWValue(ptr);
-    try self.emitWValue(offset);
-    switch (self.arch()) {
-        .wasm32 => try self.addTag(.i32_add),
-        .wasm64 => try self.addTag(.i64_add),
-        else => unreachable,
-    }
-    try self.emitWValue(value);
-    const mem_store_op: Mir.Inst.Tag = switch (self.arch()) {
-        .wasm32 => .i32_store8,
-        .wasm64 => .i64_store8,
-        else => unreachable,
-    };
-    try self.addMemArg(mem_store_op, .{ .offset = ptr.offset(), .alignment = 1 });
-    try self.emitWValue(offset);
-    try self.addImm32(1);
-    switch (self.ptrSize()) {
-        4 => try self.addTag(.i32_add),
-        8 => try self.addTag(.i64_add),
-        else => unreachable,
-    }
-    try self.addLabel(.local_set, offset.local);
-    try self.addLabel(.br, 0); // jump to start of loop
-    try self.endBlock();
-    try self.endBlock();
 }
 
 fn airArrayElemVal(self: *Self, inst: Air.Inst.Index) InnerError!WValue {
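
For context, the behavior test re-enabled by the first patch exercises
code like the following (an illustrative sketch, not the actual test
body; it assumes the three-argument `@memcpy`/`@memset` builtin
signatures of the Zig version these patches target, and the real
coverage lives in `testMemcpyMemset` in test/behavior/basic.zig):

    const std = @import("std");

    test "memcpy and memset lowering" {
        const src = [_]u8{ 1, 2, 3, 4 };
        var dst: [4]u8 = undefined;
        // comptime-known length: the backend unrolls the copy during codegen
        // (or emits a single memory.copy when bulk-memory is enabled)
        @memcpy(&dst, &src, src.len);
        try std.testing.expectEqualSlices(u8, &src, &dst);

        // runtime-known length: the backend emits a wasm loop
        var n: usize = dst.len;
        @memset(&dst, 0, n);
        try std.testing.expectEqual(@as(u8, 0), dst[0]);
    }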