From 1bbfa36b76271e907cac88e83cec8dee1e3d69f7 Mon Sep 17 00:00:00 2001
From: Andrew Kelley
Date: Wed, 29 Jul 2020 00:08:43 -0700
Subject: [PATCH] stage2: improved codegen

* multiple returns jump to one canonical function exitlude. This is in
  preparation for the defer feature.
* simple elision of trivial jump relocs.
* omit prelude/exitlude for naked calling convention functions.
* fix not switching on arch for prelude/exitlude
* fix swapped registers when setting stack mem from a register
---
 src-self-hosted/codegen.zig | 114 ++++++++++++++++++++++++++++--------
 1 file changed, 88 insertions(+), 26 deletions(-)

diff --git a/src-self-hosted/codegen.zig b/src-self-hosted/codegen.zig
index 40fb6c5407..6880d6dbf3 100644
--- a/src-self-hosted/codegen.zig
+++ b/src-self-hosted/codegen.zig
@@ -214,6 +214,11 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         src: usize,
         stack_align: u32,
 
+        /// The value is an offset into the `Function` `code` from the beginning.
+        /// To perform the reloc, write 32-bit signed little-endian integer
+        /// which is a relative jump, based on the address following the reloc.
+        exitlude_jump_relocs: std.ArrayListUnmanaged(usize) = .{},
+
         /// Whenever there is a runtime branch, we push a Branch onto this stack,
         /// and pop it off when the runtime branch joins. This provides an "overlay"
         /// of the table of mappings from instructions to `MCValue` from within the branch.
@@ -376,6 +381,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                 .src = src,
                 .stack_align = undefined,
             };
+            defer function.exitlude_jump_relocs.deinit(bin_file.allocator);
 
             var call_info = function.resolveCallingConventionValues(src, fn_type) catch |err| switch (err) {
                 error.CodegenFail => return Result{ .fail = function.err_msg.? },
@@ -401,29 +407,78 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
         }
 
         fn gen(self: *Self) !void {
-            try self.code.ensureCapacity(self.code.items.len + 11);
+            switch (arch) {
+                .x86_64 => {
+                    try self.code.ensureCapacity(self.code.items.len + 11);
 
-            // TODO omit this for naked functions
-            // push rbp
-            // mov rbp, rsp
-            self.code.appendSliceAssumeCapacity(&[_]u8{ 0x55, 0x48, 0x89, 0xe5 });
+                    const cc = self.fn_type.fnCallingConvention();
+                    if (cc != .Naked) {
+                        // We want to subtract the aligned stack frame size from rsp here, but we don't
+                        // yet know how big it will be, so we leave room for a 4-byte stack size.
+                        // TODO During semantic analysis, check if there are no function calls. If there
+                        // are none, here we can omit the part where we subtract and then add rsp.
+                        self.code.appendSliceAssumeCapacity(&[_]u8{
+                            // push rbp
+                            0x55,
+                            // mov rbp, rsp
+                            0x48,
+                            0x89,
+                            0xe5,
+                            // sub rsp, imm32 (with reloc)
+                            0x48,
+                            0x81,
+                            0xec,
+                        });
+                        const reloc_index = self.code.items.len;
+                        self.code.items.len += 4;
 
-            // sub rsp, x
-            const stack_end = self.branch_stack.items[0].max_end_stack;
-            if (stack_end > math.maxInt(i32)) {
-                return self.fail(self.src, "too much stack used in call parameters", .{});
-            } else if (stack_end > math.maxInt(i8)) {
-                // 48 83 ec xx sub rsp,0x10
-                self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x81, 0xec });
-                const x = @intCast(u32, stack_end);
-                mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), x);
-            } else if (stack_end != 0) {
-                // 48 81 ec xx xx xx xx sub rsp,0x80
-                const x = @intCast(u8, stack_end);
-                self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x83, 0xec, x });
+                        try self.genBody(self.mod_fn.analysis.success);
+
+                        const stack_end = self.branch_stack.items[0].max_end_stack;
+                        if (stack_end > math.maxInt(i32))
+                            return self.fail(self.src, "too much stack used in call parameters", .{});
+                        const aligned_stack_end = mem.alignForward(stack_end, self.stack_align);
+                        mem.writeIntLittle(u32, self.code.items[reloc_index..][0..4], @intCast(u32, aligned_stack_end));
+
+                        if (self.code.items.len >= math.maxInt(i32)) {
+                            return self.fail(self.src, "unable to perform relocation: jump too far", .{});
+                        }
+                        for (self.exitlude_jump_relocs.items) |jmp_reloc| {
+                            const amt = self.code.items.len - (jmp_reloc + 4);
+                            // If it wouldn't jump at all, elide it.
+                            if (amt == 0) {
+                                self.code.items.len -= 5;
+                                continue;
+                            }
+                            const s32_amt = @intCast(i32, amt);
+                            mem.writeIntLittle(i32, self.code.items[jmp_reloc..][0..4], s32_amt);
+                        }
+
+                        try self.code.ensureCapacity(self.code.items.len + 9);
+                        // add rsp, x
+                        if (aligned_stack_end > math.maxInt(i8)) {
+                            // example: 48 81 c4 ff ff ff 7f add rsp,0x7fffffff
+                            self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x81, 0xc4 });
+                            const x = @intCast(u32, aligned_stack_end);
+                            mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), x);
+                        } else if (aligned_stack_end != 0) {
+                            // example: 48 83 c4 7f add rsp,0x7f
+                            const x = @intCast(u8, aligned_stack_end);
+                            self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x83, 0xc4, x });
+                        }
+
+                        self.code.appendSliceAssumeCapacity(&[_]u8{
+                            0x5d, // pop rbp
+                            0xc3, // ret
+                        });
+                    } else {
+                        try self.genBody(self.mod_fn.analysis.success);
+                    }
+                },
+                else => {
+                    try self.genBody(self.mod_fn.analysis.success);
+                },
             }
-
-            try self.genBody(self.mod_fn.analysis.success);
         }
 
         fn genBody(self: *Self, body: ir.Body) InnerError!void {
@@ -987,10 +1042,12 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                     try self.code.append(0xc3); // ret
                 },
                 .x86_64 => {
-                    try self.code.appendSlice(&[_]u8{
-                        0x5d, // pop rbp
-                        0xc3, // ret
-                    });
+                    // TODO when implementing defer, this will need to jump to the appropriate defer expression.
+                    // TODO optimization opportunity: figure out when we can emit this as a 2 byte instruction
+                    // which is available if the jump is 127 bytes or less forward.
+                    try self.code.resize(self.code.items.len + 5);
+                    self.code.items[self.code.items.len - 5] = 0xe9; // jmp rel32
+                    try self.exitlude_jump_relocs.append(self.gpa, self.code.items.len - 4);
                 },
                 else => return self.fail(src, "TODO implement return for {}", .{self.target.cpu.arch}),
             }
@@ -1130,6 +1187,11 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
            switch (reloc) {
                .rel32 => |pos| {
                    const amt = self.code.items.len - (pos + 4);
+                    // If it wouldn't jump at all, elide it.
+                    if (amt == 0) {
+                        self.code.items.len -= 5;
+                        return;
+                    }
                    const s32_amt = math.cast(i32, amt) catch
                        return self.fail(src, "unable to perform relocation: jump too far", .{});
                    mem.writeIntLittle(i32, self.code.items[pos..][0..4], s32_amt);
@@ -1296,13 +1358,13 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                        const reg_id: u8 = @truncate(u3, reg.id());
                        if (stack_offset <= 128) {
                            // example: 48 89 55 7f mov QWORD PTR [rbp+0x7f],rdx
-                            const RM = @as(u8, 0b01_101_000) | reg_id;
+                            const RM = @as(u8, 0b01_000_101) | (reg_id << 3);
                            const negative_offset = @intCast(i8, -@intCast(i32, stack_offset));
                            const twos_comp = @bitCast(u8, negative_offset);
                            self.code.appendSliceAssumeCapacity(&[_]u8{ 0x89, RM, twos_comp });
                        } else if (stack_offset <= 2147483648) {
                            // example: 48 89 95 80 00 00 00 mov QWORD PTR [rbp+0x80],rdx
-                            const RM = @as(u8, 0b10_101_000) | reg_id;
+                            const RM = @as(u8, 0b10_000_101) | (reg_id << 3);
                            const negative_offset = @intCast(i32, -@intCast(i33, stack_offset));
                            const twos_comp = @bitCast(u32, negative_offset);
                            self.code.appendSliceAssumeCapacity(&[_]u8{ 0x89, RM });
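
The two RM fixes in the final hunk follow from the x86 ModRM byte layout: mod occupies bits 7..6, reg bits 5..3, and r/m bits 2..0. For a store of a 64-bit register to an rbp-relative slot (the 0x89 opcode emitted here), the source register belongs in the reg field, while rbp-relative addressing with an 8-bit displacement uses mod = 0b01 and r/m = 0b101, so the old constants had the reg and r/m fields swapped. Below is a minimal sketch of that encoding in the style of the surrounding code; the modRm helper and the test are illustrative only and are not part of the patch.

    const std = @import("std");
    const assert = std.debug.assert;

    /// Assemble a ModRM byte: mod in bits 7..6, reg in bits 5..3, r/m in bits 2..0.
    fn modRm(mod: u2, reg: u3, rm: u3) u8 {
        return (@as(u8, mod) << 6) | (@as(u8, reg) << 3) | @as(u8, rm);
    }

    test "ModRM for storing rdx to an rbp-relative stack slot" {
        // Illustrative only: rdx has register id 2; [rbp+disp8] addressing is
        // mod = 0b01 with r/m = 0b101. The result is the 0x55 byte shown in the
        // example comment "48 89 55 7f" above.
        const reg_id: u3 = 2;
        const rm_byte = modRm(0b01, reg_id, 0b101);
        assert(rm_byte == @as(u8, 0b01_000_101) | (@as(u8, reg_id) << 3));
        assert(rm_byte == 0x55);
    }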