diff --git a/lib/std/math/big/int_test.zig b/lib/std/math/big/int_test.zig
index 3f625f0be1..558ada3374 100644
--- a/lib/std/math/big/int_test.zig
+++ b/lib/std/math/big/int_test.zig
@@ -918,7 +918,6 @@ test "big.int mul multi-single" {
 
 test "big.int mul multi-multi" {
     if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     var op1: u256 = 0x998888efefefefefefefef;
     var op2: u256 = 0x333000abababababababab;
@@ -1042,7 +1041,6 @@ test "big.int mulWrap single-single signed" {
 
 test "big.int mulWrap multi-multi unsigned" {
     if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     var op1: u256 = 0x998888efefefefefefefef;
     var op2: u256 = 0x333000abababababababab;
diff --git a/lib/std/mem.zig b/lib/std/mem.zig
index a2a8696402..feb41aedeb 100644
--- a/lib/std/mem.zig
+++ b/lib/std/mem.zig
@@ -635,7 +635,6 @@ test "lessThan" {
 const backend_can_use_eql_bytes = switch (builtin.zig_backend) {
     // The SPIR-V backend does not support the optimized path yet.
     .stage2_spirv64 => false,
-    .stage2_x86_64 => !std.Target.x86.featureSetHas(builtin.cpu.features, .avx2),
     else => true,
 };
 
diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig
index b3aaa5d1d3..e547b797ac 100644
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@@ -5727,30 +5727,13 @@ fn airClz(self: *Self, inst: Air.Inst.Index) !void {
     const mod = self.bin_file.comp.module.?;
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const result = result: {
+        try self.spillEflagsIfOccupied();
+
         const dst_ty = self.typeOfIndex(inst);
         const src_ty = self.typeOf(ty_op.operand);
         if (src_ty.zigTypeTag(mod) == .Vector) return self.fail("TODO implement airClz for {}", .{
             src_ty.fmt(mod),
         });
-        const src_bits: u32 = @intCast(src_ty.bitSize(mod));
-
-        const has_lzcnt = self.hasFeature(.lzcnt);
-        if (src_bits > 64 and !has_lzcnt) {
-            var callee_buf: ["__clz?i2".len]u8 = undefined;
-            const result = try self.genCall(.{ .lib = .{
-                .return_type = .i32_type,
-                .param_types = &.{src_ty.toIntern()},
-                .callee = std.fmt.bufPrint(&callee_buf, "__clz{c}i2", .{
-                    intCompilerRtAbiName(src_bits),
-                }) catch unreachable,
-            } }, &.{src_ty}, &.{.{ .air_ref = ty_op.operand }});
-            if (src_bits < 128) try self.asmRegisterImmediate(
-                .{ ._, .sub },
-                result.register,
-                Immediate.u(128 - src_bits),
-            );
-            break :result result;
-        }
 
         const src_mcv = try self.resolveInst(ty_op.operand);
         const mat_src_mcv = switch (src_mcv) {
@@ -5768,6 +5751,61 @@ fn airClz(self: *Self, inst: Air.Inst.Index) !void {
         const dst_lock = self.register_manager.lockRegAssumeUnused(dst_reg);
         defer self.register_manager.unlockReg(dst_lock);
 
+        const abi_size: u31 = @intCast(src_ty.abiSize(mod));
+        const src_bits: u31 = @intCast(src_ty.bitSize(mod));
+        const has_lzcnt = self.hasFeature(.lzcnt);
+        if (src_bits > @as(u32, if (has_lzcnt) 128 else 64)) {
+            const limbs_len = math.divCeil(u32, abi_size, 8) catch unreachable;
+            const extra_bits = abi_size * 8 - src_bits;
+
+            const index_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+            const index_lock = self.register_manager.lockRegAssumeUnused(index_reg);
+            defer self.register_manager.unlockReg(index_lock);
+
+            try self.asmRegisterImmediate(.{ ._, .mov }, index_reg.to32(), Immediate.u(limbs_len));
+            switch (extra_bits) {
+                1 => try self.asmRegisterRegister(.{ ._, .xor }, dst_reg.to32(), dst_reg.to32()),
+                else => try self.asmRegisterImmediate(
+                    .{ ._, .mov },
+                    dst_reg.to32(),
+                    Immediate.s(@as(i32, extra_bits) - 1),
+                ),
+            }
+            const loop: Mir.Inst.Index = @intCast(self.mir_instructions.len);
+            try self.asmRegisterRegister(.{ ._, .@"test" }, index_reg.to32(), index_reg.to32());
+            const zero = try self.asmJccReloc(.z, undefined);
+            if (self.hasFeature(.slow_incdec)) {
+                try self.asmRegisterImmediate(.{ ._, .sub }, index_reg.to32(), Immediate.u(1));
+            } else {
+                try self.asmRegister(.{ ._, .dec }, index_reg.to32());
+            }
+            try self.asmMemoryImmediate(.{ ._, .cmp }, .{
+                .base = .{ .frame = src_mcv.load_frame.index },
+                .mod = .{ .rm = .{
+                    .size = .qword,
+                    .index = index_reg.to64(),
+                    .scale = .@"8",
+                    .disp = src_mcv.load_frame.off,
+                } },
+            }, Immediate.u(0));
+            _ = try self.asmJccReloc(.e, loop);
+            try self.asmRegisterMemory(.{ ._, .bsr }, dst_reg.to64(), .{
+                .base = .{ .frame = src_mcv.load_frame.index },
+                .mod = .{ .rm = .{
+                    .size = .qword,
+                    .index = index_reg.to64(),
+                    .scale = .@"8",
+                    .disp = src_mcv.load_frame.off,
+                } },
+            });
+            self.performReloc(zero);
+            try self.asmRegisterImmediate(.{ ._l, .sh }, index_reg.to32(), Immediate.u(6));
+            try self.asmRegisterRegister(.{ ._, .add }, index_reg.to32(), dst_reg.to32());
+            try self.asmRegisterImmediate(.{ ._, .mov }, dst_reg.to32(), Immediate.u(src_bits - 1));
+            try self.asmRegisterRegister(.{ ._, .sub }, dst_reg.to32(), index_reg.to32());
+            break :result dst_mcv;
+        }
+
         if (has_lzcnt) {
             if (src_bits <= 8) {
                 const wide_reg = try self.copyToTmpRegister(src_ty, mat_src_mcv);
@@ -5785,7 +5823,8 @@ fn airClz(self: *Self, inst: Air.Inst.Index) !void {
                 if (extra_bits > 0) {
                     try self.genBinOpMir(.{ ._, .sub }, dst_ty, dst_mcv, .{ .immediate = extra_bits });
                 }
-            } else if (src_bits <= 128) {
+            } else {
+                assert(src_bits <= 128);
                 const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
                 const tmp_mcv = MCValue{ .register = tmp_reg };
                 const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
@@ -5818,12 +5857,12 @@ fn airClz(self: *Self, inst: Air.Inst.Index) !void {
                     dst_mcv,
                     .{ .immediate = 128 - src_bits },
                 );
-            } else return self.fail("TODO airClz of {}", .{src_ty.fmt(mod)});
+            }
             break :result dst_mcv;
         }
-        if (src_bits > 64)
-            return self.fail("TODO airClz of {}", .{src_ty.fmt(mod)});
+        assert(src_bits <= 64);
+        const cmov_abi_size = @max(@as(u32, @intCast(dst_ty.abiSize(mod))), 2);
         if (math.isPowerOfTwo(src_bits)) {
             const imm_reg = try self.copyToTmpRegister(dst_ty, .{
                 .immediate = src_bits ^ (src_bits - 1),
             });
@@ -5840,7 +5879,6 @@ fn airClz(self: *Self, inst: Air.Inst.Index) !void {
                 try self.genBinOpMir(.{ ._, .bsr }, Type.u16, dst_mcv, .{ .register = wide_reg });
             } else try self.genBinOpMir(.{ ._, .bsr }, src_ty, dst_mcv, mat_src_mcv);
 
-            const cmov_abi_size = @max(@as(u32, @intCast(dst_ty.abiSize(mod))), 2);
             try self.asmCmovccRegisterRegister(
                 .z,
                 registerAlias(dst_reg, cmov_abi_size),
@@ -5867,7 +5905,6 @@ fn airClz(self: *Self, inst: Air.Inst.Index) !void {
                 .{ .register = wide_reg },
             );
 
-            const cmov_abi_size = @max(@as(u32, @intCast(dst_ty.abiSize(mod))), 2);
             try self.asmCmovccRegisterRegister(
                 .nz,
                 registerAlias(imm_reg, cmov_abi_size),
@@ -5886,24 +5923,13 @@ fn airCtz(self: *Self, inst: Air.Inst.Index) !void {
     const mod = self.bin_file.comp.module.?;
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const result = result: {
+        try self.spillEflagsIfOccupied();
+
         const dst_ty = self.typeOfIndex(inst);
         const src_ty = self.typeOf(ty_op.operand);
-        if (src_ty.zigTypeTag(mod) == .Vector) return self.fail("TODO implement airClz for {}", .{
+        if (src_ty.zigTypeTag(mod) == .Vector) return self.fail("TODO implement airCtz for {}", .{
             src_ty.fmt(mod),
         });
-        const src_bits: u32 = @intCast(src_ty.bitSize(mod));
-
-        const has_bmi = self.hasFeature(.bmi);
-        if (src_bits > 64 and !has_bmi) {
-            var callee_buf: ["__ctz?i2".len]u8 = undefined;
-            break :result try self.genCall(.{ .lib = .{
-                .return_type = .i32_type,
-                .param_types = &.{src_ty.toIntern()},
-                .callee = std.fmt.bufPrint(&callee_buf, "__ctz{c}i2", .{
-                    intCompilerRtAbiName(src_bits),
-                }) catch unreachable,
-            } }, &.{src_ty}, &.{.{ .air_ref = ty_op.operand }});
-        }
 
         const src_mcv = try self.resolveInst(ty_op.operand);
         const mat_src_mcv = switch (src_mcv) {
@@ -5921,8 +5947,62 @@ fn airCtz(self: *Self, inst: Air.Inst.Index) !void {
         const dst_lock = self.register_manager.lockReg(dst_reg);
         defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
 
+        const abi_size: u31 = @intCast(src_ty.abiSize(mod));
+        const src_bits: u31 = @intCast(src_ty.bitSize(mod));
+        const has_bmi = self.hasFeature(.bmi);
+        if (src_bits > @as(u32, if (has_bmi) 128 else 64)) {
+            const limbs_len = math.divCeil(u32, abi_size, 8) catch unreachable;
+            const extra_bits = abi_size * 8 - src_bits;
+
+            const index_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+            const index_lock = self.register_manager.lockRegAssumeUnused(index_reg);
+            defer self.register_manager.unlockReg(index_lock);
+
+            try self.asmRegisterImmediate(.{ ._, .mov }, index_reg.to32(), Immediate.s(-1));
+            switch (extra_bits) {
+                0 => try self.asmRegisterRegister(.{ ._, .xor }, dst_reg.to32(), dst_reg.to32()),
+                1 => try self.asmRegisterRegister(.{ ._, .mov }, dst_reg.to32(), dst_reg.to32()),
+                else => try self.asmRegisterImmediate(
+                    .{ ._, .mov },
+                    dst_reg.to32(),
+                    Immediate.s(-@as(i32, extra_bits)),
+                ),
+            }
+            const loop: Mir.Inst.Index = @intCast(self.mir_instructions.len);
+            if (self.hasFeature(.slow_incdec)) {
+                try self.asmRegisterImmediate(.{ ._, .add }, index_reg.to32(), Immediate.u(1));
+            } else {
+                try self.asmRegister(.{ ._, .inc }, index_reg.to32());
+            }
+            try self.asmRegisterImmediate(.{ ._, .cmp }, index_reg.to32(), Immediate.u(limbs_len));
+            const zero = try self.asmJccReloc(.nb, undefined);
+            try self.asmMemoryImmediate(.{ ._, .cmp }, .{
+                .base = .{ .frame = src_mcv.load_frame.index },
+                .mod = .{ .rm = .{
+                    .size = .qword,
+                    .index = index_reg.to64(),
+                    .scale = .@"8",
+                    .disp = src_mcv.load_frame.off,
+                } },
+            }, Immediate.u(0));
+            _ = try self.asmJccReloc(.e, loop);
+            try self.asmRegisterMemory(.{ ._, .bsf }, dst_reg.to64(), .{
+                .base = .{ .frame = src_mcv.load_frame.index },
+                .mod = .{ .rm = .{
+                    .size = .qword,
+                    .index = index_reg.to64(),
+                    .scale = .@"8",
+                    .disp = src_mcv.load_frame.off,
+                } },
+            });
+            self.performReloc(zero);
+            try self.asmRegisterImmediate(.{ ._l, .sh }, index_reg.to32(), Immediate.u(6));
+            try self.asmRegisterRegister(.{ ._, .add }, dst_reg.to32(), index_reg.to32());
+            break :result dst_mcv;
+        }
+
         const wide_ty = if (src_bits <= 8) Type.u16 else src_ty;
-        if (self.hasFeature(.bmi)) {
+        if (has_bmi) {
             if (src_bits <= 64) {
                 const extra_bits = self.regExtraBits(src_ty) + @as(u64, if (src_bits <= 8) 8 else 0);
                 const masked_mcv = if (extra_bits > 0) masked: {
@@ -5942,7 +6022,8 @@ fn airCtz(self: *Self, inst: Air.Inst.Index) !void {
                     break :masked tmp_mcv;
                 } else mat_src_mcv;
                 try self.genBinOpMir(.{ ._, .tzcnt }, wide_ty, dst_mcv, masked_mcv);
-            } else if (src_bits <= 128) {
+            } else {
+                assert(src_bits <= 128);
                 const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
                 const tmp_mcv = MCValue{ .register = tmp_reg };
                 const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
@@ -5970,12 +6051,11 @@ fn airCtz(self: *Self, inst: Air.Inst.Index) !void {
                 try self.genBinOpMir(.{ ._, .add }, dst_ty, dst_mcv, .{ .immediate = 64 });
                 try self.genBinOpMir(.{ ._, .tzcnt }, Type.u64, tmp_mcv, lo_mat_src_mcv);
                 try self.asmCmovccRegisterRegister(.nc, dst_reg.to32(), tmp_reg.to32());
-            } else return self.fail("TODO airCtz of {}", .{src_ty.fmt(mod)});
+            }
             break :result dst_mcv;
         }
 
-        if (src_bits > 64) return self.fail("TODO airCtz of {}", .{src_ty.fmt(mod)});
-
+        assert(src_bits <= 64);
         const width_reg = try self.copyToTmpRegister(dst_ty, .{ .immediate = src_bits });
         const width_lock = self.register_manager.lockRegAssumeUnused(width_reg);
         defer self.register_manager.unlockReg(width_lock);
@@ -6158,7 +6238,7 @@ fn genByteSwap(
 ) !MCValue {
     const mod = self.bin_file.comp.module.?;
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
-    const have_movbe = self.hasFeature(.movbe);
+    const has_movbe = self.hasFeature(.movbe);
     if (src_ty.zigTypeTag(mod) == .Vector) return self.fail(
         "TODO implement genByteSwap for {}",
         .{src_ty.fmt(mod)},
@@ -6206,11 +6286,11 @@ fn genByteSwap(
             for (dst_regs, 0..) |dst_reg, limb_index| {
                 if (src_mcv.isMemory()) {
                     try self.asmRegisterMemory(
-                        .{ ._, if (have_movbe) .movbe else .mov },
+                        .{ ._, if (has_movbe) .movbe else .mov },
                         dst_reg.to64(),
                         try src_mcv.address().offset(@intCast(limb_index * 8)).deref().mem(self, .qword),
                     );
-                    if (!have_movbe) try self.asmRegister(.{ ._, .bswap }, dst_reg.to64());
+                    if (!has_movbe) try self.asmRegister(.{ ._, .bswap }, dst_reg.to64());
                 } else {
                     try self.asmRegisterRegister(
                         .{ ._, .mov },
@@ -6227,9 +6307,8 @@ fn genByteSwap(
 
             const temp_regs =
                 try self.register_manager.allocRegs(4, .{null} ** 4, abi.RegisterClass.gp);
-            const temp_locks = self.register_manager.lockRegs(4, temp_regs);
-            defer for (temp_locks) |temp_lock| if (temp_lock) |lock|
-                self.register_manager.unlockReg(lock);
+            const temp_locks = self.register_manager.lockRegsAssumeUnused(4, temp_regs);
+            defer for (temp_locks) |lock| self.register_manager.unlockReg(lock);
 
             const dst_mcv = try self.allocRegOrMem(inst, false);
             try self.asmRegisterRegister(.{ ._, .xor }, temp_regs[0].to32(), temp_regs[0].to32());
@@ -6241,7 +6320,7 @@ fn genByteSwap(
 
             const loop: Mir.Inst.Index = @intCast(self.mir_instructions.len);
             try self.asmRegisterMemory(
-                .{ ._, if (have_movbe) .movbe else .mov },
+                .{ ._, if (has_movbe) .movbe else .mov },
                 temp_regs[2].to64(),
                 .{
                     .base = .{ .frame = dst_mcv.load_frame.index },
@@ -6254,7 +6333,7 @@ fn genByteSwap(
                 },
             );
             try self.asmRegisterMemory(
-                .{ ._, if (have_movbe) .movbe else .mov },
+                .{ ._, if (has_movbe) .movbe else .mov },
                 temp_regs[3].to64(),
                 .{
                     .base = .{ .frame = dst_mcv.load_frame.index },
@@ -6266,7 +6345,7 @@ fn genByteSwap(
                     } },
                 },
             );
-            if (!have_movbe) {
+            if (!has_movbe) {
                 try self.asmRegister(.{ ._, .bswap }, temp_regs[2].to64());
                 try self.asmRegister(.{ ._, .bswap }, temp_regs[3].to64());
             }
@@ -6301,7 +6380,7 @@ fn genByteSwap(
         },
     }
 
-    const dst_mcv: MCValue = if (mem_ok and have_movbe and src_mcv.isRegister())
+    const dst_mcv: MCValue = if (mem_ok and has_movbe and src_mcv.isRegister())
         try self.allocRegOrMem(inst, true)
     else
         .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.gp) };
@@ -6359,8 +6438,8 @@ fn airBitReverse(self: *Self, inst: Air.Inst.Index) !void {
     defer for (dst_locks) |dst_lock| if (dst_lock) |lock|
         self.register_manager.unlockReg(lock);
     const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
-    const tmp_lock = self.register_manager.lockReg(tmp_reg);
-    defer if (tmp_lock) |lock| self.register_manager.unlockReg(lock);
+    const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+    defer self.register_manager.unlockReg(tmp_lock);
 
     const limb_abi_size: u32 = @min(abi_size, 8);
     const tmp = registerAlias(tmp_reg, limb_abi_size);
@@ -8067,8 +8146,8 @@ fn genShiftBinOpMir(
     defer if (rcx_lock) |lock| self.register_manager.unlockReg(lock);
 
     const temp_regs = try self.register_manager.allocRegs(4, .{null} ** 4, abi.RegisterClass.gp);
-    const temp_locks = self.register_manager.lockRegs(4, temp_regs);
-    defer for (temp_locks) |temp_lock| if (temp_lock) |lock| self.register_manager.unlockReg(lock);
+    const temp_locks = self.register_manager.lockRegsAssumeUnused(4, temp_regs);
+    defer for (temp_locks) |lock| self.register_manager.unlockReg(lock);
 
     switch (tag[0]) {
         ._l => {
@@ -8857,9 +8936,8 @@ fn genMulDivBinOp(
 
         const temp_regs =
            try self.register_manager.allocRegs(4, .{null} ** 4, abi.RegisterClass.gp);
-        const temp_locks = self.register_manager.lockRegs(4, temp_regs);
-        defer for (temp_locks) |temp_lock| if (temp_lock) |lock|
-            self.register_manager.unlockReg(lock);
+        const temp_locks = self.register_manager.lockRegsAssumeUnused(4, temp_regs);
+        defer for (temp_locks) |lock| self.register_manager.unlockReg(lock);
 
         try self.asmRegisterRegister(.{ ._, .xor }, temp_regs[0].to32(), temp_regs[0].to32());
 
@@ -9543,9 +9621,8 @@ fn genBinOp(
                 .{null} ** 2,
                 abi.RegisterClass.gp,
             );
-            const dst_regs_locks = self.register_manager.lockRegs(2, dst_regs);
-            defer for (dst_regs_locks) |dst_lock| if (dst_lock) |lock|
-                self.register_manager.unlockReg(lock);
+            const dst_regs_locks = self.register_manager.lockRegsAssumeUnused(2, dst_regs);
+            defer for (dst_regs_locks) |lock| self.register_manager.unlockReg(lock);
 
             try self.genCopy(lhs_ty, .{ .register_pair = dst_regs }, dst_mcv, .{});
             break :dst dst_regs;