x86_64: implement integer @reduce(.Mul)

This commit is contained in:
Jacob Young 2025-05-21 12:27:24 -04:00
parent 1f6f8b0ffe
commit 7bfdb7f26d
6 changed files with 3830 additions and 141 deletions

File diff suppressed because it is too large. (Load diff)

View file

@@ -336,7 +336,7 @@ pub const Mnemonic = enum {
fcom, fcomi, fcomip, fcomp, fcompp, fcos,
fdecstp, fdiv, fdivp, fdivr, fdivrp, ffree,
fiadd, ficom, ficomp, fidiv, fidivr, fild, fimul, fincstp, finit,
fist, fistp, fisttp, fisub, fisubr,
fist, fistp, fisub, fisubr,
fld, fld1, fldcw, fldenv, fldl2e, fldl2t, fldlg2, fldln2, fldpi, fldz,
fmul, fmulp,
fnclex, fninit, fnop, fnsave, fnstcw, fnstenv, fnstsw,
@@ -349,19 +349,18 @@
// MMX
emms, movd, movq,
packssdw, packsswb, packuswb,
paddb, paddd, paddq, paddsb, paddsw, paddusb, paddusw, paddw,
paddb, paddd, paddsb, paddsw, paddusb, paddusw, paddw,
pand, pandn, por, pxor,
pcmpeqb, pcmpeqd, pcmpeqw,
pcmpgtb, pcmpgtd, pcmpgtw,
pmulhw, pmullw,
pmaddwd, pmulhw, pmullw,
pslld, psllq, psllw,
psrad, psraw,
psrld, psrlq, psrlw,
psubb, psubd, psubq, psubsb, psubsw, psubusb, psubusw, psubw,
psubb, psubd, psubsb, psubsw, psubusb, psubusw, psubw,
// SSE
addps, addss,
andps,
andnps,
andnps, andps,
cmpps, cmpss, comiss,
cvtpi2ps, cvtps2pi, cvtsi2ss, cvtss2si, cvttps2pi, cvttss2si,
divps, divss,
@@ -374,9 +373,11 @@
movss, movups,
mulps, mulss,
orps,
pavgb, pavgw,
pextrw, pinsrw,
pmaxsw, pmaxub, pminsw, pminub, pmovmskb,
pmaxsw, pmaxub, pminsw, pminub, pmovmskb, pmulhuw,
prefetchit0, prefetchit1, prefetchnta, prefetcht0, prefetcht1, prefetcht2, prefetchw, prefetchwt1,
psadbw, pshufw,
shufps,
sqrtps, sqrtss,
stmxcsr,
@@ -397,15 +398,16 @@
maxpd, maxsd,
minpd, minsd,
movapd,
movdqa, movdqu,
movdq2q, movdqa, movdqu,
movhpd, movlpd,
movmskpd,
movmskpd, movq2dq,
//movsd,
movupd,
mulpd, mulsd,
orpd,
paddq, pmuludq,
pshufd, pshufhw, pshuflw,
pslldq, psrldq,
pslldq, psrldq, psubq,
punpckhbw, punpckhdq, punpckhqdq, punpckhwd,
punpcklbw, punpckldq, punpcklqdq, punpcklwd,
shufpd,
@@ -414,9 +416,17 @@
ucomisd, unpckhpd, unpcklpd,
xorpd,
// SSE3
addsubpd, addsubps, haddpd, haddps, lddqu, movddup, movshdup, movsldup,
addsubpd, addsubps,
fisttp,
haddpd, haddps,
hsubpd, hsubps,
lddqu,
movddup, movshdup, movsldup,
// SSSE3
pabsb, pabsd, pabsw, palignr, pshufb,
pabsb, pabsd, pabsw, palignr,
phaddw, phaddsw, phaddd, phsubw, phsubsw, phsubd,
pmaddubsw, pmulhrsw, pshufb,
psignb, psignd, psignw,
// SSE4.1
blendpd, blendps, blendvpd, blendvps,
dppd, dpps,
@@ -430,7 +440,7 @@
pmaxsb, pmaxsd, pmaxud, pmaxuw, pminsb, pminsd, pminud, pminuw,
pmovsxbd, pmovsxbq, pmovsxbw, pmovsxdq, pmovsxwd, pmovsxwq,
pmovzxbd, pmovzxbq, pmovzxbw, pmovzxdq, pmovzxwd, pmovzxwq,
pmulld,
pmuldq, pmulld,
ptest,
roundpd, roundps, roundsd, roundss,
// SSE4.2
@@ -458,7 +468,7 @@
vdppd, vdpps,
vextractf128, vextractps,
vgf2p8affineinvqb, vgf2p8affineqb, vgf2p8mulb,
vhaddpd, vhaddps,
vhaddpd, vhaddps, vhsubpd, vhsubps,
vinsertf128, vinsertps,
vlddqu, vldmxcsr,
vmaskmovpd, vmaskmovps,
@@ -480,21 +490,24 @@
vpabsb, vpabsd, vpabsw,
vpackssdw, vpacksswb, vpackusdw, vpackuswb,
vpaddb, vpaddd, vpaddq, vpaddsb, vpaddsw, vpaddusb, vpaddusw, vpaddw,
vpalignr, vpand, vpandn,
vpalignr, vpand, vpandn, vpavgb, vpavgw,
vpblendvb, vpblendw, vpclmulqdq,
vpcmpeqb, vpcmpeqd, vpcmpeqq, vpcmpeqw,
vpcmpgtb, vpcmpgtd, vpcmpgtq, vpcmpgtw,
vphaddw, vphaddsw, vphaddd, vphsubw, vphsubsw, vphsubd,
vperm2f128, vpermilpd, vpermilps,
vpextrb, vpextrd, vpextrq, vpextrw,
vpinsrb, vpinsrd, vpinsrq, vpinsrw,
vpmaxsb, vpmaxsd, vpmaxsw, vpmaxub, vpmaxud, vpmaxuw,
vpminsb, vpminsd, vpminsw, vpminub, vpminud, vpminuw,
vpmaddubsw,
vpmovmskb,
vpmovsxbd, vpmovsxbq, vpmovsxbw, vpmovsxdq, vpmovsxwd, vpmovsxwq,
vpmovzxbd, vpmovzxbq, vpmovzxbw, vpmovzxdq, vpmovzxwd, vpmovzxwq,
vpmulhw, vpmulld, vpmullw,
vpmuldq, vpmulhrsw, vpmulhw, vpmulld, vpmullw, vpmuludq,
vpor,
vpshufb, vpshufd, vpshufhw, vpshuflw,
vpsignb, vpsignd, vpsignw,
vpslld, vpslldq, vpsllq, vpsllw,
vpsrad, vpsraq, vpsraw,
vpsrld, vpsrldq, vpsrlq, vpsrlw,
@@ -779,7 +792,7 @@ pub const Op = enum {
pub fn isImmediate(op: Op) bool {
// zig fmt: off
return switch (op) {
.imm8, .imm16, .imm32, .imm64,
.imm8, .imm16, .imm32, .imm64,
.imm8s, .imm16s, .imm32s,
.rel8, .rel16, .rel32,
.unity,
@@ -986,6 +999,7 @@ pub const Feature = enum {
sse,
sse2,
sse3,
@"sse3 x87",
sse4_1,
sse4_2,
ssse3,

View file

@@ -567,7 +567,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
}
fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
@setEvalBranchQuota(2_500);
@setEvalBranchQuota(2_600);
const fixes = switch (inst.ops) {
.none => inst.data.none.fixes,
.inst => inst.data.inst.fixes,

View file

@@ -386,7 +386,10 @@ pub const Inst = struct {
/// Packed ___ Quadword
p_q,
/// Packed ___ Double Quadword
/// Packed ___ Doubleword to Quadword
p_dq,
/// Packed ___ Unsigned Doubleword to Quadword
p_udq,
/// ___ Aligned Packed Integer Values
_dqa,
/// ___ Unaligned Packed Integer Values
@@ -446,7 +449,10 @@
/// VEX-Encoded Packed ___ Quadword
vp_q,
/// VEX-Encoded Packed ___ Double Quadword
/// VEX-Encoded Packed ___ Doubleword to Quadword
vp_dq,
/// VEX-Encoded Packed ___ Unsigned Doubleword to Quadword
vp_udq,
/// VEX-Encoded ___ Scalar Single-Precision Values
v_ss,
/// VEX-Encoded ___ Packed Single-Precision Values
@@ -663,6 +669,8 @@
/// Multiply scalar single-precision floating-point values
/// Multiply packed double-precision floating-point values
/// Multiply scalar double-precision floating-point values
/// Multiply packed unsigned doubleword integers
/// Multiply packed doubleword integers
mul,
/// Two's complement negation
neg,

View file

@@ -1160,10 +1160,6 @@
.{ .fistp, .m, .{ .m32 }, .{ 0xdb }, 3, .none, .x87 },
.{ .fistp, .m, .{ .m64 }, .{ 0xdf }, 7, .none, .x87 },
.{ .fisttp, .m, .{ .m16 }, .{ 0xdf }, 1, .none, .x87 },
.{ .fisttp, .m, .{ .m32 }, .{ 0xdb }, 1, .none, .x87 },
.{ .fisttp, .m, .{ .m64 }, .{ 0xdd }, 1, .none, .x87 },
.{ .fld, .m, .{ .m32 }, .{ 0xd9 }, 0, .none, .x87 },
.{ .fld, .m, .{ .m64 }, .{ 0xdd }, 0, .none, .x87 },
.{ .fld, .m, .{ .m80 }, .{ 0xdb }, 5, .none, .x87 },
@@ -1540,6 +1536,8 @@
.{ .pmullw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xd5 }, 0, .none, .sse2 },
.{ .pmuludq, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf4 }, 0, .none, .sse2 },
.{ .por, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xeb }, 0, .none, .sse2 },
.{ .pshufd, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0x66, 0x0f, 0x70 }, 0, .none, .sse2 },
@@ -1618,6 +1616,10 @@
.{ .addsubps, .rm, .{ .xmm, .xmm_m128 }, .{ 0xf2, 0x0f, 0xd0 }, 0, .none, .sse3 },
.{ .fisttp, .m, .{ .m16 }, .{ 0xdf }, 1, .none, .@"sse3 x87" },
.{ .fisttp, .m, .{ .m32 }, .{ 0xdb }, 1, .none, .@"sse3 x87" },
.{ .fisttp, .m, .{ .m64 }, .{ 0xdd }, 1, .none, .@"sse3 x87" },
.{ .haddpd, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x7c }, 0, .none, .sse3 },
.{ .haddps, .rm, .{ .xmm, .xmm_m128 }, .{ 0xf2, 0x0f, 0x7c }, 0, .none, .sse3 },
@@ -1708,6 +1710,8 @@
.{ .pmovzxwq, .rm, .{ .xmm, .xmm_m32 }, .{ 0x66, 0x0f, 0x38, 0x34 }, 0, .none, .sse4_1 },
.{ .pmovzxdq, .rm, .{ .xmm, .xmm_m64 }, .{ 0x66, 0x0f, 0x38, 0x35 }, 0, .none, .sse4_1 },
.{ .pmuldq, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x28 }, 0, .none, .sse4_1 },
.{ .pmulld, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x40 }, 0, .none, .sse4_1 },
.{ .ptest, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x17 }, 0, .none, .sse4_1 },
@@ -2166,12 +2170,16 @@
.{ .vpmovzxwq, .rm, .{ .xmm, .xmm_m32 }, .{ 0x66, 0x0f, 0x38, 0x34 }, 0, .vex_128_wig, .avx },
.{ .vpmovzxdq, .rm, .{ .xmm, .xmm_m64 }, .{ 0x66, 0x0f, 0x38, 0x35 }, 0, .vex_128_wig, .avx },
.{ .vpmuldq, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x28 }, 0, .vex_128_wig, .avx },
.{ .vpmulhw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xe5 }, 0, .vex_128_wig, .avx },
.{ .vpmulld, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_128_wig, .avx },
.{ .vpmullw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xd5 }, 0, .vex_128_wig, .avx },
.{ .vpmuludq, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf4 }, 0, .vex_128_wig, .avx },
.{ .vpor, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xeb }, 0, .vex_128_wig, .avx },
.{ .vpshufb, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_128_wig, .avx },
@@ -2493,12 +2501,16 @@
.{ .vpmovzxwq, .rm, .{ .ymm, .xmm_m64 }, .{ 0x66, 0x0f, 0x38, 0x34 }, 0, .vex_256_wig, .avx2 },
.{ .vpmovzxdq, .rm, .{ .ymm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x35 }, 0, .vex_256_wig, .avx2 },
.{ .vpmuldq, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x28 }, 0, .vex_256_wig, .avx2 },
.{ .vpmulhw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xe5 }, 0, .vex_256_wig, .avx2 },
.{ .vpmulld, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_256_wig, .avx2 },
.{ .vpmullw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xd5 }, 0, .vex_256_wig, .avx2 },
.{ .vpmuludq, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xf4 }, 0, .vex_256_wig, .avx2 },
.{ .vpor, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xeb }, 0, .vex_256_wig, .avx2 },
.{ .vpshufb, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_256_wig, .avx2 },

View file

@@ -4889,6 +4889,14 @@ test reduceAdd {
try test_reduce_add.testIntVectors();
}
inline fn reduceMul(comptime Type: type, rhs: Type) @typeInfo(Type).vector.child {
return @reduce(.Mul, rhs);
}
test reduceMul {
const test_reduce_mul = unary(reduceMul, .{});
try test_reduce_mul.testIntVectors();
}
inline fn splat(comptime Type: type, rhs: Type) Type {
return @splat(rhs[0]);
}