Merge pull request #22913 from jacobly0/x86_64-rewrite

x86_64: rewrite unsafe int vector multiplication
commit d2e70ef84a
Andrew Kelley, 2025-02-17 16:13:22 -08:00, committed by GitHub
5 changed files with 6558 additions and 276 deletions

File diff suppressed because it is too large.


@@ -78,7 +78,7 @@ pub fn findByMnemonic(
),
.x86_64 => false,
},
inline .@"invpcid 64bit", .@"rdpid 64bit" => |tag| switch (target.cpu.arch) {
inline .@"invpcid 64bit", .@"rdpid 64bit", .@"prefetchi 64bit" => |tag| switch (target.cpu.arch) {
else => unreachable,
.x86 => false,
.x86_64 => std.Target.x86.featureSetHas(
@@ -86,6 +86,7 @@ pub fn findByMnemonic(
@field(std.Target.x86.Feature, @tagName(tag)[0 .. @tagName(tag).len - " 64bit".len]),
),
},
+.prefetch => std.Target.x86.featureSetHasAny(target.cpu.features, .{ .sse, .prfchw, .prefetchi, .prefetchwt1 }),
inline else => |tag| has_features: {
comptime var feature_it = std.mem.splitScalar(u8, @tagName(tag), ' ');
comptime var features: []const std.Target.x86.Feature = &.{};
@@ -375,6 +376,7 @@ pub const Mnemonic = enum {
orps,
pextrw, pinsrw,
pmaxsw, pmaxub, pminsw, pminub, pmovmskb,
+prefetchit0, prefetchit1, prefetchnta, prefetcht0, prefetcht1, prefetcht2, prefetchw, prefetchwt1,
shufps,
sqrtps, sqrtss,
stmxcsr,
@@ -459,6 +461,7 @@
vhaddpd, vhaddps,
vinsertf128, vinsertps,
vlddqu, vldmxcsr,
+vmaskmovpd, vmaskmovps,
vmaxpd, vmaxps, vmaxsd, vmaxss,
vminpd, vminps, vminsd, vminss,
vmovapd, vmovaps,
@@ -481,6 +484,7 @@
vpblendvb, vpblendw, vpclmulqdq,
vpcmpeqb, vpcmpeqd, vpcmpeqq, vpcmpeqw,
vpcmpgtb, vpcmpgtd, vpcmpgtq, vpcmpgtw,
+vperm2f128, vpermilpd, vpermilps,
vpextrb, vpextrd, vpextrq, vpextrw,
vpinsrb, vpinsrd, vpinsrq, vpinsrw,
vpmaxsb, vpmaxsd, vpmaxsw, vpmaxub, vpmaxud, vpmaxuw,
@@ -521,6 +525,9 @@
// AVX2
vbroadcasti128, vpbroadcastb, vpbroadcastd, vpbroadcastq, vpbroadcastw,
vextracti128, vinserti128, vpblendd,
+vperm2i128, vpermd, vpermpd, vpermps, vpermq,
+vpmaskmovd, vpmaskmovq,
+vpsllvd, vpsllvq, vpsravd, vpsrlvd, vpsrlvq,
// ADX
adcx, adox,
// AESKLE
@@ -557,8 +564,7 @@ pub const Op = enum {
r32_m8, r32_m16, r64_m16,
m8, m16, m32, m64, m80, m128, m256,
rel8, rel16, rel32,
-m,
-moffs,
+m, moffs, mrip8,
sreg,
st0, st, mm, mm_m64,
xmm0, xmm, xmm_m8, xmm_m16, xmm_m32, xmm_m64, xmm_m128,
@@ -612,7 +618,7 @@ pub const Op = enum {
.mem => |mem| switch (mem) {
.moffs => .moffs,
-.sib, .rip => switch (mem.bitSize(target)) {
+.sib => switch (mem.bitSize(target)) {
0 => .m,
8 => .m8,
16 => .m16,
@@ -623,6 +629,16 @@
256 => .m256,
else => unreachable,
},
+.rip => switch (mem.bitSize(target)) {
+0, 8 => .mrip8,
+16 => .m16,
+32 => .m32,
+64 => .m64,
+80 => .m80,
+128 => .m128,
+256 => .m256,
+else => unreachable,
+},
},
.imm => |imm| switch (imm) {
@@ -675,7 +691,7 @@ pub const Op = enum {
pub fn immBitSize(op: Op) u64 {
return switch (op) {
-.none, .moffs, .m, .sreg => unreachable,
+.none, .m, .moffs, .mrip8, .sreg => unreachable,
.al, .cl, .dx, .rip, .eip, .ip, .r8, .rm8, .r32_m8 => unreachable,
.ax, .r16, .rm16 => unreachable,
.eax, .r32, .rm32, .r32_m16 => unreachable,
@@ -695,7 +711,7 @@ pub const Op = enum {
pub fn regBitSize(op: Op) u64 {
return switch (op) {
-.none, .moffs, .m, .sreg => unreachable,
+.none, .m, .moffs, .mrip8, .sreg => unreachable,
.unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable,
.rel8, .rel16, .rel32 => unreachable,
.m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable,
@@ -711,13 +727,13 @@ pub const Op = enum {
pub fn memBitSize(op: Op) u64 {
return switch (op) {
-.none, .moffs, .m, .sreg => unreachable,
+.none, .m, .moffs, .sreg => unreachable,
.unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable,
.rel8, .rel16, .rel32 => unreachable,
.al, .cl, .r8, .ax, .dx, .ip, .r16, .eax, .eip, .r32, .rax, .rip, .r64 => unreachable,
.st0, .st, .mm, .xmm0, .xmm, .ymm => unreachable,
.cr, .dr => unreachable,
-.m8, .rm8, .r32_m8, .xmm_m8 => 8,
+.mrip8, .m8, .rm8, .r32_m8, .xmm_m8 => 8,
.m16, .rm16, .r32_m16, .r64_m16, .xmm_m16 => 16,
.m32, .rm32, .xmm_m32 => 32,
.m64, .rm64, .mm_m64, .xmm_m64 => 64,
@@ -778,7 +794,7 @@ pub const Op = enum {
.rm8, .rm16, .rm32, .rm64,
.r32_m8, .r32_m16, .r64_m16,
.m8, .m16, .m32, .m64, .m80, .m128, .m256,
-.m,
+.m, .moffs, .mrip8,
.mm_m64,
.xmm_m8, .xmm_m16, .xmm_m32, .xmm_m64, .xmm_m128,
.ymm_m256,
@@ -816,11 +832,7 @@ pub const Op = enum {
/// Given an operand `op` checks if `target` is a subset for the purposes of the encoding.
pub fn isSubset(op: Op, target: Op) bool {
switch (op) {
-.moffs, .sreg => return op == target,
-.none => switch (target) {
-.none => return true,
-else => return false,
-},
+.none, .m, .moffs, .sreg => return op == target,
else => {
if (op.isRegister() and target.isRegister()) {
return switch (target.toReg()) {
@@ -831,6 +843,7 @@
if (op.isMemory() and target.isMemory()) {
switch (target) {
.m => return true,
+.moffs, .mrip8 => return op == target,
else => return op.memBitSize() == target.memBitSize(),
}
}
@@ -957,6 +970,10 @@ pub const Feature = enum {
@"pclmul avx",
pku,
popcnt,
+prefetch,
+@"prefetchi 64bit",
+prefetchwt1,
+prfchw,
rdrnd,
rdseed,
@"rdpid 32bit",
@@ -997,7 +1014,7 @@ fn estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Op
}
const mnemonic_to_encodings_map = init: {
-@setEvalBranchQuota(5_600);
+@setEvalBranchQuota(5_800);
const mnemonic_count = @typeInfo(Mnemonic).@"enum".fields.len;
var mnemonic_map: [mnemonic_count][]Data = @splat(&.{});
const encodings = @import("encodings.zig");
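
A note on the new .mrip8 operand class above: prefetchit0/prefetchit1 only take effect with RIP-relative addressing in 64-bit mode (hence the @"prefetchi 64bit" feature gate), so byte-sized RIP-relative memory has to be classified separately from plain .m8, and the isSubset change makes .mrip8 match only itself. A minimal sketch of the classification rule, as a hypothetical standalone mirror rather than the compiler's actual API:

const std = @import("std");

// Hypothetical mirror of the Op classification: byte-sized RIP-relative
// memory becomes .mrip8, while SIB-addressed byte memory stays .m8.
const MiniOp = enum { m8, mrip8 };
const MiniAddressing = enum { sib, rip };

fn classifyByteMem(addr: MiniAddressing) MiniOp {
    return switch (addr) {
        .sib => .m8,
        .rip => .mrip8,
    };
}

test classifyByteMem {
    // Only a RIP-relative byte operand can select a prefetchit0/prefetchit1 encoding.
    try std.testing.expectEqual(MiniOp.mrip8, classifyByteMem(.rip));
    try std.testing.expectEqual(MiniOp.m8, classifyByteMem(.sib));
}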


@@ -34,8 +34,18 @@ pub const Inst = struct {
/// ___ 4
_4,
/// ___ With NTA Hint
_nta,
/// System Call ___
sys_,
+/// ___ With T0 Hint
+_t0,
+/// ___ With T1 Hint
+_t1,
+/// ___ With T2 Hint
+_t2,
+/// ___ With Intent to Write and T1 Hint
+_wt1,
/// ___ crement Shadow Stack Pointer Doubleword
_csspd,
@@ -198,6 +208,7 @@
//_b,
/// ___ Word
/// ___ For Writing
+/// ___ With Intent to Write
_w,
/// ___ Doubleword
//_d,
@@ -756,6 +767,8 @@
/// Swap GS base register
swapgs,
/// Test condition
+/// Logical compare
+/// Packed bit test
@"test",
/// Undefined instruction
ud,
@@ -973,6 +986,9 @@
/// Move unaligned packed single-precision floating-point values
/// Move unaligned packed double-precision floating-point values
movu,
+/// Prefetch data into caches
+/// Prefetch data into caches with intent to write
+prefetch,
/// Packed interleave shuffle of quadruplets of single-precision floating-point values
/// Packed interleave shuffle of pairs of double-precision floating-point values
/// Shuffle packed doublewords
@@ -1053,6 +1069,7 @@
/// Blend scalar single-precision floating-point values
/// Blend packed double-precision floating-point values
/// Blend scalar double-precision floating-point values
+/// Blend packed dwords
blend,
/// Variable blend packed single-precision floating-point values
/// Variable blend scalar single-precision floating-point values
@@ -1127,20 +1144,37 @@
sha256rnds,
// AVX
+/// Load with broadcast floating-point data
+/// Load integer and broadcast
+broadcast,
+/// Conditional SIMD packed loads and stores
+/// Conditional SIMD integer packed loads and stores
+maskmov,
+/// Permute floating-point values
+/// Permute integer values
+perm2,
+/// Permute in-lane pairs of double-precision floating-point values
+/// Permute in-lane quadruples of single-precision floating-point values
+permil,
// BMI
/// Bit field extract
bextr,
/// Extract lowest set isolated bit
/// Get mask up to lowest set bit
/// Reset lowest set bit
bls,
-/// Load with broadcast floating-point data
-/// Load integer and broadcast
-broadcast,
-/// Zero high bits starting with specified bit position
-bzhi,
/// Count the number of trailing zero bits
tzcnt,
// BMI2
+/// Zero high bits starting with specified bit position
+bzhi,
/// Parallel bits deposit
pdep,
/// Parallel bits extract
pext,
// F16C
/// Convert 16-bit floating-point values to single-precision floating-point values
cvtph2,
@@ -1164,6 +1198,19 @@
/// Fused multiply-add of scalar double-precision floating-point values
fmadd231,
// AVX2
+/// Permute packed doubleword elements
+/// Permute packed qword elements
+/// Permute double-precision floating-point elements
+/// Permute single-precision floating-point elements
+perm,
+/// Variable bit shift left logical
+sllv,
+/// Variable bit shift right arithmetic
+srav,
+/// Variable bit shift right logical
+srlv,
// ADX
/// Unsigned integer addition of two operands with overflow flag
ado,
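
In the Fixes doc comments above, ___ marks where the base mnemonic tag is spliced into the fix pattern, so ._t0 around a prefetch tag names prefetcht0 and .sys_ around call names syscall. A rough sketch of that splicing (my illustration; applyFix is hypothetical, not the backend's real helper):

const std = @import("std");

// Splice a tag name into a fix pattern at the '_' placeholder.
fn applyFix(comptime fix: []const u8, comptime tag: []const u8) []const u8 {
    const i = comptime std.mem.indexOfScalar(u8, fix, '_').?;
    return fix[0..i] ++ tag ++ fix[i + 1 ..];
}

test applyFix {
    try std.testing.expectEqualStrings("prefetcht0", applyFix("_t0", "prefetch"));
    try std.testing.expectEqualStrings("syscall", applyFix("sys_", "call"));
}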


@@ -1370,6 +1370,18 @@ pub const table = [_]Entry{
.{ .pmovmskb, .rm, &.{ .r32, .xmm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .none, .sse },
.{ .pmovmskb, .rm, &.{ .r64, .xmm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .none, .sse },
+.{ .prefetchit0, .m, &.{ .mrip8 }, &.{ 0x0f, 0x18 }, 7, .none, .@"prefetchi 64bit" },
+.{ .prefetchit1, .m, &.{ .mrip8 }, &.{ 0x0f, 0x18 }, 6, .none, .@"prefetchi 64bit" },
+.{ .prefetchnta, .m, &.{ .m8 }, &.{ 0x0f, 0x18 }, 0, .none, .prefetch },
+.{ .prefetcht0, .m, &.{ .m8 }, &.{ 0x0f, 0x18 }, 1, .none, .prefetch },
+.{ .prefetcht1, .m, &.{ .m8 }, &.{ 0x0f, 0x18 }, 2, .none, .prefetch },
+.{ .prefetcht2, .m, &.{ .m8 }, &.{ 0x0f, 0x18 }, 3, .none, .prefetch },
+.{ .prefetchw, .m, &.{ .m8 }, &.{ 0x0f, 0x0d }, 1, .none, .prfchw },
+.{ .prefetchwt1, .m, &.{ .m8 }, &.{ 0x0f, 0x0d }, 2, .none, .prefetchwt1 },
.{ .shufps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x0f, 0xc6 }, 0, .none, .sse },
.{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .none, .sse },
@@ -1932,6 +1944,15 @@ pub const table = [_]Entry{
.{ .vldmxcsr, .m, &.{ .m32 }, &.{ 0x0f, 0xae }, 2, .vex_lz_wig, .avx },
+.{ .vmaskmovps, .rvm, &.{ .xmm, .xmm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x2c }, 0, .vex_128_w0, .avx },
+.{ .vmaskmovps, .rvm, &.{ .ymm, .ymm, .m256 }, &.{ 0x66, 0x0f, 0x38, 0x2c }, 0, .vex_256_w0, .avx },
+.{ .vmaskmovpd, .rvm, &.{ .xmm, .xmm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x2d }, 0, .vex_128_w0, .avx },
+.{ .vmaskmovpd, .rvm, &.{ .ymm, .ymm, .m256 }, &.{ 0x66, 0x0f, 0x38, 0x2d }, 0, .vex_256_w0, .avx },
+.{ .vmaskmovps, .mvr, &.{ .m128, .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x2e }, 0, .vex_128_w0, .avx },
+.{ .vmaskmovps, .mvr, &.{ .m256, .ymm, .ymm }, &.{ 0x66, 0x0f, 0x38, 0x2e }, 0, .vex_256_w0, .avx },
+.{ .vmaskmovpd, .mvr, &.{ .m128, .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x2f }, 0, .vex_128_w0, .avx },
+.{ .vmaskmovpd, .mvr, &.{ .m256, .ymm, .ymm }, &.{ 0x66, 0x0f, 0x38, 0x2f }, 0, .vex_256_w0, .avx },
.{ .vmaxpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_128_wig, .avx },
.{ .vmaxpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_256_wig, .avx },
@@ -2097,6 +2118,18 @@ pub const table = [_]Entry{
.{ .vpcmpgtq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x37 }, 0, .vex_128_wig, .avx },
+.{ .vperm2f128, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x06 }, 0, .vex_256_w0, .avx },
+.{ .vpermilpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x0d }, 0, .vex_128_w0, .avx },
+.{ .vpermilpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x0d }, 0, .vex_256_w0, .avx },
+.{ .vpermilpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x05 }, 0, .vex_128_w0, .avx },
+.{ .vpermilpd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x05 }, 0, .vex_256_w0, .avx },
+.{ .vpermilps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x0c }, 0, .vex_128_w0, .avx },
+.{ .vpermilps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x04 }, 0, .vex_128_w0, .avx },
+.{ .vpermilps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x0c }, 0, .vex_256_w0, .avx },
+.{ .vpermilps, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x04 }, 0, .vex_256_w0, .avx },
.{ .vpextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .vex_128_w0, .avx },
.{ .vpextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w0, .avx },
.{ .vpextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w1, .avx },
@@ -2418,6 +2451,25 @@ pub const table = [_]Entry{
.{ .vpcmpgtq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x37 }, 0, .vex_256_wig, .avx2 },
+.{ .vperm2i128, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x46 }, 0, .vex_256_w0, .avx2 },
+.{ .vpermd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x36 }, 0, .vex_256_w0, .avx2 },
+.{ .vpermpd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x01 }, 0, .vex_256_w1, .avx2 },
+.{ .vpermps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x16 }, 0, .vex_256_w0, .avx2 },
+.{ .vpermq, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x00 }, 0, .vex_256_w1, .avx2 },
+.{ .vpmaskmovd, .rvm, &.{ .xmm, .xmm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_128_w0, .avx2 },
+.{ .vpmaskmovd, .rvm, &.{ .ymm, .ymm, .m256 }, &.{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_256_w0, .avx2 },
+.{ .vpmaskmovq, .rvm, &.{ .xmm, .xmm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_128_w1, .avx2 },
+.{ .vpmaskmovq, .rvm, &.{ .ymm, .ymm, .m256 }, &.{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_256_w1, .avx2 },
+.{ .vpmaskmovd, .mvr, &.{ .m128, .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x8e }, 0, .vex_128_w0, .avx2 },
+.{ .vpmaskmovd, .mvr, &.{ .m256, .ymm, .ymm }, &.{ 0x66, 0x0f, 0x38, 0x8e }, 0, .vex_256_w0, .avx2 },
+.{ .vpmaskmovq, .mvr, &.{ .m128, .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x8e }, 0, .vex_128_w1, .avx2 },
+.{ .vpmaskmovq, .mvr, &.{ .m256, .ymm, .ymm }, &.{ 0x66, 0x0f, 0x38, 0x8e }, 0, .vex_256_w1, .avx2 },
.{ .vpmaxsb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3c }, 0, .vex_256_wig, .avx2 },
.{ .vpmaxsw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xee }, 0, .vex_256_wig, .avx2 },
.{ .vpmaxsd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3d }, 0, .vex_256_wig, .avx2 },
@@ -2477,11 +2529,19 @@ pub const table = [_]Entry{
.{ .vpslldq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 7, .vex_256_wig, .avx2 },
+.{ .vpsllvd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x47 }, 0, .vex_128_w0, .avx2 },
+.{ .vpsllvq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x47 }, 0, .vex_128_w1, .avx2 },
+.{ .vpsllvd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x47 }, 0, .vex_256_w0, .avx2 },
+.{ .vpsllvq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x47 }, 0, .vex_256_w1, .avx2 },
.{ .vpsraw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe1 }, 0, .vex_256_wig, .avx2 },
.{ .vpsraw, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 4, .vex_256_wig, .avx2 },
.{ .vpsrad, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe2 }, 0, .vex_256_wig, .avx2 },
.{ .vpsrad, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x72 }, 4, .vex_256_wig, .avx2 },
+.{ .vpsravd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x46 }, 0, .vex_128_w0, .avx2 },
+.{ .vpsravd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x46 }, 0, .vex_256_w0, .avx2 },
.{ .vpsrlw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_256_wig, .avx2 },
.{ .vpsrlw, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_256_wig, .avx2 },
.{ .vpsrld, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_256_wig, .avx2 },
@@ -2489,7 +2549,12 @@ pub const table = [_]Entry{
.{ .vpsrlq, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .vex_256_wig, .avx2 },
.{ .vpsrlq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .vex_256_wig, .avx2 },
-.{ .vpsrldq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 3, .vex_128_wig, .avx2 },
+.{ .vpsrldq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 3, .vex_256_wig, .avx2 },
+.{ .vpsrlvd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x45 }, 0, .vex_128_w0, .avx2 },
+.{ .vpsrlvq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x45 }, 0, .vex_128_w1, .avx2 },
+.{ .vpsrlvd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x45 }, 0, .vex_256_w0, .avx2 },
+.{ .vpsrlvq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x45 }, 0, .vex_256_w1, .avx2 },
.{ .vpsubb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xf8 }, 0, .vex_256_wig, .avx2 },
.{ .vpsubw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xf9 }, 0, .vex_256_wig, .avx2 },
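
For reading these table rows: each entry lists mnemonic, encoding form, operand classes, opcode bytes, the ModRM.reg opcode extension, a prefix/VEX class, and the required CPU feature. A worked example (my annotation; field meanings inferred from context):

// .{ .prefetcht0, .m, &.{ .m8 }, &.{ 0x0f, 0x18 }, 1, .none, .prefetch },
//     mnemonic   form   operand    opcode bytes   /r   prefix  feature
// The trailing 1 is Intel's "/1", stored in ModRM.reg, so
//     prefetcht0 byte ptr [rax]
// encodes as 0x0f, 0x18, 0x08 — ModRM 0x08 packs mod=0b00, reg=0b001, rm=0b000 (rax).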


@@ -55,6 +55,17 @@ fn DoubleBits(comptime Type: type) type {
.vector => |vector| @Vector(vector.len, ResultScalar),
};
}
+fn RoundBitsUp(comptime Type: type, comptime multiple: u16) type {
+const ResultScalar = switch (@typeInfo(Scalar(Type))) {
+.int => |int| @Type(.{ .int = .{ .signedness = int.signedness, .bits = std.mem.alignForward(u16, int.bits, multiple) } }),
+.float => Scalar(Type),
+else => @compileError(@typeName(Type)),
+};
+return switch (@typeInfo(Type)) {
+else => ResultScalar,
+.vector => |vector| @Vector(vector.len, ResultScalar),
+};
+}
// inline to avoid a runtime `@splat`
inline fn splat(comptime Type: type, scalar: Scalar(Type)) Type {
return switch (@typeInfo(Type)) {
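
A usage sketch for RoundBitsUp above (my test, not part of the diff): integer widths round up to the next multiple, floats pass through unchanged, and vectors map elementwise.

test RoundBitsUp {
    comptime std.debug.assert(RoundBitsUp(u12, 8) == u16);
    comptime std.debug.assert(RoundBitsUp(i3, 8) == i8);
    comptime std.debug.assert(RoundBitsUp(f32, 8) == f32);
    comptime std.debug.assert(RoundBitsUp(@Vector(4, u5), 8) == @Vector(4, u8));
}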
@@ -17962,6 +17973,78 @@ fn binary(comptime op: anytype, comptime opts: struct { compare: Compare = .rela
try testArgs(f128, nan(f128), nan(f128));
}
fn testIntVectors() !void {
+try testArgs(@Vector(1, i4), .{
+0x1,
+}, .{
+0x3,
+});
+try testArgs(@Vector(2, i4), .{
+-0x1, 0x7,
+}, .{
+-0x7, 0x6,
+});
+try testArgs(@Vector(4, i4), .{
+-0x1, 0x2, -0x3, -0x6,
+}, .{
+-0x2, -0x6, -0x4, 0x1,
+});
+try testArgs(@Vector(8, i4), .{
+-0x4, 0x6, -0x4, -0x1, -0x1, 0x6, 0x5, 0x2,
+}, .{
+0x2, 0x4, -0x3, -0x6, 0x1, -0x5, -0x1, 0x2,
+});
+// workaround https://github.com/ziglang/zig/issues/22914
+// TODO: try testArgs(@Vector(16, i4), .{
+// 0x4, 0x1, -0x7, -0x2, -0x7, 0x4, -0x4, -0x8, -0x1, 0x0, -0x8, 0x5, -0x5, 0x3, 0x3, 0x2,
+// }, .{
+// 0x7, -0x7, -0x6, -0x1, 0x3, -0x5, -0x3, -0x6, 0x4, 0x4, -0x2, 0x7, -0x2, 0x6, -0x4, -0x1,
+// });
+try testArgs(@Vector(16, i4), .{
+0x7, -0x7, -0x6, -0x1, 0x3, -0x5, -0x3, -0x6, 0x4, 0x4, -0x2, 0x7, -0x2, 0x6, -0x4, -0x1,
+}, .{
+0x4, 0x1, -0x7, -0x2, -0x7, 0x4, -0x4, -0x8, -0x1, 0x1, -0x8, 0x5, -0x5, 0x3, 0x3, 0x2,
+});
+try testArgs(@Vector(32, i4), .{
+0x0, 0x4, 0x0, -0x6, -0x7, 0x4, -0x3, 0x4, -0x5, 0x2, 0x3, 0x2, -0x6, -0x4, -0x4, -0x3,
+0x7, -0x5, -0x3, 0x2, -0x4, 0x4, -0x1, 0x6, -0x7, -0x1, -0x6, -0x2, -0x4, -0x2, 0x5, 0x0,
+}, .{
+0x5, 0x1, 0x5, 0x7, 0x1, -0x3, 0x3, 0x3, 0x5, 0x4, 0x1, 0x5, 0x4, -0x8, -0x3, -0x6,
+-0x2, 0x3, 0x1, 0x2, 0x4, 0x4, -0x8, 0x2, 0x6, -0x1, 0x1, 0x3, -0x1, -0x3, 0x7, -0x7,
+});
+try testArgs(@Vector(1, u4), .{
+0xe,
+}, .{
+0xc,
+});
+try testArgs(@Vector(2, u4), .{
+0x2, 0x5,
+}, .{
+0x9, 0xe,
+});
+try testArgs(@Vector(4, u4), .{
+0x2, 0xb, 0xc, 0x7,
+}, .{
+0x2, 0xa, 0x8, 0x1,
+});
+try testArgs(@Vector(8, u4), .{
+0xf, 0x9, 0x0, 0x6, 0x8, 0x7, 0xd, 0x7,
+}, .{
+0xb, 0xb, 0x3, 0x6, 0x1, 0x5, 0x4, 0xd,
+});
+try testArgs(@Vector(16, u4), .{
+0x5, 0x1, 0xa, 0x6, 0xb, 0x3, 0x0, 0x7, 0x8, 0x0, 0x9, 0xe, 0x2, 0x9, 0x2, 0x5,
+}, .{
+0x4, 0x9, 0x4, 0x8, 0x5, 0x7, 0xf, 0x8, 0x3, 0xc, 0x6, 0x9, 0xd, 0xd, 0x2, 0xd,
+});
+try testArgs(@Vector(32, u4), .{
+0xa, 0x5, 0xd, 0x4, 0xe, 0xf, 0xf, 0x2, 0xb, 0x3, 0x9, 0x2, 0x1, 0x9, 0x6, 0x8,
+0x7, 0xc, 0x3, 0x5, 0x4, 0xb, 0x5, 0x4, 0x8, 0x2, 0x5, 0x9, 0xf, 0x6, 0x7, 0x7,
+}, .{
+0xb, 0xf, 0xf, 0xf, 0xb, 0xf, 0xd, 0xc, 0x1, 0xa, 0x1, 0xd, 0x7, 0x4, 0x4, 0x8,
+0x2, 0xb, 0xb, 0x4, 0xa, 0x7, 0x6, 0xd, 0xb, 0xb, 0x6, 0xb, 0x1, 0x8, 0xa, 0x6,
+});
try testArgs(@Vector(1, i8), .{
-0x54,
}, .{
@@ -19013,6 +19096,7 @@ inline fn mulUnsafe(comptime Type: type, lhs: Type, rhs: Type) DoubleBits(Type)
test mulUnsafe {
const test_mul_unsafe = binary(mulUnsafe, .{});
try test_mul_unsafe.testInts();
+try test_mul_unsafe.testIntVectors();
}
inline fn multiply(comptime Type: type, lhs: Type, rhs: Type) @TypeOf(lhs * rhs) {
@@ -19189,6 +19273,14 @@ test clz {
try test_clz.testIntVectors();
}
+inline fn byteSwap(comptime Type: type, rhs: Type) RoundBitsUp(Type, 8) {
+return @byteSwap(@as(RoundBitsUp(Type, 8), rhs));
+}
+test byteSwap {
+const test_byte_swap = unary(byteSwap, .{});
+try test_byte_swap.testInts();
+}
inline fn sqrt(comptime Type: type, rhs: Type) @TypeOf(@sqrt(rhs)) {
return @sqrt(rhs);
}
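
The widening in byteSwap above is needed because @byteSwap only accepts bit widths that are a multiple of 8, which RoundBitsUp(Type, 8) guarantees. A worked example (my arithmetic, not part of the diff):

comptime {
    // u12 widens to RoundBitsUp(u12, 8) = u16, so
    // byteSwap(u12, 0x123) == @byteSwap(@as(u16, 0x0123)) == 0x2301.
    std.debug.assert(byteSwap(u12, 0x123) == 0x2301);
}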