new builtins: @workItemId, @workGroupId, @workGroupSize

* @workItemId returns the index of the work item in a work group for a
  dimension.
* @workGroupId returns the index of the work group in the kernel dispatch for a
  dimension.
* @workGroupSize returns the size of the work group for a dimension.

These builtins are mainly useful for GPU backends. They are currently only
implemented for the AMDGCN LLVM backend.
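
For illustration, a kernel can combine all three builtins to recover its global index. A minimal sketch, assuming an amdgcn target and that callconv(.Kernel) is available there; the function name and signature are hypothetical:

export fn vecAdd(a: [*]const f32, b: [*]const f32, out: [*]f32) callconv(.Kernel) void {
    // Global linear index in dimension 0: work group base plus local offset.
    const i = @workGroupId(0) * @workGroupSize(0) + @workItemId(0);
    out[i] = a[i] + b[i];
}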
Author:    Robin Voetter
Date:      2023-03-18 15:59:56 +01:00
Committer: Veikka Tuominen
Parent:    83051b0cbf
Commit:    3357c59ceb

17 changed files with 269 additions and 0 deletions


@@ -9578,6 +9578,28 @@ fn foo(comptime T: type, ptr: *T) T {
Remove {#syntax#}volatile{#endsyntax#} qualifier from a pointer.
</p>
{#header_close#}
{#header_open|@workGroupId#}
<pre>{#syntax#}@workGroupId(comptime dimension: u32) u32{#endsyntax#}</pre>
<p>
Returns the index of the work group in the current kernel invocation in dimension {#syntax#}dimension{#endsyntax#}.
</p>
{#header_close#}
{#header_open|@workGroupSize#}
<pre>{#syntax#}@workGroupSize(comptime dimension: u32) u32{#endsyntax#}</pre>
<p>
Returns the number of work items that a work group has in dimension {#syntax#}dimension{#endsyntax#}.
</p>
{#header_close#}
{#header_open|@workItemId#}
<pre>{#syntax#}@workItemId(comptime dimension: u32) u32{#endsyntax#}</pre>
<p>
Returns the index of the work item in the work group in dimension {#syntax#}dimension{#endsyntax#}. This function returns values between {#syntax#}0{#endsyntax#} (inclusive) and {#syntax#}@workGroupSize(dimension){#endsyntax#} (exclusive).
</p>
{#header_close#}
{#header_close#}
{#header_open|Build Mode#}
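
The documented ranges combine into a simple invariant. Illustrative only, not part of this commit; assumes a context where the builtins are legal (a function body on a GPU target):

const std = @import("std");

fn checkWorkDimensionInvariant() void {
    // Per the docs above: for every dimension d, 0 <= @workItemId(d) < @workGroupSize(d).
    inline for (0..3) |d| {
        std.debug.assert(@workItemId(d) < @workGroupSize(d));
    }
}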


@@ -761,6 +761,22 @@ pub const Inst = struct {
/// Uses the `ty` field.
c_va_start,
/// Implements @workItemId builtin.
/// Result type is always `u32`
/// Uses the `pl_op` field, payload is the dimension to get the work item id for.
/// Operand is unused and set to Ref.none
work_item_id,
/// Implements @workGroupSize builtin.
/// Result type is always `u32`
/// Uses the `pl_op` field, payload is the dimension to get the work group size for.
/// Operand is unused and set to Ref.none
work_group_size,
/// Implements @workGroupId builtin.
/// Result type is always `u32`
/// Uses the `pl_op` field, payload is the dimension to get the work group id for.
/// Operand is unused and set to Ref.none
work_group_id,
pub fn fromCmpOp(op: std.math.CompareOperator, optimized: bool) Tag {
switch (op) {
.lt => return if (optimized) .cmp_lt_optimized else .cmp_lt,
@@ -1267,6 +1283,11 @@ pub fn typeOfIndex(air: Air, inst: Air.Inst.Index) Type {
const err_union_ty = air.typeOf(datas[inst].pl_op.operand);
return err_union_ty.errorUnionPayload();
},
.work_item_id,
.work_group_size,
.work_group_id,
=> return Type.u32,
}
}
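
Given this encoding, reading the instruction back is uniform: the dimension comes straight out of the payload and the operand can be ignored. A sketch of the consumer side, mirroring what the LLVM backend does further down:

const pl_op = air.instructions.items(.data)[inst].pl_op;
const dimension: u32 = pl_op.payload; // operand is unused and always .none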


@@ -8549,6 +8549,40 @@ fn builtinCall(
}
return rvalue(gz, ri, try gz.addNodeExtended(.c_va_start, node), node);
},
.work_item_id => {
if (astgen.fn_block == null) {
return astgen.failNode(node, "'@workItemId' outside function scope", .{});
}
const operand = try comptimeExpr(gz, scope, .{ .rl = .{ .coerced_ty = .u32_type } }, params[0]);
const result = try gz.addExtendedPayload(.work_item_id, Zir.Inst.UnNode{
.node = gz.nodeIndexToRelative(node),
.operand = operand,
});
return rvalue(gz, ri, result, node);
},
.work_group_size => {
if (astgen.fn_block == null) {
return astgen.failNode(node, "'@workGroupSize' outside function scope", .{});
}
const operand = try comptimeExpr(gz, scope, .{ .rl = .{ .coerced_ty = .u32_type } }, params[0]);
const result = try gz.addExtendedPayload(.work_group_size, Zir.Inst.UnNode{
.node = gz.nodeIndexToRelative(node),
.operand = operand,
});
return rvalue(gz, ri, result, node);
},
.work_group_id => {
if (astgen.fn_block == null) {
return astgen.failNode(node, "'@workGroupId' outside function scope", .{});
}
const operand = try comptimeExpr(gz, scope, .{ .rl = .{ .coerced_ty = .u32_type } }, params[0]);
const result = try gz.addExtendedPayload(.work_group_id, Zir.Inst.UnNode{
.node = gz.nodeIndexToRelative(node),
.operand = operand,
});
return rvalue(gz, ri, result, node);
},
}
}
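
The three branches above differ only in the ZIR tag and the name in the error message. A hypothetical consolidation, not part of this commit, could look like:

fn workDimensionBuiltin(
    gz: *GenZir,
    scope: *Scope,
    ri: ResultInfo,
    node: Ast.Node.Index,
    param: Ast.Node.Index,
    tag: Zir.Inst.Extended,
    name: []const u8,
) InnerError!Zir.Inst.Ref {
    const astgen = gz.astgen;
    if (astgen.fn_block == null) {
        return astgen.failNode(node, "'{s}' outside function scope", .{name});
    }
    // Identical lowering to the branches above: one comptime u32 dimension.
    const operand = try comptimeExpr(gz, scope, .{ .rl = .{ .coerced_ty = .u32_type } }, param);
    const result = try gz.addExtendedPayload(tag, Zir.Inst.UnNode{
        .node = gz.nodeIndexToRelative(node),
        .operand = operand,
    });
    return rvalue(gz, ri, result, node);
}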


@@ -118,6 +118,9 @@ pub const Tag = enum {
union_init,
Vector,
volatile_cast,
work_item_id,
work_group_size,
work_group_id,
};
pub const MemLocRequirement = enum {
@@ -980,5 +983,25 @@ pub const list = list: {
.param_count = 1,
},
},
.{
"@workItemId", .{
.tag = .work_item_id,
.param_count = 1,
},
},
.{
"@workGroupSize",
.{
.tag = .work_group_size,
.param_count = 1,
},
},
.{
"@workGroupId",
.{
.tag = .work_group_id,
.param_count = 1,
},
},
});
};
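
Each entry ties the builtin's source name to its AstGen tag and its arity of exactly one argument (the dimension). A hedged lookup sketch, assuming list is the usual comptime string map:

const info = BuiltinFn.list.get("@workGroupSize").?;
// Each of the three new builtins takes a single comptime dimension argument.
std.debug.assert(info.tag == .work_group_size);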


@@ -240,6 +240,9 @@ pub fn categorizeOperand(
.err_return_trace,
.save_err_return_trace_index,
.c_va_start,
.work_item_id,
.work_group_size,
.work_group_id,
=> return .none,
.fence => return .write,
@@ -864,6 +867,9 @@ fn analyzeInst(
.err_return_trace,
.save_err_return_trace_index,
.c_va_start,
.work_item_id,
.work_group_size,
.work_group_id,
=> return trackOperands(a, new_set, inst, main_tomb, .{ .none, .none, .none }),
.not,


@@ -1164,6 +1164,9 @@ fn analyzeBodyInner(
.c_va_start => try sema.zirCVaStart(block, extended),
.const_cast => try sema.zirConstCast(block, extended),
.volatile_cast => try sema.zirVolatileCast(block, extended),
.work_item_id => try sema.zirWorkItem(block, extended, extended.opcode),
.work_group_size => try sema.zirWorkItem(block, extended, extended.opcode),
.work_group_id => try sema.zirWorkItem(block, extended, extended.opcode),
// zig fmt: on
.fence => {
@@ -22437,6 +22440,42 @@ fn zirBuiltinExtern(
return sema.addConstant(ty, ref);
}
fn zirWorkItem(
sema: *Sema,
block: *Block,
extended: Zir.Inst.Extended.InstData,
zir_tag: Zir.Inst.Extended,
) CompileError!Air.Inst.Ref {
const extra = sema.code.extraData(Zir.Inst.UnNode, extended.operand).data;
const dimension_src: LazySrcLoc = .{ .node_offset_builtin_call_arg0 = extra.node };
const builtin_src = LazySrcLoc.nodeOffset(extra.node);
const target = sema.mod.getTarget();
switch (target.cpu.arch) {
// TODO: Allow for other GPU targets.
.amdgcn => {},
else => {
return sema.fail(block, builtin_src, "builtin only available on GPU targets; targeted architecture is {s}", .{@tagName(target.cpu.arch)});
},
}
const dimension = @intCast(u32, try sema.resolveInt(block, dimension_src, extra.operand, Type.u32, "dimension must be comptime-known"));
try sema.requireRuntimeBlock(block, builtin_src, null);
return block.addInst(.{
.tag = switch (zir_tag) {
.work_item_id => .work_item_id,
.work_group_size => .work_group_size,
.work_group_id => .work_group_id,
else => unreachable,
},
.data = .{ .pl_op = .{
.operand = .none,
.payload = dimension,
} },
});
}
fn requireRuntimeBlock(sema: *Sema, block: *Block, src: LazySrcLoc, runtime_src: ?LazySrcLoc) !void {
if (block.is_comptime) {
const msg = msg: {
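
Both gates in zirWorkItem above are observable from user code: the target must be a supported GPU architecture, and the dimension must resolve at compile time. A hypothetical example, with the kernel boilerplate assumed:

export fn kernel() callconv(.Kernel) void {
    _ = @workItemId(0); // ok: dimension is comptime-known
    var d: u32 = 1;
    _ = @workItemId(d); // compile error: dimension must be comptime-known
}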


@@ -2032,6 +2032,15 @@ pub const Inst = struct {
/// Implements the `@volatileCast` builtin.
/// `operand` is payload index to `UnNode`.
volatile_cast,
/// Implements the `@workItemId` builtin.
/// `operand` is payload index to `UnNode`.
work_item_id,
/// Implements the `@workGroupSize` builtin.
/// `operand` is payload index to `UnNode`.
work_group_size,
/// Implements the `@workGroupId` builtin.
/// `operand` is payload index to `UnNode`.
work_group_id,
pub const InstData = struct {
opcode: Extended,


@@ -890,6 +890,10 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
.wasm_memory_size => unreachable,
.wasm_memory_grow => unreachable,
.work_item_id => unreachable,
.work_group_size => unreachable,
.work_group_id => unreachable,
// zig fmt: on
}
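
Marking these unreachable here (and in the other CPU backends below) is sound: Sema rejects the work item builtins on any non-GPU architecture, so the corresponding AIR instructions can never reach these code generators.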


@@ -874,6 +874,10 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
.wasm_memory_size => unreachable,
.wasm_memory_grow => unreachable,
.work_item_id => unreachable,
.work_group_size => unreachable,
.work_group_id => unreachable,
// zig fmt: on
}


@@ -704,6 +704,10 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
.wasm_memory_size => unreachable,
.wasm_memory_grow => unreachable,
.work_item_id => unreachable,
.work_group_size => unreachable,
.work_group_id => unreachable,
// zig fmt: on
}
if (std.debug.runtime_safety) {


@@ -720,6 +720,10 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
.wasm_memory_size => unreachable,
.wasm_memory_grow => unreachable,
.work_item_id => unreachable,
.work_group_size => unreachable,
.work_group_id => unreachable,
// zig fmt: on
}


@@ -1997,6 +1997,11 @@ fn genInst(func: *CodeGen, inst: Air.Inst.Index) InnerError!void {
.reduce_optimized,
.float_to_int_optimized,
=> return func.fail("TODO implement optimized float mode", .{}),
.work_item_id,
.work_group_size,
.work_group_id,
=> unreachable,
};
}


@@ -1132,6 +1132,10 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
.wasm_memory_size => unreachable,
.wasm_memory_grow => unreachable,
.work_item_id => unreachable,
.work_group_size => unreachable,
.work_group_id => unreachable,
// zig fmt: on
}


@@ -2995,6 +2995,11 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail,
.c_va_arg => try airCVaArg(f, inst),
.c_va_end => try airCVaEnd(f, inst),
.c_va_copy => try airCVaCopy(f, inst),
.work_item_id,
.work_group_size,
.work_group_id,
=> unreachable,
// zig fmt: on
};
if (result_value == .new_local) {


@@ -4745,6 +4745,10 @@ pub const FuncGen = struct {
.c_va_copy => try self.airCVaCopy(inst),
.c_va_end => try self.airCVaEnd(inst),
.c_va_start => try self.airCVaStart(inst),
.work_item_id => try self.airWorkItemId(inst),
.work_group_size => try self.airWorkGroupSize(inst),
.work_group_id => try self.airWorkGroupId(inst),
// zig fmt: on
};
if (opt_value) |val| {
@@ -9567,6 +9571,74 @@ pub const FuncGen = struct {
return self.builder.buildAddrSpaceCast(operand, llvm_dest_ty, "");
}
fn amdgcnWorkIntrinsic(self: *FuncGen, dimension: u32, default: u32, comptime basename: []const u8) !?*llvm.Value {
const llvm_u32 = self.context.intType(32);
const llvm_fn_name = switch (dimension) {
0 => basename ++ ".x",
1 => basename ++ ".y",
2 => basename ++ ".z",
else => return llvm_u32.constInt(default, .False),
};
const args: [0]*llvm.Value = .{};
const llvm_fn = self.getIntrinsic(llvm_fn_name, &.{});
return self.builder.buildCall(llvm_fn.globalGetValueType(), llvm_fn, &args, args.len, .Fast, .Auto, "");
}
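// Note the else prong above: an out-of-range dimension does not trap, it just
// yields the caller-supplied default constant (0 for the id intrinsics, 1 for
// the work group size below), matching the usual GPU convention that unused
// dimensions behave as if they had extent 1.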
fn airWorkItemId(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value {
if (self.liveness.isUnused(inst)) return null;
const target = self.dg.module.getTarget();
assert(target.cpu.arch == .amdgcn); // TODO: port this function to other GPU architectures
const pl_op = self.air.instructions.items(.data)[inst].pl_op;
const dimension = pl_op.payload;
return self.amdgcnWorkIntrinsic(dimension, 0, "llvm.amdgcn.workitem.id");
}
fn airWorkGroupSize(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value {
if (self.liveness.isUnused(inst)) return null;
const target = self.dg.module.getTarget();
assert(target.cpu.arch == .amdgcn); // TODO: port this function to other GPU architectures
const pl_op = self.air.instructions.items(.data)[inst].pl_op;
const dimension = pl_op.payload;
const llvm_u32 = self.context.intType(32);
if (dimension >= 3) {
return llvm_u32.constInt(1, .False);
}
// Fetch the dispatch pointer, which points to this structure:
// https://github.com/RadeonOpenCompute/ROCR-Runtime/blob/adae6c61e10d371f7cbc3d0e94ae2c070cab18a4/src/inc/hsa.h#L2913
const llvm_fn = self.getIntrinsic("llvm.amdgcn.dispatch.ptr", &.{});
const args: [0]*llvm.Value = .{};
const dispatch_ptr = self.builder.buildCall(llvm_fn.globalGetValueType(), llvm_fn, &args, args.len, .Fast, .Auto, "");
dispatch_ptr.setAlignment(4);
// Load the work_group_* member from the struct as u16.
// Just treat the dispatch pointer as an array of u16 to keep things simple.
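// In that struct, u16 index 0 is the packet header and index 1 the setup word;
// workgroup_size_x/y/z follow at u16 indices 2, 3, and 4, hence 2 + dimension.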
const offset = 2 + dimension;
const index = [_]*llvm.Value{llvm_u32.constInt(offset, .False)};
const llvm_u16 = self.context.intType(16);
const workgroup_size_ptr = self.builder.buildInBoundsGEP(llvm_u16, dispatch_ptr, &index, index.len, "");
const workgroup_size = self.builder.buildLoad(llvm_u16, workgroup_size_ptr, "");
workgroup_size.setAlignment(2);
return workgroup_size;
}
fn airWorkGroupId(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value {
if (self.liveness.isUnused(inst)) return null;
const target = self.dg.module.getTarget();
assert(target.cpu.arch == .amdgcn); // TODO: port this function to other GPU architectures
const pl_op = self.air.instructions.items(.data)[inst].pl_op;
const dimension = pl_op.payload;
return self.amdgcnWorkIntrinsic(dimension, 0, "llvm.amdgcn.workgroup.id");
}
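// Net effect of the three functions above: @workItemId(d) lowers to
// llvm.amdgcn.workitem.id.{x,y,z}, @workGroupId(d) to
// llvm.amdgcn.workgroup.id.{x,y,z}, and @workGroupSize(d) to a u16 load from
// the HSA kernel dispatch packet.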
fn getErrorNameTable(self: *FuncGen) !*llvm.Value {
if (self.dg.object.error_name_table) |table| {
return table;


@@ -328,6 +328,11 @@ const Writer = struct {
.vector_store_elem => try w.writeVectorStoreElem(s, inst),
.dbg_block_begin, .dbg_block_end => {},
.work_item_id,
.work_group_size,
.work_group_id,
=> try w.writeWorkDimension(s, inst),
}
try s.writeAll(")\n");
}
@@ -869,6 +874,11 @@ const Writer = struct {
try w.writeOperand(s, inst, 0, pl_op.operand);
}
fn writeWorkDimension(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void {
const pl_op = w.air.instructions.items(.data)[inst].pl_op;
try s.print("{d}", .{pl_op.payload});
}
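// With this, the new instructions render in AIR dumps with their dimension
// inline, roughly as: %4 = work_item_id(0)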
fn writeOperand(
w: *Writer,
s: anytype,


@@ -512,6 +512,9 @@ const Writer = struct {
.c_va_end,
.const_cast,
.volatile_cast,
.work_item_id,
.work_group_size,
.work_group_id,
=> {
const inst_data = self.code.extraData(Zir.Inst.UnNode, extended.operand).data;
const src = LazySrcLoc.nodeOffset(inst_data.node);