From ab78e489fe98c5200daabeadbd517ece3f99b2bb Mon Sep 17 00:00:00 2001
From: Brent Graveland
Date: Thu, 20 Nov 2025 15:17:47 +0100
Subject: [PATCH] std.os.linux.IoUring: fix entry spacing for SQE128 and CQE32

When IORING_SETUP_SQE128 is set, the kernel lays out SQEs with 128-byte
spacing instead of 64 bytes, and with IORING_SETUP_CQE32 it lays out
CQEs with 32-byte spacing instead of 16 bytes. IoUring indexed both
rings with the default strides, so any ring created with these flags
read and wrote the wrong slots. This broke attempts to use Zig for a
Linux ublk server, which requires 128-byte SQEs.

Track an index shift per ring (0 normally, 1 for the doubled layouts)
and index the rings the way liburing does:

    sqe = &sqes[(sqe_tail & mask) << sqe_shift]
    cqe = &cqes[(head & mask) << cqe_shift]

Scale the SQE mmap size, the sqes/cqes slice lengths, and the
submission queue capacity check to the doubled layouts, and make
copy_cqes_ready copy entry by entry when CQE32 is set, since its
memcpy fast path assumes 16-byte spacing.
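To illustrate the arithmetic (a standalone sketch; sqeIndex is a
hypothetical helper, not part of the change):

    const std = @import("std");

    /// Element index of an SQE slot when each slot spans
    /// (1 << sqe_shift) io_uring_sqe entries.
    fn sqeIndex(tail: u32, mask: u32, sqe_shift: u1) u32 {
        return (tail & mask) << sqe_shift;
    }

    test "SQE128 doubles the element index" {
        // 8-entry ring, mask = 7: slot 3 is element 3 with 64-byte
        // SQEs, and element 6 once SQE128 doubles the stride.
        try std.testing.expectEqual(@as(u32, 3), sqeIndex(3, 7, 0));
        try std.testing.expectEqual(@as(u32, 6), sqeIndex(3, 7, 1));
    }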
---
 lib/std/os/linux/IoUring.zig | 46 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/lib/std/os/linux/IoUring.zig b/lib/std/os/linux/IoUring.zig
index f5f68a0ebe..40a6ac55ae 100644
--- a/lib/std/os/linux/IoUring.zig
+++ b/lib/std/os/linux/IoUring.zig
@@ -139,7 +139,9 @@ pub fn get_sqe(self: *IoUring) !*linux.io_uring_sqe {
     // We must therefore use wrapping addition and subtraction to avoid a runtime crash.
     const next = self.sq.sqe_tail +% 1;
-    if (next -% head > self.sq.sqes.len) return error.SubmissionQueueFull;
-    const sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask];
+    // sqes.len is scaled by the SQE128 stride, so shift it back to get the ring capacity.
+    if (next -% head > (self.sq.sqes.len >> self.sq.sqe_shift)) return error.SubmissionQueueFull;
+    // Shift-based indexing to match liburing: SQE128 doubles the element index.
+    const sqe = &self.sq.sqes[(self.sq.sqe_tail & self.sq.mask) << self.sq.sqe_shift];
     self.sq.sqe_tail = next;
     return sqe;
 }
@@ -284,6 +286,21 @@ fn copy_cqes_ready(self: *IoUring, cqes: []linux.io_uring_cqe) u32 {
     const count = @min(cqes.len, ready);
     const head = self.cq.head.* & self.cq.mask;
 
+    // With CQE32 the kernel uses 32-byte CQE spacing, so copy entry by
+    // entry using shift-based indexing to match liburing:
+    // cqes[(head & mask) << shift]. The mask handles the wrap to the
+    // start of the ring, and the memcpy fast path below assumes 16-byte
+    // spacing, so return here rather than falling through to it.
+    if (self.cq.cqe_shift != 0) {
+        for (0..count) |i| {
+            const cqe_index = (head + @as(u32, @intCast(i))) & self.cq.mask;
+            const array_index = cqe_index << self.cq.cqe_shift;
+            cqes[i] = self.cq.cqes[array_index];
+        }
+        self.cq_advance(@intCast(count));
+        return @intCast(count);
+    }
+
     // before wrapping
     const n = @min(self.cq.cqes.len - head, count);
     @memcpy(cqes[0..n], self.cq.cqes[head..][0..n]);
@@ -1512,6 +1529,7 @@ pub const SubmissionQueue = struct {
     // This allows us to amortize the cost of the @atomicStore to `tail` across multiple SQEs.
     sqe_head: u32 = 0,
     sqe_tail: u32 = 0,
+    sqe_shift: u1, // 0 for 64-byte SQEs, 1 when SQE128 doubles the element stride
 
     pub fn init(fd: linux.fd_t, p: linux.io_uring_params) !SubmissionQueue {
         assert(fd >= 0);
@@ -1533,7 +1551,12 @@ pub const SubmissionQueue = struct {
         // The motivation for the `sqes` and `array` indirection is to make it possible for the
         // application to preallocate static linux.io_uring_sqe entries and then replay them when needed.
-        const size_sqes = p.sq_entries * @sizeOf(linux.io_uring_sqe);
+        // With SQE128 each SQE occupies 128 bytes instead of 64, so double the mmap size.
+        const sqe_size: usize = if ((p.flags & linux.IORING_SETUP_SQE128) != 0)
+            @sizeOf(linux.io_uring_sqe) * 2
+        else
+            @sizeOf(linux.io_uring_sqe);
+        const size_sqes = p.sq_entries * sqe_size;
         const mmap_sqes = try posix.mmap(
             null,
             size_sqes,
@@ -1545,6 +1568,10 @@ pub const SubmissionQueue = struct {
         errdefer posix.munmap(mmap_sqes);
         assert(mmap_sqes.len == size_sqes);
 
+        // Index shift matching liburing: sqe = &sqes[(sqe_tail & mask) << shift],
+        // 1 when SQE128 gives SQEs a 128-byte stride, 0 otherwise.
+        const sqe_shift: u1 = if ((p.flags & linux.IORING_SETUP_SQE128) != 0) 1 else 0;
+
         const array: [*]u32 = @ptrCast(@alignCast(&mmap[p.sq_off.array]));
         const sqes: [*]linux.io_uring_sqe = @ptrCast(@alignCast(&mmap_sqes[0]));
         // We expect the kernel copies p.sq_entries to the u32 pointed to by p.sq_off.ring_entries,
@@ -1560,6 +1587,8 @@ pub const SubmissionQueue = struct {
-            .sqes = sqes[0..p.sq_entries],
+            // Cover the real layout: twice as many io_uring_sqe elements under SQE128.
+            .sqes = sqes[0 .. p.sq_entries << sqe_shift],
             .mmap = mmap,
             .mmap_sqes = mmap_sqes,
+            .sqe_shift = sqe_shift,
         };
     }
 
@@ -1575,6 +1604,7 @@ pub const CompletionQueue = struct {
     mask: u32,
     overflow: *u32,
     cqes: []linux.io_uring_cqe,
+    cqe_shift: u1, // 0 for 16-byte CQEs, 1 when CQE32 doubles the element stride
 
     pub fn init(fd: linux.fd_t, p: linux.io_uring_params, sq: SubmissionQueue) !CompletionQueue {
         assert(fd >= 0);
@@ -1582,12 +1612,18 @@ pub const CompletionQueue = struct {
         const mmap = sq.mmap;
         const cqes: [*]linux.io_uring_cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes]));
         assert(p.cq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_entries]))).*);
+
+        // Index shift matching liburing: cqe = &cqes[(head & mask) << shift],
+        // 1 when CQE32 gives CQEs a 32-byte stride, 0 otherwise.
+        const cqe_shift: u1 = if ((p.flags & linux.IORING_SETUP_CQE32) != 0) 1 else 0;
+
         return CompletionQueue{
             .head = @ptrCast(@alignCast(&mmap[p.cq_off.head])),
             .tail = @ptrCast(@alignCast(&mmap[p.cq_off.tail])),
             .mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_mask]))).*,
             .overflow = @ptrCast(@alignCast(&mmap[p.cq_off.overflow])),
-            .cqes = cqes[0..p.cq_entries],
+            // Cover the real layout: twice as many io_uring_cqe elements under CQE32.
+            .cqes = cqes[0 .. p.cq_entries << cqe_shift],
+            .cqe_shift = cqe_shift,
         };
     }
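If it helps review: the configuration this unbreaks can be exercised
with a small standalone program along these lines (a sketch; assumes a
kernel with SQE128/CQE32 support, i.e. 5.19+, and the 8-entry NOP ring
is arbitrary):

    const std = @import("std");
    const linux = std.os.linux;

    pub fn main() !void {
        // ublk-style ring: 128-byte SQEs and 32-byte CQEs.
        var ring = try linux.IoUring.init(
            8,
            linux.IORING_SETUP_SQE128 | linux.IORING_SETUP_CQE32,
        );
        defer ring.deinit();

        // get_sqe() must now land on a 128-byte slot boundary...
        const sqe = try ring.get_sqe();
        sqe.prep_nop();
        _ = try ring.submit();

        // ...and copy_cqe() must read with the 32-byte stride.
        const cqe = try ring.copy_cqe();
        std.debug.print("nop completed: res={d}\n", .{cqe.res});
    }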