const IoUring = @This();

const std = @import("std");
const builtin = @import("builtin");
const assert = std.debug.assert;
const mem = std.mem;
const net = std.Io.net;
const posix = std.posix;
const linux = std.os.linux;
const testing = std.testing;
const is_linux = builtin.os.tag == .linux;
const page_size_min = std.heap.page_size_min;

fd: posix.fd_t = -1,
sq: Sq,
cq: Cq,
flags: uflags.Setup,
features: uflags.Features,

// COMMIT: move IoUring constants to Constants
|
|
pub const constants = struct {
/// If sqe.file_index (splice_fd_in in Zig Struct) is set to this for opcodes that instantiate a new
/// direct descriptor (like openat/openat2/accept), then io_uring will allocate
/// an available direct descriptor instead of having the application pass one
/// in. The picked direct descriptor will be returned in cqe.res, or -ENFILE
/// if the space is full.
pub const FILE_INDEX_ALLOC = std.math.maxInt(u32);
|
|
|
|
pub const CMD_MASK = 1 << 0;
|
|
|
|
pub const TIMEOUT_CLOCK_MASK = ((1 << 2) | (1 << 3));
|
|
pub const TIMEOUT_UPDATE_MASK = ((1 << 1) | (1 << 4));
|
|
|
|
pub const CQE_BUFFER_SHIFT = 16;
|
|
|
|
/// cqe.res for IORING_CQE_F_NOTIF if IORING_SEND_ZC_REPORT_USAGE was
/// requested. It should be treated as a flag; all other bits of cqe.res
/// should be treated as reserved!
pub const NOTIF_USAGE_ZC_COPIED = (1 << 31);
|
|
|
|
// Magic offsets for the application to mmap the data it needs
|
|
pub const OFF_SQ_RING = 0;
|
|
pub const OFF_CQ_RING = 0x8000000;
|
|
pub const OFF_SQES = 0x10000000;
|
|
// COMMIT: new magic constants
|
|
pub const OFF_PBUF_RING = 0x80000000;
|
|
pub const OFF_PBUF_SHIFT = 16;
|
|
pub const OFF_MMAP_MASK = 0xf8000000;
|
|
|
|
/// Register a fully sparse file space, rather than pass in an array of all -1 file descriptors.
|
|
pub const RSRC_REGISTER_SPARSE = 1 << 0;
|
|
|
|
/// Skip updating fd indexes set to this value in the fd table
|
|
pub const REGISTER_FILES_SKIP = -2;
|
|
|
|
// COMMIT: new TX Timestamp definition
|
|
/// SOCKET_URING_OP_TX_TIMESTAMP definitions
|
|
pub const TIMESTAMP_HW_SHIFT = 16;
|
|
/// The cqe.flags bit from which the timestamp type is stored
|
|
pub const TIMESTAMP_TYPE_SHIFT = (TIMESTAMP_HW_SHIFT + 1);
|
|
/// The cqe.flags flag signifying whether it's a hardware timestamp
|
|
pub const CQE_F_TSTAMP_HW = (1 << TIMESTAMP_HW_SHIFT);
|
|
|
|
/// The bit from which area id is encoded into offsets
|
|
pub const ZCRX_AREA_SHIFT = 48;
|
|
pub const ZCRX_AREA_MASK = (~((1 << ZCRX_AREA_SHIFT) - 1));
|
|
|
|
// flag added to the opcode to use a registered ring fd
|
|
pub const REGISTER_USE_REGISTERED_RING = 1 << 31;
|
|
};
|
|
|
|
// COMMIT: move IoUring flags to Flags struct
|
|
pub const uflags = struct {
|
|
/// io_uring_setup() flags
|
|
pub const Setup = packed struct(u32) {
|
|
/// io_context is polled
|
|
IOPOLL: bool = false,
|
|
/// SQ poll thread
|
|
SQPOLL: bool = false,
|
|
/// sq_thread_cpu is valid
|
|
SQ_AFF: bool = false,
|
|
/// app defines CQ size
|
|
CQSIZE: bool = false,
|
|
/// clamp SQ/CQ ring sizes
|
|
CLAMP: bool = false,
|
|
/// attach to existing wq
|
|
ATTACH_WQ: bool = false,
|
|
/// start with ring disabled
|
|
R_DISABLED: bool = false,
|
|
/// continue submit on error
|
|
SUBMIT_ALL: bool = false,
|
|
/// Cooperative task running. When requests complete, they often require
/// forcing the submitter to transition to the kernel to complete. If this
/// flag is set, work will be done when the task transitions anyway, rather
/// than force an inter-processor interrupt reschedule. This avoids interrupting
/// a task running in userspace, and saves an IPI.
COOP_TASKRUN: bool = false,
/// If COOP_TASKRUN is set, get notified if task work is available for
/// running and a kernel transition would be needed to run it. This sets
/// IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN.
TASKRUN_FLAG: bool = false,
|
|
/// SQEs are 128 byte
|
|
SQE128: bool = false,
|
|
/// CQEs are 32 byte
|
|
CQE32: bool = false,
|
|
/// Only one task is allowed to submit requests
|
|
SINGLE_ISSUER: bool = false,
|
|
/// Defer running task work to get events.
|
|
/// Rather than running bits of task work whenever the task transitions
|
|
/// try to do it just before it is needed.
|
|
DEFER_TASKRUN: bool = false,
|
|
/// Application provides the memory for the rings
|
|
NO_MMAP: bool = false,
|
|
/// Register the ring fd in itself for use with
|
|
/// IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather
|
|
/// than an fd.
|
|
REGISTERED_FD_ONLY: bool = false,
|
|
/// Removes indirection through the SQ index array.
|
|
NO_SQARRAY: bool = false,
|
|
// COMMIT: new setup flags
|
|
/// Use hybrid poll in iopoll process
|
|
HYBRID_IOPOLL: bool = false,
|
|
/// Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have
|
|
/// IORING_CQE_F_32 set in cqe.flags.
|
|
CQE_MIXED: bool = false,
|
|
_unused: u13 = 0,
|
|
};
|
|
|
|
/// sqe.uring_cmd_flags (rw_flags in the Zig struct)
|
|
/// top 8bits aren't available for userspace
|
|
/// use registered buffer; pass this flag along with setting sqe.buf_index.
|
|
pub const Cmd = packed struct(u32) {
|
|
CMD_FIXED: bool = false,
|
|
_unused: u31 = 0,
|
|
};
|
|
|
|
/// sqe.fsync_flags (rw_flags in the Zig struct)
|
|
pub const Fsync = packed struct(u32) {
|
|
DATASYNC: bool = false,
|
|
_unused: u31 = 0,
|
|
};
|
|
|
|
/// sqe.timeout_flags
|
|
pub const Timeout = packed struct(u32) {
|
|
TIMEOUT_ABS: bool = false,
|
|
/// Available since Linux 5.11
|
|
TIMEOUT_UPDATE: bool = false,
|
|
/// Available since Linux 5.15
|
|
TIMEOUT_BOOTTIME: bool = false,
|
|
/// Available since Linux 5.15
|
|
TIMEOUT_REALTIME: bool = false,
|
|
/// Available since Linux 5.15
|
|
LINK_TIMEOUT_UPDATE: bool = false,
|
|
/// Available since Linux 5.16
|
|
TIMEOUT_ETIME_SUCCESS: bool = false,
|
|
// COMMIT: new Timeout Flag
|
|
// TODO: add when it became available
|
|
TIMEOUT_MULTISHOT: bool = false,
|
|
_unused: u25 = 0,
|
|
};
|
|
|
|
/// sqe.splice_flags (rw_flags in Zig Struct)
|
|
/// extends splice(2) flags
|
|
pub const Splice = packed struct(u32) {
|
|
_unused: u31 = 0,
|
|
/// the last bit of __u32
|
|
F_FD_IN_FIXED: bool = false,
|
|
};
|
|
|
|
/// POLL_ADD flags. Note that since sqe.poll_events (rw_flags in Zig Struct)
|
|
/// is the flag space, the command flags for POLL_ADD are stored in sqe.len.
|
|
pub const Poll = packed struct(u32) {
|
|
/// IORING_POLL_ADD_MULTI
|
|
/// Multishot poll. Sets IORING_CQE_F_MORE if the poll handler will continue
|
|
/// to report CQEs on behalf of the same SQE.
|
|
ADD_MULTI: bool = false,
|
|
// TODO: verify this doc comment is valid for the 2 flags below
|
|
/// IORING_POLL_UPDATE
|
|
/// Update existing poll request, matching sqe.addr as the old user_data
|
|
/// field.
|
|
UPDATE_EVENTS: bool = false,
|
|
/// IORING_POLL_UPDATE
|
|
/// Update existing poll request, matching sqe.addr as the old user_data
|
|
/// field.
|
|
UPDATE_USER_DATA: bool = false,
|
|
/// IORING_POLL_LEVEL
|
|
/// Level triggered poll.
|
|
ADD_LEVEL: bool = false,
|
|
_unused: u28 = 0,
|
|
};
|
|
|
|
/// ASYNC_CANCEL flags.
|
|
pub const AsyncCancel = packed struct(u32) {
|
|
/// IORING_ASYNC_CANCEL_ALL
|
|
/// Cancel all requests that match the given key
|
|
CANCEL_ALL: bool = false,
|
|
/// IORING_ASYNC_CANCEL_FD
|
|
/// Key off 'fd' for cancelation rather than the request 'user_data'
|
|
CANCEL_FD: bool = false,
|
|
/// IORING_ASYNC_CANCEL_ANY
|
|
/// Match any request
|
|
CANCEL_ANY: bool = false,
|
|
/// IORING_ASYNC_CANCEL_FD_FIXED
|
|
/// 'fd' passed in is a fixed descriptor
|
|
CANCEL_FD_FIXED: bool = false,
|
|
// COMMIT: new AsyncCancel Flags
|
|
/// IORING_ASYNC_CANCEL_USERDATA
|
|
/// Match on user_data, default for no other key
|
|
CANCEL_USERDATA: bool = false,
|
|
/// IORING_ASYNC_CANCEL_OP
|
|
/// Match request based on opcode
|
|
CANCEL_OP: bool = false,
|
|
_unused: u26 = 0,
|
|
};
|
|
|
|
/// send/sendmsg and recv/recvmsg flags (sqe.ioprio)
|
|
pub const SendRecv = packed struct(u16) {
|
|
/// IORING_RECVSEND_POLL_FIRST
|
|
/// If set, instead of first attempting to send or receive and arm poll
|
|
/// if that yields an -EAGAIN result, arm poll upfront and skip the
|
|
/// initial transfer attempt.
|
|
RECVSEND_POLL_FIRST: bool = false,
|
|
/// IORING_RECV_MULTISHOT
|
|
/// Multishot recv. Sets IORING_CQE_F_MORE if the handler will continue
|
|
/// to report CQEs on behalf of the same SQE.
|
|
RECV_MULTISHOT: bool = false,
|
|
/// IORING_RECVSEND_FIXED_BUF
|
|
/// Use registered buffers, the index is stored in the buf_index field.
|
|
RECVSEND_FIXED_BUF: bool = false,
|
|
/// IORING_SEND_ZC_REPORT_USAGE
|
|
/// If set, SEND[MSG]_ZC should report the zerocopy usage in cqe.res
|
|
/// for the IORING_CQE_F_NOTIF cqe. 0 is reported if zerocopy was
|
|
/// actually possible. IORING_NOTIF_USAGE_ZC_COPIED if data was copied
|
|
/// (at least partially).
|
|
SEND_ZC_REPORT_USAGE: bool = false,
|
|
/// IORING_RECVSEND_BUNDLE
|
|
/// Used with IOSQE_BUFFER_SELECT. If set, send or recv will grab as
|
|
/// many buffers from the buffer group ID given and send them all.
|
|
/// The completion result will be the number of buffers sent, with the
|
|
/// starting buffer ID in cqe.flags as per usual for provided buffer
|
|
/// usage. The buffers will be contiguous from the starting buffer ID.
|
|
RECVSEND_BUNDLE: bool = false,
|
|
// COMMIT: new flags
|
|
/// IORING_SEND_VECTORIZED
|
|
/// If set, SEND[_ZC] will take a pointer to a io_vec to allow
|
|
/// vectorized send operations.
|
|
SEND_VECTORIZED: bool = false,
|
|
_: u10 = 0,
|
|
};
|
|
|
|
/// accept flags stored in sqe.ioprio
|
|
pub const Accept = packed struct(u16) {
|
|
MULTISHOT: bool = false,
|
|
// COMMIT: new Flags
|
|
DONTWAIT: bool = false,
|
|
POLL_FIRST: bool = false,
|
|
_unused: u13 = 0,
|
|
};
|
|
|
|
/// IORING_OP_MSG_RING flags (sqe.msg_ring_flags or sqe.rw_flags in Zig Struct)
|
|
pub const MsgRing = packed struct(u32) {
|
|
/// IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring.
|
|
/// Not applicable for IORING_MSG_DATA, obviously.
|
|
CQE_SKIP: bool = false,
|
|
/// Pass through the flags from sqe.file_index to cqe.flags
|
|
FLAGS_PASS: bool = false,
|
|
_unused: u30 = 0,
|
|
};
|
|
|
|
// COMMIT: new flag
|
|
/// IORING_OP_FIXED_FD_INSTALL flags (sqe.install_fd_flags or sqe.rw_flags in Zig Struct)
pub const FixedFd = packed struct(u32) {
/// IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC
NO_CLOEXEC: bool = false,
_unused: u31 = 0,
};
|
|
|
|
// COMMIT: new flags
|
|
/// IORING_OP_NOP flags (sqe.nop_flags or sqe.rw_flags in Zig Struct)
|
|
pub const Nop = packed struct(u32) {
|
|
/// IORING_NOP_INJECT_RESULT Inject result from sqe.result
|
|
INJECT_RESULT: bool = false,
|
|
_unused: u4 = 0,
|
|
CQE32: bool = false,
|
|
_unused_1: u26 = 0,
|
|
};
|
|
|
|
/// io_uring_enter(2) flags
|
|
pub const Enter = packed struct(u32) {
|
|
GETEVENTS: bool = false,
|
|
SQ_WAKEUP: bool = false,
|
|
SQ_WAIT: bool = false,
|
|
EXT_ARG: bool = false,
|
|
REGISTERED_RING: bool = false,
|
|
// COMMIT: new flags
|
|
ABS_TIMER: bool = false,
|
|
EXT_ARG_REG: bool = false,
|
|
NO_IOWAIT: bool = false,
|
|
_unused: u24 = 0,
|
|
};
|
|
|
|
/// io_uring_params.features flags
|
|
const Features = packed struct(u32) {
|
|
SINGLE_MMAP: bool = false,
|
|
NODROP: bool = false,
|
|
SUBMIT_STABLE: bool = false,
|
|
RW_CUR_POS: bool = false,
|
|
CUR_PERSONALITY: bool = false,
|
|
FAST_POLL: bool = false,
|
|
POLL_32BITS: bool = false,
|
|
SQPOLL_NONFIXED: bool = false,
|
|
EXT_ARG: bool = false,
|
|
NATIVE_WORKERS: bool = false,
|
|
RSRC_TAGS: bool = false,
|
|
CQE_SKIP: bool = false,
|
|
LINKED_FILE: bool = false,
|
|
// COMMIT: add new Feature Flags
|
|
REG_REG_RING: bool = false,
|
|
RECVSEND_BUNDLE: bool = false,
|
|
MIN_TIMEOUT: bool = false,
|
|
RW_ATTR: bool = false,
|
|
NO_IOWAIT: bool = false,
|
|
_unused: u14 = 0,
|
|
};
|
|
};
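
// Example (a sketch, not required by the API): with the packed-struct flag types
// above, call sites compose options by field name instead of OR-ing raw bits,
// and can still recover the raw integer the syscall ABI expects:
//
//     const setup: uflags.Setup = .{ .SQPOLL = true, .SQ_AFF = true, .CQSIZE = true };
//     const raw: u32 = @bitCast(setup);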
|
|
|
|
// IO completion data structure (Completion Queue Entry)
|
|
pub const Cqe = extern struct {
|
|
/// sqe.user_data value passed back
|
|
user_data: u64,
|
|
/// result code for this event
|
|
res: i32,
|
|
flags: Flags,
|
|
// COMMIT: add big_cqe which was missing in io_uring_cqe type declaration
|
|
// TODO: add support for the IORING_SETUP_CQE32 case
|
|
/// If the ring is initialized with IORING_SETUP_CQE32, then this field
|
|
/// contains 16-bytes of padding, doubling the size of the CQE.
|
|
// big_cqe: ?[2]u64,
|
|
|
|
/// cqe.flags
|
|
pub const Flags = packed struct(u32) {
|
|
/// IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
|
|
F_BUFFER: bool = false,
|
|
/// IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries
|
|
F_MORE: bool = false,
|
|
/// IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
|
|
F_SOCK_NONEMPTY: bool = false,
|
|
/// IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinguish
/// them from sends.
|
|
F_NOTIF: bool = false,
|
|
/// IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
|
|
/// more completions. In other words, the buffer is being
|
|
/// partially consumed, and will be used by the kernel for
|
|
/// more completions. This is only set for buffers used via
|
|
/// the incremental buffer consumption, as provided by
|
|
/// a ring buffer setup with IOU_PBUF_RING_INC. For any
|
|
/// other provided buffer type, all completions with a
|
|
/// buffer passed back is automatically returned to the
|
|
/// application.
|
|
F_BUF_MORE: bool = false,
|
|
// COMMIT: new flags
|
|
/// IORING_CQE_F_SKIP If set, then the application/liburing must ignore this
|
|
/// CQE. Its only purpose is to fill a gap in the ring,
|
|
/// if a large CQE is attempted posted when the ring has
|
|
/// just a single small CQE worth of space left before
|
|
/// wrapping.
|
|
F_SKIP: bool = false,
|
|
_unused: u9 = 0,
|
|
/// IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings
|
|
/// setup in a mixed CQE mode, where both 16b and 32b
|
|
/// CQEs may be posted to the CQ ring.
|
|
F_32: bool = false,
|
|
_unused_1: u16 = 0,
|
|
};
|
|
|
|
pub fn err(self: Cqe) linux.E {
|
|
if (self.res > -4096 and self.res < 0) {
|
|
return @as(linux.E, @enumFromInt(-self.res));
|
|
}
|
|
return .SUCCESS;
|
|
}
|
|
|
|
// On successful completion of the provided buffers IO request, the CQE flags field
|
|
// will have IORING_CQE_F_BUFFER set and the selected buffer ID will be indicated by
|
|
// the upper 16-bits of the flags field.
|
|
pub fn buffer_id(self: Cqe) !u16 {
|
|
if (!self.flags.F_BUFFER) {
|
|
return error.NoBufferSelected;
|
|
}
|
|
return @intCast(@as(u32, @bitCast(self.flags)) >> constants.CQE_BUFFER_SHIFT);
|
|
}
|
|
};
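
// A minimal sketch of inspecting a completion, assuming `cqe: Cqe` was obtained
// from the completion queue; the error mapping below is illustrative, not
// exhaustive:
//
//     switch (cqe.err()) {
//         .SUCCESS => {},
//         .AGAIN => return error.WouldBlock,
//         else => |errno| return posix.unexpectedErrno(errno),
//     }
//     // For provided-buffer requests, recover which buffer the kernel picked:
//     if (cqe.flags.F_BUFFER) {
//         const bid = try cqe.buffer_id();
//         _ = bid;
//     }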
|
|
|
|
/// IO submission data structure (Submission Queue Entry)
|
|
/// matches io_uring_sqe in liburing
|
|
pub const Sqe = extern struct {
|
|
/// type of operation for this sqe
|
|
opcode: Op,
|
|
/// IOSQE_* flags
|
|
flags: IoSqe,
|
|
/// ioprio for the request
|
|
ioprio: packed union {
|
|
send_recv: uflags.SendRecv,
|
|
accept: uflags.Accept,
|
|
},
|
|
/// file descriptor to do IO on
|
|
fd: i32,
|
|
/// offset into file
|
|
off: u64,
|
|
/// pointer to buffer or iovecs
|
|
addr: u64,
|
|
/// buffer size or number of iovecs
|
|
len: u32,
|
|
/// flags for any sqe operation
|
|
/// rw_flags | fsync_flags | poll_event | poll32_event | sync_range_flags | msg_flags
|
|
/// timeout_flags | accept_flags | cancel_flags | open_flags | statx_flags
|
|
/// fadvise_advice | splice_flags | rename_flags | unlink_flags | hardlink_flags
|
|
/// xattr_flags | msg_ring_flags | uring_cmd_flags | waitid_flags | futex_flags
|
|
/// install_fd_flags | nop_flags | pipe_flags
|
|
rw_flags: u32,
|
|
/// data to be passed back at completion time
|
|
user_data: u64,
|
|
/// index into fixed buffers or for grouped buffer selection
|
|
buf_index: u16,
|
|
personality: u16,
|
|
splice_fd_in: i32,
|
|
addr3: u64,
|
|
resv: u64,
|
|
|
|
/// sqe.flags
|
|
pub const IoSqe = packed struct(u8) {
|
|
/// use fixed fileset
|
|
FIXED_FILE: bool = false,
|
|
/// issue after inflight IO
|
|
IO_DRAIN: bool = false,
|
|
/// links next sqe
|
|
IO_LINK: bool = false,
|
|
/// like LINK, but stronger
|
|
IO_HARDLINK: bool = false,
|
|
/// always go async
|
|
ASYNC: bool = false,
|
|
/// select buffer from sqe->buf_group
|
|
BUFFER_SELECT: bool = false,
|
|
/// don't post CQE if request succeeded
|
|
CQE_SKIP_SUCCESS: bool = false,
|
|
_: u1 = 0,
|
|
};
|
|
|
|
pub fn prep_nop(sqe: *Sqe) void {
|
|
sqe.* = .{
|
|
.opcode = .NOP,
|
|
.flags = .{},
|
|
.ioprio = @bitCast(@as(u16, 0)),
|
|
.fd = 0,
|
|
.off = 0,
|
|
.addr = 0,
|
|
.len = 0,
|
|
.rw_flags = 0,
|
|
.user_data = 0,
|
|
.buf_index = 0,
|
|
.personality = 0,
|
|
.splice_fd_in = 0,
|
|
.addr3 = 0,
|
|
.resv = 0,
|
|
};
|
|
}
|
|
|
|
pub fn prep_fsync(sqe: *Sqe, fd: linux.fd_t, flags: uflags.Fsync) void {
|
|
sqe.* = .{
|
|
.opcode = .FSYNC,
|
|
.flags = .{},
|
|
.ioprio = @bitCast(@as(u16, 0)),
|
|
.fd = fd,
|
|
.off = 0,
|
|
.addr = 0,
|
|
.len = 0,
|
|
.rw_flags = @bitCast(flags),
|
|
.user_data = 0,
|
|
.buf_index = 0,
|
|
.personality = 0,
|
|
.splice_fd_in = 0,
|
|
.addr3 = 0,
|
|
.resv = 0,
|
|
};
|
|
}
|
|
|
|
pub fn prep_rw(
|
|
sqe: *Sqe,
|
|
op: Op,
|
|
fd: linux.fd_t,
|
|
addr: u64,
|
|
len: usize,
|
|
offset: u64,
|
|
) void {
|
|
sqe.* = .{
|
|
.opcode = op,
|
|
.flags = .{},
|
|
.ioprio = @bitCast(@as(u16, 0)),
|
|
.fd = fd,
|
|
.off = offset,
|
|
.addr = addr,
|
|
.len = @intCast(len),
|
|
.rw_flags = 0,
|
|
.user_data = 0,
|
|
.buf_index = 0,
|
|
.personality = 0,
|
|
.splice_fd_in = 0,
|
|
.addr3 = 0,
|
|
.resv = 0,
|
|
};
|
|
}
|
|
|
|
pub fn prep_read(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, offset: u64) void {
|
|
sqe.prep_rw(.READ, fd, @intFromPtr(buffer.ptr), buffer.len, offset);
|
|
}
|
|
|
|
pub fn prep_write(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, offset: u64) void {
|
|
sqe.prep_rw(.WRITE, fd, @intFromPtr(buffer.ptr), buffer.len, offset);
|
|
}
|
|
|
|
pub fn prep_splice(sqe: *Sqe, fd_in: linux.fd_t, off_in: u64, fd_out: linux.fd_t, off_out: u64, len: usize) void {
|
|
sqe.prep_rw(.SPLICE, fd_out, undefined, len, off_out);
|
|
sqe.addr = off_in;
|
|
sqe.splice_fd_in = fd_in;
|
|
}
|
|
|
|
pub fn prep_readv(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
iovecs: []const std.posix.iovec,
|
|
offset: u64,
|
|
) void {
|
|
sqe.prep_rw(.READV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset);
|
|
}
|
|
|
|
pub fn prep_writev(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
iovecs: []const std.posix.iovec_const,
|
|
offset: u64,
|
|
) void {
|
|
sqe.prep_rw(.WRITEV, fd, @intFromPtr(iovecs.ptr), iovecs.len, offset);
|
|
}
|
|
|
|
pub fn prep_read_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void {
|
|
sqe.prep_rw(.READ_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset);
|
|
sqe.buf_index = buffer_index;
|
|
}
|
|
|
|
pub fn prep_write_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: *std.posix.iovec, offset: u64, buffer_index: u16) void {
|
|
sqe.prep_rw(.WRITE_FIXED, fd, @intFromPtr(buffer.base), buffer.len, offset);
|
|
sqe.buf_index = buffer_index;
|
|
}
|
|
|
|
pub fn prep_accept(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
addr: ?*linux.sockaddr,
|
|
addrlen: ?*linux.socklen_t,
|
|
flags: linux.SOCK,
|
|
) void {
|
|
// `addr` holds a pointer to `sockaddr`, and `addr2` holds a pointer to `socklen_t`.
|
|
// `addr2` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32).
|
|
sqe.prep_rw(.ACCEPT, fd, @intFromPtr(addr), 0, @intFromPtr(addrlen));
|
|
sqe.rw_flags = flags;
|
|
}
|
|
|
|
/// accept directly into the fixed file table
|
|
pub fn prep_accept_direct(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
addr: ?*linux.sockaddr,
|
|
addrlen: ?*linux.socklen_t,
|
|
flags: linux.SOCK,
|
|
file_index: u32,
|
|
) void {
|
|
prep_accept(sqe, fd, addr, addrlen, flags);
|
|
set_target_fixed_file(sqe, file_index);
|
|
}
|
|
|
|
pub fn prep_multishot_accept(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
addr: ?*linux.sockaddr,
|
|
addrlen: ?*linux.socklen_t,
|
|
flags: linux.SOCK,
|
|
) void {
|
|
prep_accept(sqe, fd, addr, addrlen, flags);
|
|
sqe.ioprio = .{ .accept = .{ .MULTISHOT = true } };
|
|
}
|
|
|
|
/// multishot accept directly into the fixed file table
|
|
pub fn prep_multishot_accept_direct(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
addr: ?*linux.sockaddr,
|
|
addrlen: ?*linux.socklen_t,
|
|
flags: linux.SOCK,
|
|
) void {
|
|
prep_multishot_accept(sqe, fd, addr, addrlen, flags);
|
|
set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC);
|
|
}
|
|
|
|
fn set_target_fixed_file(sqe: *Sqe, file_index: u32) void {
|
|
const sqe_file_index: u32 = if (file_index == constants.FILE_INDEX_ALLOC)
|
|
constants.FILE_INDEX_ALLOC
|
|
else
|
|
// 0 means no fixed files, indexes should be encoded as "index + 1"
|
|
file_index + 1;
|
|
// This field is overloaded in liburing:
|
|
// splice_fd_in: i32
|
|
// sqe_file_index: u32
|
|
sqe.splice_fd_in = @bitCast(sqe_file_index);
|
|
}
|
|
|
|
pub fn prep_connect(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
addr: *const linux.sockaddr,
|
|
addrlen: linux.socklen_t,
|
|
) void {
|
|
// `addrlen` maps to `sqe.off` (u64) instead of `sqe.len` (which is only a u32).
|
|
sqe.prep_rw(.CONNECT, fd, @intFromPtr(addr), 0, addrlen);
|
|
}
|
|
|
|
pub fn prep_epoll_ctl(
|
|
sqe: *Sqe,
|
|
epfd: linux.fd_t,
|
|
fd: linux.fd_t,
|
|
op: u32,
|
|
ev: ?*linux.epoll_event,
|
|
) void {
|
|
sqe.prep_rw(.EPOLL_CTL, epfd, @intFromPtr(ev), op, @intCast(fd));
|
|
}
|
|
|
|
pub fn prep_recv(sqe: *Sqe, fd: linux.fd_t, buffer: []u8, flags: linux.MSG) void {
|
|
sqe.prep_rw(.RECV, fd, @intFromPtr(buffer.ptr), buffer.len, 0);
|
|
sqe.rw_flags = flags;
|
|
}
|
|
|
|
// TODO: review recv `flags`
|
|
pub fn prep_recv_multishot(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
buffer: []u8,
|
|
flags: linux.MSG,
|
|
) void {
|
|
sqe.prep_recv(fd, buffer, flags);
|
|
sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } };
|
|
}
|
|
|
|
pub fn prep_recvmsg(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
msg: *linux.msghdr,
|
|
flags: linux.MSG,
|
|
) void {
|
|
sqe.prep_rw(.RECVMSG, fd, @intFromPtr(msg), 1, 0);
|
|
sqe.rw_flags = flags;
|
|
}
|
|
|
|
pub fn prep_recvmsg_multishot(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
msg: *linux.msghdr,
|
|
flags: linux.MSG,
|
|
) void {
|
|
sqe.prep_recvmsg(fd, msg, flags);
|
|
sqe.ioprio = .{ .send_recv = .{ .RECV_MULTISHOT = true } };
|
|
}
|
|
|
|
// COMMIT: fix send[|recv] flag param type
|
|
pub fn prep_send(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG) void {
|
|
sqe.prep_rw(.SEND, fd, @intFromPtr(buffer.ptr), buffer.len, 0);
|
|
sqe.rw_flags = flags;
|
|
}
|
|
|
|
pub fn prep_send_zc(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG, zc_flags: uflags.SendRecv) void {
|
|
sqe.prep_rw(.SEND_ZC, fd, @intFromPtr(buffer.ptr), buffer.len, 0);
|
|
sqe.rw_flags = flags;
|
|
sqe.ioprio = .{ .send_recv = zc_flags };
|
|
}
|
|
|
|
pub fn prep_send_zc_fixed(sqe: *Sqe, fd: linux.fd_t, buffer: []const u8, flags: linux.MSG, zc_flags: uflags.SendRecv, buf_index: u16) void {
|
|
const zc_flags_fixed = blk: {
|
|
var updated_flags = zc_flags;
|
|
updated_flags.RECVSEND_FIXED_BUF = true;
|
|
break :blk updated_flags;
|
|
};
|
|
prep_send_zc(sqe, fd, buffer, flags, zc_flags_fixed);
|
|
sqe.buf_index = buf_index;
|
|
}
|
|
|
|
pub fn prep_sendmsg(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
msg: *const linux.msghdr_const,
|
|
flags: linux.MSG,
|
|
) void {
|
|
sqe.prep_rw(.SENDMSG, fd, @intFromPtr(msg), 1, 0);
|
|
sqe.rw_flags = flags;
|
|
}
|
|
|
|
pub fn prep_sendmsg_zc(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
msg: *const linux.msghdr_const,
|
|
flags: linux.MSG,
|
|
) void {
|
|
prep_sendmsg(sqe, fd, msg, flags);
|
|
sqe.opcode = .SENDMSG_ZC;
|
|
}
|
|
|
|
pub fn prep_openat(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
path: [*:0]const u8,
|
|
flags: linux.O,
|
|
mode: linux.mode_t,
|
|
) void {
|
|
sqe.prep_rw(.OPENAT, fd, @intFromPtr(path), mode, 0);
|
|
sqe.rw_flags = @bitCast(flags);
|
|
}
|
|
|
|
pub fn prep_openat_direct(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
path: [*:0]const u8,
|
|
flags: linux.O,
|
|
mode: linux.mode_t,
|
|
file_index: u32,
|
|
) void {
|
|
prep_openat(sqe, fd, path, flags, mode);
|
|
set_target_fixed_file(sqe, file_index);
|
|
}
|
|
|
|
pub fn prep_close(sqe: *Sqe, fd: linux.fd_t) void {
|
|
sqe.* = .{
|
|
.opcode = .CLOSE,
|
|
.flags = .{},
|
|
.ioprio = @bitCast(@as(u16, 0)),
|
|
.fd = fd,
|
|
.off = 0,
|
|
.addr = 0,
|
|
.len = 0,
|
|
.rw_flags = 0,
|
|
.user_data = 0,
|
|
.buf_index = 0,
|
|
.personality = 0,
|
|
.splice_fd_in = 0,
|
|
.addr3 = 0,
|
|
.resv = 0,
|
|
};
|
|
}
|
|
|
|
pub fn prep_close_direct(sqe: *Sqe, file_index: u32) void {
|
|
prep_close(sqe, 0);
|
|
set_target_fixed_file(sqe, file_index);
|
|
}
|
|
|
|
pub fn prep_timeout(
|
|
sqe: *Sqe,
|
|
ts: *const linux.kernel_timespec,
|
|
count: u32,
|
|
flags: uflags.Timeout,
|
|
) void {
|
|
sqe.prep_rw(.TIMEOUT, -1, @intFromPtr(ts), 1, count);
|
|
sqe.rw_flags = @bitCast(flags);
|
|
}
|
|
|
|
pub fn prep_timeout_remove(sqe: *Sqe, timeout_user_data: u64, flags: uflags.Timeout) void {
|
|
sqe.* = .{
|
|
.opcode = .TIMEOUT_REMOVE,
|
|
.flags = .{},
|
|
.ioprio = @bitCast(@as(u16, 0)),
|
|
.fd = -1,
|
|
.off = 0,
|
|
.addr = timeout_user_data,
|
|
.len = 0,
|
|
.rw_flags = @bitCast(flags),
|
|
.user_data = 0,
|
|
.buf_index = 0,
|
|
.personality = 0,
|
|
.splice_fd_in = 0,
|
|
.addr3 = 0,
|
|
.resv = 0,
|
|
};
|
|
}
|
|
|
|
pub fn prep_link_timeout(
|
|
sqe: *Sqe,
|
|
ts: *const linux.kernel_timespec,
|
|
flags: uflags.Timeout,
|
|
) void {
|
|
sqe.prep_rw(.LINK_TIMEOUT, -1, @intFromPtr(ts), 1, 0);
|
|
sqe.rw_flags = @bitCast(flags);
|
|
}
|
|
|
|
pub fn prep_poll_add(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
poll_mask: linux.POLL,
|
|
) void {
|
|
sqe.prep_rw(.POLL_ADD, fd, @intFromPtr(@as(?*anyopaque, null)), 0, 0);
|
|
// Poll masks previously used to comprise of 16 bits in the flags union of
|
|
// a SQE, but were then extended to comprise of 32 bits in order to make
|
|
// room for additional option flags. To ensure that the correct bits of
|
|
// poll masks are consistently and properly read across multiple kernel
|
|
// versions, poll masks are enforced to be little-endian.
|
|
// https://www.spinics.net/lists/io-uring/msg02848.html
|
|
sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask);
|
|
}
|
|
|
|
pub fn prep_poll_remove(
|
|
sqe: *Sqe,
|
|
target_user_data: u64,
|
|
) void {
|
|
sqe.prep_rw(.POLL_REMOVE, -1, target_user_data, 0, 0);
|
|
}
|
|
|
|
pub fn prep_poll_update(
|
|
sqe: *Sqe,
|
|
old_user_data: u64,
|
|
new_user_data: u64,
|
|
poll_mask: linux.POLL,
|
|
flags: uflags.Poll,
|
|
) void {
|
|
sqe.prep_rw(.POLL_REMOVE, -1, old_user_data, @as(u32, @bitCast(flags)), new_user_data);
|
|
// Poll masks previously used to comprise of 16 bits in the flags union of
|
|
// a SQE, but were then extended to comprise of 32 bits in order to make
|
|
// room for additional option flags. To ensure that the correct bits of
|
|
// poll masks are consistently and properly read across multiple kernel
|
|
// versions, poll masks are enforced to be little-endian.
|
|
// https://www.spinics.net/lists/io-uring/msg02848.html
|
|
sqe.rw_flags = std.mem.nativeToLittle(u32, poll_mask);
|
|
}
|
|
|
|
pub fn prep_fallocate(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
mode: i32,
|
|
offset: u64,
|
|
len: u64,
|
|
) void {
|
|
sqe.* = .{
|
|
.opcode = .FALLOCATE,
|
|
.flags = .{},
|
|
.ioprio = @bitCast(@as(u16, 0)),
|
|
.fd = fd,
|
|
.off = offset,
|
|
.addr = len,
|
|
.len = @intCast(mode),
|
|
.rw_flags = 0,
|
|
.user_data = 0,
|
|
.buf_index = 0,
|
|
.personality = 0,
|
|
.splice_fd_in = 0,
|
|
.addr3 = 0,
|
|
.resv = 0,
|
|
};
|
|
}
|
|
|
|
pub fn prep_statx(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
path: [*:0]const u8,
|
|
flags: linux.AT,
|
|
mask: linux.STATX, // TODO: compose linux.STATX
|
|
buf: *linux.Statx,
|
|
) void {
|
|
sqe.prep_rw(.STATX, fd, @intFromPtr(path), mask, @intFromPtr(buf));
|
|
sqe.rw_flags = flags;
|
|
}
|
|
|
|
pub fn prep_cancel(
|
|
sqe: *Sqe,
|
|
cancel_user_data: u64,
|
|
flags: uflags.AsyncCancel,
|
|
) void {
|
|
sqe.prep_rw(.ASYNC_CANCEL, -1, cancel_user_data, 0, 0);
|
|
sqe.rw_flags = @bitCast(flags);
|
|
}
|
|
|
|
pub fn prep_cancel_fd(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
flags: uflags.AsyncCancel,
|
|
) void {
|
|
sqe.prep_rw(.ASYNC_CANCEL, fd, 0, 0, 0);
|
|
const enable_cancel_fd = blk: {
|
|
var update_flags = flags;
|
|
update_flags.CANCEL_FD = true;
|
|
break :blk update_flags;
|
|
};
|
|
sqe.rw_flags = @bitCast(enable_cancel_fd);
|
|
}
|
|
|
|
pub fn prep_shutdown(
|
|
sqe: *Sqe,
|
|
sockfd: linux.socket_t,
|
|
how: linux.SHUT,
|
|
) void {
|
|
sqe.prep_rw(.SHUTDOWN, sockfd, 0, how, 0);
|
|
}
|
|
|
|
pub fn prep_renameat(
|
|
sqe: *Sqe,
|
|
old_dir_fd: linux.fd_t,
|
|
old_path: [*:0]const u8,
|
|
new_dir_fd: linux.fd_t,
|
|
new_path: [*:0]const u8,
|
|
flags: linux.RENAME,
|
|
) void {
|
|
sqe.prep_rw(
|
|
.RENAMEAT,
|
|
old_dir_fd,
|
|
@intFromPtr(old_path),
|
|
0,
|
|
@intFromPtr(new_path),
|
|
);
|
|
sqe.len = @bitCast(new_dir_fd);
|
|
sqe.rw_flags = flags;
|
|
}
|
|
|
|
pub fn prep_unlinkat(
|
|
sqe: *Sqe,
|
|
dir_fd: linux.fd_t,
|
|
path: [*:0]const u8,
|
|
flags: linux.AT, // TODO: unlink flags only AT_REMOVEDIR
|
|
) void {
|
|
sqe.prep_rw(.UNLINKAT, dir_fd, @intFromPtr(path), 0, 0);
|
|
sqe.rw_flags = flags;
|
|
}
|
|
|
|
pub fn prep_mkdirat(
|
|
sqe: *Sqe,
|
|
dir_fd: linux.fd_t,
|
|
path: [*:0]const u8,
|
|
mode: linux.mode_t,
|
|
) void {
|
|
sqe.prep_rw(.MKDIRAT, dir_fd, @intFromPtr(path), mode, 0);
|
|
}
|
|
|
|
pub fn prep_symlinkat(
|
|
sqe: *Sqe,
|
|
target: [*:0]const u8,
|
|
new_dir_fd: linux.fd_t,
|
|
link_path: [*:0]const u8,
|
|
) void {
|
|
sqe.prep_rw(
|
|
.SYMLINKAT,
|
|
new_dir_fd,
|
|
@intFromPtr(target),
|
|
0,
|
|
@intFromPtr(link_path),
|
|
);
|
|
}
|
|
|
|
pub fn prep_linkat(
|
|
sqe: *Sqe,
|
|
old_dir_fd: linux.fd_t,
|
|
old_path: [*:0]const u8,
|
|
new_dir_fd: linux.fd_t,
|
|
new_path: [*:0]const u8,
|
|
flags: linux.AT, // only AT_EMPTY_PATH, AT_SYMLINK_FOLLOW
|
|
) void {
|
|
sqe.prep_rw(
|
|
.LINKAT,
|
|
old_dir_fd,
|
|
@intFromPtr(old_path),
|
|
0,
|
|
@intFromPtr(new_path),
|
|
);
|
|
sqe.len = @bitCast(new_dir_fd);
|
|
sqe.rw_flags = flags;
|
|
}
|
|
|
|
pub fn prep_files_update(
|
|
sqe: *Sqe,
|
|
fds: []const linux.fd_t,
|
|
offset: u32,
|
|
) void {
|
|
sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, @intCast(offset));
|
|
}
|
|
|
|
pub fn prep_files_update_alloc(
|
|
sqe: *Sqe,
|
|
fds: []linux.fd_t,
|
|
) void {
|
|
sqe.prep_rw(.FILES_UPDATE, -1, @intFromPtr(fds.ptr), fds.len, constants.FILE_INDEX_ALLOC);
|
|
}
|
|
|
|
// TODO: why can't slice be used here ?
|
|
pub fn prep_provide_buffers(
|
|
sqe: *Sqe,
|
|
buffers: [*]u8,
|
|
buffer_len: usize,
|
|
num: usize,
|
|
group_id: usize,
|
|
buffer_id: usize,
|
|
) void {
|
|
const ptr = @intFromPtr(buffers);
|
|
sqe.prep_rw(.PROVIDE_BUFFERS, @intCast(num), ptr, buffer_len, buffer_id);
|
|
sqe.buf_index = @intCast(group_id);
|
|
}
|
|
|
|
pub fn prep_remove_buffers(
|
|
sqe: *Sqe,
|
|
num: usize,
|
|
group_id: usize,
|
|
) void {
|
|
sqe.prep_rw(.REMOVE_BUFFERS, @intCast(num), 0, 0, 0);
|
|
sqe.buf_index = @intCast(group_id);
|
|
}
|
|
|
|
pub fn prep_socket(
|
|
sqe: *Sqe,
|
|
domain: linux.AF,
|
|
socket_type: linux.SOCK,
|
|
protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7
|
|
flags: u32, // flags is unused
|
|
) void {
|
|
sqe.prep_rw(.SOCKET, @intCast(domain), 0, protocol, socket_type);
|
|
sqe.rw_flags = flags;
|
|
}
|
|
|
|
pub fn prep_socket_direct(
|
|
sqe: *Sqe,
|
|
domain: linux.AF,
|
|
socket_type: linux.SOCK,
|
|
protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7
|
|
flags: u32, // flags is unused
|
|
file_index: u32,
|
|
) void {
|
|
prep_socket(sqe, domain, socket_type, protocol, flags);
|
|
set_target_fixed_file(sqe, file_index);
|
|
}
|
|
|
|
pub fn prep_socket_direct_alloc(
|
|
sqe: *Sqe,
|
|
domain: linux.AF,
|
|
socket_type: linux.SOCK,
|
|
protocol: u32, // Enumerate https://github.com/kraj/musl/blob/kraj/master/src/network/proto.c#L7
|
|
flags: u32, // flags is unused
|
|
) void {
|
|
prep_socket(sqe, domain, socket_type, protocol, flags);
|
|
set_target_fixed_file(sqe, constants.FILE_INDEX_ALLOC);
|
|
}
|
|
|
|
pub fn prep_waitid(
|
|
sqe: *Sqe,
|
|
id_type: linux.P,
|
|
id: i32,
|
|
infop: *linux.siginfo_t,
|
|
options: linux.W,
|
|
flags: u32, // flags is unused
|
|
) void {
|
|
sqe.prep_rw(.WAITID, id, 0, @intFromEnum(id_type), @intFromPtr(infop));
|
|
sqe.rw_flags = flags;
|
|
sqe.splice_fd_in = @bitCast(options);
|
|
}
|
|
|
|
// TODO: maybe remove unused flag fields?
|
|
pub fn prep_bind(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
addr: *const linux.sockaddr,
|
|
addrlen: linux.socklen_t,
|
|
flags: u32, // flags is unused and doesn't exist in io_uring's api
|
|
) void {
|
|
sqe.prep_rw(.BIND, fd, @intFromPtr(addr), 0, addrlen);
|
|
sqe.rw_flags = flags;
|
|
}
|
|
|
|
pub fn prep_listen(
|
|
sqe: *Sqe,
|
|
fd: linux.fd_t,
|
|
backlog: usize,
|
|
flags: u32, // flags is unused and doesn't exist in io_uring's api
|
|
) void {
|
|
sqe.prep_rw(.LISTEN, fd, 0, backlog, 0);
|
|
sqe.rw_flags = flags;
|
|
}
|
|
|
|
pub fn prep_cmd_sock(
|
|
sqe: *Sqe,
|
|
cmd_op: SocketOp,
|
|
fd: linux.fd_t,
|
|
level: linux.SOL,
|
|
optname: linux.SO,
|
|
optval: u64,
|
|
optlen: u32,
|
|
) void {
|
|
sqe.prep_rw(.URING_CMD, fd, 0, 0, 0);
|
|
// off is overloaded with cmd_op, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L39
|
|
sqe.off = @intFromEnum(cmd_op);
|
|
// addr is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L46
|
|
sqe.addr = @bitCast(packed struct {
|
|
level: u32,
|
|
optname: u32,
|
|
}{
|
|
.level = level,
|
|
.optname = optname,
|
|
});
|
|
// splice_fd_in if overloaded u32 -> i32
|
|
sqe.splice_fd_in = @bitCast(optlen);
|
|
// addr3 is overloaded, https://github.com/axboe/liburing/blob/e1003e496e66f9b0ae06674869795edf772d5500/src/include/liburing/io_uring.h#L102
|
|
sqe.addr3 = optval;
|
|
}
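
// Illustrative sketch only (the socket fd and option constants are assumptions,
// not part of this file): issuing a SETSOCKOPT through the URING_CMD opcode with
// the helper above. `optval` carries the userspace address of the option value
// and `optlen` its size.
//
//     var enable: u32 = 1;
//     const sqe = try ring.get_sqe();
//     sqe.prep_cmd_sock(.SETSOCKOPT, sockfd, linux.SOL.SOCKET, linux.SO.REUSEADDR,
//         @intFromPtr(&enable), @sizeOf(u32));
//     sqe.user_data = 0xaa;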
|
|
|
|
pub fn set_flags(sqe: *Sqe, flags: Sqe.IoSqe) void {
|
|
const updated_flags = @as(u8, @bitCast(sqe.flags)) | @as(u8, @bitCast(flags));
|
|
sqe.flags = @bitCast(updated_flags);
|
|
}
|
|
|
|
/// This SQE forms a link with the next SQE in the submission ring. Next SQE
|
|
/// will not be started before this one completes. Forms a chain of SQEs.
|
|
pub fn link_next(sqe: *Sqe) void {
|
|
sqe.flags.IO_LINK = true;
|
|
}
|
|
};
|
|
|
|
/// Filled with the offset for mmap(2)
|
|
/// matches io_sqring_offsets in liburing
|
|
pub const SqOffsets = extern struct {
|
|
/// offset of ring head
|
|
head: u32,
|
|
/// offset of ring tail
|
|
tail: u32,
|
|
/// ring mask value
|
|
ring_mask: u32,
|
|
/// entries in ring
|
|
ring_entries: u32,
|
|
/// ring flags
|
|
flags: u32,
|
|
/// number of sqes not submitted
|
|
dropped: u32,
|
|
/// sqe index array
|
|
array: u32,
|
|
resv1: u32,
|
|
user_addr: u64,
|
|
};
|
|
|
|
/// matches io_cqring_offsets in liburing
|
|
pub const CqOffsets = extern struct {
|
|
head: u32,
|
|
tail: u32,
|
|
ring_mask: u32,
|
|
ring_entries: u32,
|
|
overflow: u32,
|
|
cqes: u32,
|
|
flags: u32,
|
|
resv: u32,
|
|
user_addr: u64,
|
|
};
|
|
|
|
/// Passed in for io_uring_setup(2). Copied back with updated info on success
|
|
/// matches io_uring_params in liburing
|
|
pub const Params = extern struct {
|
|
sq_entries: u32,
|
|
cq_entries: u32,
|
|
flags: uflags.Setup,
|
|
sq_thread_cpu: u32,
|
|
sq_thread_idle: u32,
|
|
features: uflags.Features,
|
|
wq_fd: u32,
|
|
resv: [3]u32,
|
|
sq_off: SqOffsets,
|
|
cq_off: CqOffsets,
|
|
};
|
|
|
|
/// io_uring_register(2) opcodes and arguments
|
|
/// matches io_uring_register_op in liburing
|
|
pub const RegisterOp = enum(u8) {
|
|
REGISTER_BUFFERS,
|
|
UNREGISTER_BUFFERS,
|
|
REGISTER_FILES,
|
|
UNREGISTER_FILES,
|
|
REGISTER_EVENTFD,
|
|
UNREGISTER_EVENTFD,
|
|
REGISTER_FILES_UPDATE,
|
|
REGISTER_EVENTFD_ASYNC,
|
|
REGISTER_PROBE,
|
|
REGISTER_PERSONALITY,
|
|
UNREGISTER_PERSONALITY,
|
|
REGISTER_RESTRICTIONS,
|
|
REGISTER_ENABLE_RINGS,
|
|
|
|
// extended with tagging
|
|
REGISTER_FILES2,
|
|
REGISTER_FILES_UPDATE2,
|
|
REGISTER_BUFFERS2,
|
|
REGISTER_BUFFERS_UPDATE,
|
|
|
|
// set/clear io-wq thread affinities
|
|
REGISTER_IOWQ_AFF,
|
|
UNREGISTER_IOWQ_AFF,
|
|
|
|
// set/get max number of io-wq workers
|
|
REGISTER_IOWQ_MAX_WORKERS,
|
|
|
|
// register/unregister io_uring fd with the ring
|
|
REGISTER_RING_FDS,
|
|
UNREGISTER_RING_FDS,
|
|
|
|
// register ring based provide buffer group
|
|
REGISTER_PBUF_RING,
|
|
UNREGISTER_PBUF_RING,
|
|
|
|
// sync cancelation API
|
|
REGISTER_SYNC_CANCEL,
|
|
|
|
// register a range of fixed file slots for automatic slot allocation
|
|
REGISTER_FILE_ALLOC_RANGE,
|
|
|
|
// return status information for a buffer group
|
|
REGISTER_PBUF_STATUS,
|
|
|
|
// set/clear busy poll settings
|
|
REGISTER_NAPI,
|
|
UNREGISTER_NAPI,
|
|
|
|
REGISTER_CLOCK,
|
|
|
|
// clone registered buffers from source ring to current ring
|
|
REGISTER_CLONE_BUFFERS,
|
|
|
|
// send MSG_RING without having a ring
|
|
REGISTER_SEND_MSG_RING,
|
|
|
|
// register a netdev hw rx queue for zerocopy
|
|
REGISTER_ZCRX_IFQ,
|
|
|
|
// resize CQ ring
|
|
REGISTER_RESIZE_RINGS,
|
|
|
|
REGISTER_MEM_REGION,
|
|
|
|
// COMMIT: new register opcode
|
|
// query various aspects of io_uring, see linux/io_uring/query.h
|
|
REGISTER_QUERY,
|
|
|
|
_,
|
|
};
|
|
|
|
/// io-wq worker categories
|
|
/// matches io_wq_type in liburing
|
|
pub const IoWqCategory = enum(u8) {
|
|
BOUND,
|
|
UNBOUND,
|
|
};
|
|
|
|
// COMMIT: remove deprecated io_uring_rsrc_update struct
|
|
// deprecated, see struct io_uring_rsrc_update
|
|
|
|
// COMMIT: add new io_uring_region_desc struct
|
|
/// matches io_uring_region_desc in liburing
|
|
pub const RegionDesc = extern struct {
|
|
user_addr: u64,
|
|
size: u64,
|
|
flags: Flags,
|
|
id: u32,
|
|
mmap_offset: u64,
|
|
__resv: [4]u64,
|
|
|
|
// COMMIT: new constant
|
|
/// initialise with user provided memory pointed by user_addr
|
|
pub const Flags = packed struct(u32) {
|
|
TYPE_USER: bool = false,
|
|
_: u31 = 0,
|
|
};
|
|
};
|
|
|
|
// COMMIT: add new io_uring_mem_region_reg struct
|
|
/// matches io_uring_mem_region_reg in liburing
|
|
pub const MemRegionReg = extern struct {
|
|
/// struct io_uring_region_desc (RegionDesc in Zig)
|
|
region_uptr: u64,
|
|
flags: Flags,
|
|
__resv: [2]u64,
|
|
|
|
/// expose the region as registered wait arguments
|
|
pub const Flags = packed struct(u64) {
|
|
REG_WAIT_ARG: bool = false,
|
|
_: u63 = 0,
|
|
};
|
|
};
|
|
|
|
/// matches io_uring_rsrc_register in liburing
|
|
pub const RsrcRegister = extern struct {
|
|
nr: u32,
|
|
flags: u32,
|
|
resv2: u64,
|
|
data: u64,
|
|
tags: u64,
|
|
};
|
|
|
|
/// matches io_uring_rsrc_update in liburing
|
|
pub const RsrcUpdate = extern struct {
|
|
offset: u32,
|
|
resv: u32,
|
|
data: u64,
|
|
};
|
|
|
|
/// matches io_uring_rsrc_update2 in liburing
|
|
pub const RsrcUpdate2 = extern struct {
|
|
offset: u32,
|
|
resv: u32,
|
|
data: u64,
|
|
tags: u64,
|
|
nr: u32,
|
|
resv2: u32,
|
|
};
|
|
|
|
/// matches io_uring_probe_op in liburing
|
|
pub const ProbeOp = extern struct {
|
|
op: Op,
|
|
resv: u8,
|
|
flags: Flags,
|
|
resv2: u32,
|
|
|
|
pub const Flags = packed struct(u16) {
|
|
OP_SUPPORTED: bool = false,
|
|
_: u15 = 0,
|
|
};
|
|
|
|
pub fn is_supported(self: ProbeOp) bool {
|
|
return self.flags.OP_SUPPORTED;
|
|
}
|
|
};
|
|
|
|
/// matches io_uring_probe in liburing
|
|
pub const Probe = extern struct {
|
|
/// Last opcode supported
|
|
last_op: Op,
|
|
/// Length of ops[] array below
|
|
ops_len: u8,
|
|
resv: u16,
|
|
resv2: [3]u32,
|
|
ops: [256]ProbeOp,
|
|
|
|
/// Is the operation supported on the running kernel.
|
|
pub fn is_supported(self: @This(), op: Op) bool {
|
|
const i = @intFromEnum(op);
|
|
if (i > @intFromEnum(self.last_op) or i >= self.ops_len)
|
|
return false;
|
|
return self.ops[i].is_supported();
|
|
}
|
|
};
|
|
|
|
// COMMIT: fix definition of io_uring_restriction
|
|
// RegisterOp is actually u8
|
|
/// matches io_uring_restriction in liburing
|
|
pub const Restriction = extern struct {
|
|
opcode: RestrictionOp,
|
|
arg: extern union {
|
|
/// IORING_RESTRICTION_REGISTER_OP
|
|
register_op: RegisterOp,
|
|
/// IORING_RESTRICTION_SQE_OP
|
|
sqe_op: Op,
|
|
/// IORING_RESTRICTION_SQE_FLAGS_*
|
|
sqe_flags: u8,
|
|
},
|
|
resv: u8,
|
|
resv2: [3]u32,
|
|
};
|
|
|
|
// COMMIT: add new struct type
|
|
/// matches io_uring_clock_register in liburing
|
|
pub const ClockRegister = extern struct {
|
|
clockid: u32,
|
|
__resv: [3]u32,
|
|
};
|
|
|
|
// COMMIT: add new struct type
|
|
/// matches io_uring_clone_buffers in liburing
|
|
pub const CloneBuffers = extern struct {
|
|
src_fd: u32,
|
|
flags: Flags,
|
|
src_off: u32,
|
|
dst_off: u32,
|
|
nr: u32,
|
|
pad: [3]u32,
|
|
|
|
// COMMIT: new flags
|
|
pub const Flags = packed struct(u32) {
|
|
REGISTER_SRC_REGISTERED: bool = false,
|
|
REGISTER_DST_REPLACE: bool = false,
|
|
_: u30 = 0,
|
|
};
|
|
};
|
|
|
|
/// matches io_uring_buf in liburing
|
|
pub const Buffer = extern struct {
|
|
addr: u64,
|
|
len: u32,
|
|
bid: u16,
|
|
resv: u16,
|
|
};
|
|
|
|
/// matches io_uring_buf_ring in liburing
|
|
pub const BufferRing = extern struct {
|
|
resv1: u64,
|
|
resv2: u32,
|
|
resv3: u16,
|
|
tail: u16,
|
|
};
|
|
|
|
/// argument for IORING_(UN)REGISTER_PBUF_RING
|
|
/// matches io_uring_buf_reg in liburing
|
|
pub const BufferRegister = extern struct {
|
|
ring_addr: u64,
|
|
ring_entries: u32,
|
|
bgid: u16,
|
|
flags: Flags,
|
|
resv: [3]u64,
|
|
|
|
// COMMIT: new IORING_REGISTER_PBUF_RING flags
|
|
/// Flags for IORING_REGISTER_PBUF_RING.
|
|
pub const Flags = packed struct(u16) {
|
|
/// IOU_PBUF_RING_MMAP:
|
|
/// If set, kernel will allocate the memory for the ring.
|
|
/// The application must not set a ring_addr in struct io_uring_buf_reg
|
|
/// instead it must subsequently call mmap(2) with the offset set
|
|
/// as: IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) to get
|
|
/// a virtual mapping for the ring.
|
|
IOU_PBUF_RING_MMAP: bool = false,
|
|
/// IOU_PBUF_RING_INC:
|
|
/// If set, buffers consumed from this buffer ring can be
|
|
/// consumed incrementally. Normally one (or more) buffers
|
|
/// are fully consumed. With incremental consumptions, it's
|
|
/// feasible to register big ranges of buffers, and each
|
|
/// use of it will consume only as much as it needs. This
|
|
/// requires that both the kernel and application keep
|
|
/// track of where the current read/recv index is at.
|
|
IOU_PBUF_RING_INC: bool = false,
|
|
_: u14 = 0,
|
|
};
|
|
};
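
// Worked example (a sketch) of the mmap offset described for IOU_PBUF_RING_MMAP
// above, built from the constants in this file; the buffer group id is
// illustrative:
//
//     const bgid: u64 = 7;
//     const offset: u64 = constants.OFF_PBUF_RING | (bgid << constants.OFF_PBUF_SHIFT);
//     // `offset` is then passed as the offset argument of the mmap(2) that maps the ring.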
|
|
|
|
/// argument for IORING_REGISTER_PBUF_STATUS
|
|
/// matches io_uring_buf_status in liburing
|
|
pub const BufferStatus = extern struct {
|
|
/// input
|
|
buf_group: u32,
|
|
/// output
|
|
head: u32,
|
|
resv: [8]u32,
|
|
};
|
|
|
|
/// argument for IORING_(UN)REGISTER_NAPI
|
|
/// matches io_uring_napi in liburing
|
|
pub const Napi = extern struct {
|
|
busy_poll_to: u32,
|
|
prefer_busy_poll: u8,
|
|
pad: [3]u8,
|
|
resv: u64,
|
|
};
|
|
|
|
// COMMIT: new struct type
|
|
/// Argument for io_uring_enter(2) with
|
|
/// IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
|
|
/// is an index into a previously registered fixed wait region described by
|
|
/// the below structure.
|
|
/// matches io_uring_reg_wait in liburing
|
|
pub const RegisterWait = extern struct {
|
|
ts: linux.kernel_timespec,
|
|
min_wait_usec: u32,
|
|
flags: Flags,
|
|
sigmask: u64,
|
|
sigmask_sz: u32,
|
|
pad: [3]u32,
|
|
pad2: [2]u64,
|
|
|
|
// COMMIT: new constant
|
|
pub const Flags = packed struct(u32) {
|
|
REG_WAIT_TS: bool = false,
|
|
_: u31 = 0,
|
|
};
|
|
};
|
|
|
|
/// Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG
|
|
/// matches io_uring_getevents_arg in liburing
|
|
pub const GetEventsArg = extern struct {
|
|
sigmask: u64,
|
|
sigmask_sz: u32,
|
|
pad: u32,
|
|
ts: u64,
|
|
};
|
|
|
|
// COMMIT: fix type definition of io_uring_sync_cancel_reg
|
|
/// Argument for IORING_REGISTER_SYNC_CANCEL
|
|
/// matches io_uring_sync_cancel_reg in liburing
|
|
pub const SyncCancelRegister = extern struct {
|
|
addr: u64,
|
|
fd: i32,
|
|
flags: uflags.AsyncCancel,
|
|
timeout: linux.kernel_timespec,
|
|
opcode: Op,
|
|
pad: [7]u8,
|
|
pad2: [4]u64,
|
|
};
|
|
|
|
/// Argument for IORING_REGISTER_FILE_ALLOC_RANGE
|
|
/// The range is specified as [off, off + len)
|
|
/// matches io_uring_file_index_range in liburing
|
|
pub const FileIndexRange = extern struct {
|
|
off: u32,
|
|
len: u32,
|
|
resv: u64,
|
|
};
|
|
|
|
/// matches io_uring_recvmsg_out in liburing
|
|
pub const RecvmsgOut = extern struct {
|
|
namelen: u32,
|
|
controllen: u32,
|
|
payloadlen: u32,
|
|
flags: u32,
|
|
};
|
|
|
|
/// Zero copy receive refill queue entry
|
|
/// matches io_uring_zcrx_rqe in liburing
|
|
pub const ZcrxRqe = extern struct {
|
|
off: u64,
|
|
len: u32,
|
|
__pad: u32,
|
|
};
|
|
|
|
/// matches io_uring_zcrx_cqe in liburing
|
|
pub const ZcrxCqe = extern struct {
|
|
off: u64,
|
|
__pad: u64,
|
|
};
|
|
|
|
/// matches io_uring_zcrx_offsets in liburing
|
|
pub const ZcrxOffsets = extern struct {
|
|
head: u32,
|
|
tail: u32,
|
|
rqes: u32,
|
|
__resv2: u32,
|
|
__resv: [2]u64,
|
|
};
|
|
|
|
/// matches io_uring_zcrx_area_reg in liburing
|
|
pub const ZcrxAreaRegister = extern struct {
|
|
addr: u64,
|
|
len: u64,
|
|
rq_area_token: u64,
|
|
flags: Flags,
|
|
dmabuf_fd: u32,
|
|
__resv2: [2]u64,
|
|
|
|
pub const Flags = packed struct(u32) {
|
|
DMABUF: bool = false,
|
|
_: u31 = 0,
|
|
};
|
|
};
|
|
|
|
/// Argument for IORING_REGISTER_ZCRX_IFQ
|
|
/// matches io_uring_zcrx_ifq_reg in liburing
|
|
pub const ZcrxIfqRegister = extern struct {
|
|
if_idx: u32,
|
|
if_rxq: u32,
|
|
rq_entries: u32,
|
|
// TODO: find out its flags, I suspect it's ZcrxAreaRegister.Flags
|
|
flags: u32,
|
|
/// pointer to struct io_uring_zcrx_area_reg
|
|
area_ptr: u64,
|
|
/// struct io_uring_region_desc
|
|
region_ptr: u64,
|
|
offsets: ZcrxOffsets,
|
|
zcrx_id: u32,
|
|
__resv2: u32,
|
|
__resv: [3]u64,
|
|
};
|
|
|
|
pub const SocketOp = enum(u16) {
|
|
SIOCIN,
|
|
SIOCOUTQ,
|
|
GETSOCKOPT,
|
|
SETSOCKOPT,
|
|
// COMMIT: new socket op
|
|
TX_TIMESTAMP,
|
|
};
|
|
|
|
/// io_uring_restriction.opcode values
|
|
/// matches io_uring_register_restriction_op in liburing
|
|
pub const RestrictionOp = enum(u16) {
|
|
/// Allow an io_uring_register(2) opcode
|
|
REGISTER_OP = 0,
|
|
/// Allow an sqe opcode
|
|
SQE_OP = 1,
|
|
/// Allow sqe flags
|
|
SQE_FLAGS_ALLOWED = 2,
|
|
/// Require sqe flags (these flags must be set on each submission)
|
|
SQE_FLAGS_REQUIRED = 3,
|
|
|
|
_,
|
|
};
|
|
|
|
/// IORING_OP_MSG_RING command types, stored in sqe.addr
|
|
pub const MsgRingCmd = enum {
|
|
/// pass sqe->len as 'res' and off as user_data
|
|
DATA,
|
|
/// send a registered fd to another ring
|
|
SEND_FD,
|
|
};
|
|
|
|
// COMMIT: OP to IoUring
|
|
pub const Op = enum(u8) {
|
|
NOP,
|
|
READV,
|
|
WRITEV,
|
|
FSYNC,
|
|
READ_FIXED,
|
|
WRITE_FIXED,
|
|
POLL_ADD,
|
|
POLL_REMOVE,
|
|
SYNC_FILE_RANGE,
|
|
SENDMSG,
|
|
RECVMSG,
|
|
TIMEOUT,
|
|
TIMEOUT_REMOVE,
|
|
ACCEPT,
|
|
ASYNC_CANCEL,
|
|
LINK_TIMEOUT,
|
|
CONNECT,
|
|
FALLOCATE,
|
|
OPENAT,
|
|
CLOSE,
|
|
FILES_UPDATE,
|
|
STATX,
|
|
READ,
|
|
WRITE,
|
|
FADVISE,
|
|
MADVISE,
|
|
SEND,
|
|
RECV,
|
|
EPOLL_CTL,
|
|
OPENAT2,
|
|
SPLICE,
|
|
PROVIDE_BUFFERS,
|
|
REMOVE_BUFFERS,
|
|
TEE,
|
|
SHUTDOWN,
|
|
RENAMEAT,
|
|
UNLINKAT,
|
|
MKDIRAT,
|
|
SYMLINKAT,
|
|
LINKAT,
|
|
MSG_RING,
|
|
FSETXATTR,
|
|
SETXATTR,
|
|
FGETXATTR,
|
|
GETXATTR,
|
|
SOCKET,
|
|
URING_CMD,
|
|
SEND_ZC,
|
|
SENDMSG_ZC,
|
|
READ_MULTISHOT,
|
|
WAITID,
|
|
FUTEX_WAIT,
|
|
FUTEX_WAKE,
|
|
FUTEX_WAITV,
|
|
FIXED_FD_INSTALL,
|
|
FTRUNCATE,
|
|
BIND,
|
|
LISTEN,
|
|
RECV_ZC,
|
|
// COMMIT: new OPs
|
|
// TODO: to be implemented
|
|
EPOLL_WAIT,
|
|
READV_FIXED,
|
|
WRITEV_FIXED,
|
|
PIPE,
|
|
|
|
_,
|
|
};
|
|
|
|
/// A friendly way to setup an io_uring, with default linux.io_uring_params.
|
|
/// `entries` must be a power of two between 1 and 32768, although the kernel will make the final
|
|
/// call on how many entries the submission and completion queues will ultimately have,
|
|
/// see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L8027-L8050.
|
|
/// Matches the interface of io_uring_queue_init() in liburing.
|
|
pub fn init(entries: u16, flags: uflags.Setup) !IoUring {
|
|
var params = mem.zeroInit(Params, .{
|
|
.flags = flags,
|
|
.sq_thread_idle = 1000,
|
|
});
|
|
return try .init_params(entries, ¶ms);
|
|
}
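
// A minimal end-to-end sketch of the API in this file (error handling reduced to
// `try`; assumes a kernel with io_uring support):
//
//     var ring: IoUring = try .init(8, .{});
//     defer ring.deinit();
//
//     const sqe = try ring.get_sqe();
//     sqe.prep_nop();
//     sqe.user_data = 42;
//
//     // Submit the queued SQE and wait for at least one completion.
//     _ = try ring.submit_and_wait(1);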
|
|
|
|
/// A powerful way to setup an io_uring, if you want to tweak linux.io_uring_params such as submission
|
|
/// queue thread cpu affinity or thread idle timeout (the kernel and our default is 1 second).
|
|
/// `params` is passed by reference because the kernel needs to modify the parameters.
|
|
/// Matches the interface of io_uring_queue_init_params() in liburing.
|
|
pub fn init_params(entries: u16, p: *Params) !IoUring {
|
|
if (entries == 0) return error.EntriesZero;
|
|
if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo;
|
|
assert(p.sq_entries == 0);
|
|
assert(@as(u32, @bitCast(p.features)) == 0);
|
|
assert(p.resv[0] == 0);
|
|
assert(p.resv[1] == 0);
|
|
assert(p.resv[2] == 0);
|
|
|
|
const flags: uflags.Setup = @bitCast(p.flags);
|
|
assert(p.cq_entries == 0 or flags.CQSIZE);
|
|
assert(p.wq_fd == 0 or flags.ATTACH_WQ);
|
|
|
|
// flags compatibility
|
|
assert(!flags.SQPOLL or !(flags.COOP_TASKRUN or flags.TASKRUN_FLAG or flags.DEFER_TASKRUN));
assert(!flags.SQ_AFF or flags.SQPOLL);
assert(!flags.DEFER_TASKRUN or flags.SINGLE_ISSUER);
|
|
|
|
const res = linux.io_uring_setup(entries, p);
|
|
switch (linux.errno(res)) {
|
|
.SUCCESS => {},
|
|
.FAULT => return error.ParamsOutsideAccessibleAddressSpace,
|
|
// The resv array contains non-zero data, p.flags contains an unsupported flag,
|
|
// entries out of bounds, IORING_SETUP_SQ_AFF was specified without IORING_SETUP_SQPOLL,
|
|
// or IORING_SETUP_CQSIZE was specified but linux.io_uring_params.cq_entries was invalid:
|
|
.INVAL => return error.ArgumentsInvalid,
|
|
.MFILE => return error.ProcessFdQuotaExceeded,
|
|
.NFILE => return error.SystemFdQuotaExceeded,
|
|
.NOMEM => return error.SystemResources,
|
|
// IORING_SETUP_SQPOLL was specified but effective user ID lacks sufficient privileges,
|
|
// or a container seccomp policy prohibits io_uring syscalls:
|
|
.PERM => return error.PermissionDenied,
|
|
.NOSYS => return error.SystemOutdated,
|
|
else => |errno| return posix.unexpectedErrno(errno),
|
|
}
|
|
const fd = @as(linux.fd_t, @intCast(res));
|
|
assert(fd >= 0);
|
|
errdefer posix.close(fd);
|
|
|
|
const features: uflags.Features = @bitCast(p.features);
|
|
// Kernel versions 5.4 and up use only one mmap() for the submission and completion queues.
|
|
// This is not an optional feature for us... if the kernel does it, we have to do it.
|
|
// The thinking on this by the kernel developers was that both the submission and the
|
|
// completion queue rings have sizes just over a power of two, but the submission queue ring
|
|
// is significantly smaller with u32 slots. By bundling both in a single mmap, the kernel
|
|
// gets the submission queue ring for free.
|
|
// See https://patchwork.kernel.org/patch/11115257 for the kernel patch.
|
|
// We do not support the double mmap() done before 5.4, because we want to keep the
|
|
// init/deinit mmap paths simple and because io_uring has had many bug fixes even since 5.4.
|
|
if (!features.SINGLE_MMAP) {
|
|
return error.SystemOutdated;
|
|
}
|
|
|
|
// Check that the kernel has actually set params and that "impossible is nothing".
|
|
assert(p.sq_entries != 0);
|
|
assert(p.cq_entries != 0);
|
|
assert(p.cq_entries >= p.sq_entries);
|
|
|
|
// From here on, we only need to read from params, so pass `p` by value as immutable.
|
|
// The completion queue shares the mmap with the submission queue, so pass `sq` there too.
|
|
var sq = try Sq.init(fd, p.*);
|
|
errdefer sq.deinit();
|
|
var cq = try Cq.init(fd, p.*, sq);
|
|
errdefer cq.deinit();
|
|
|
|
// Check that our starting state is as we expect.
|
|
assert(sq.head.* == 0);
|
|
assert(sq.tail.* == 0);
|
|
assert(sq.mask == p.sq_entries - 1);
|
|
// Allow flags.* to be non-zero, since the kernel may set IORING_SQ_NEED_WAKEUP at any time.
|
|
assert(sq.dropped.* == 0);
|
|
assert(sq.array.len == p.sq_entries);
|
|
assert(sq.sqes.len == p.sq_entries);
|
|
assert(sq.sqe_head == 0);
|
|
assert(sq.sqe_tail == 0);
|
|
|
|
assert(cq.head.* == 0);
|
|
assert(cq.tail.* == 0);
|
|
assert(cq.mask == p.cq_entries - 1);
|
|
assert(cq.overflow.* == 0);
|
|
assert(cq.cqes.len == p.cq_entries);
|
|
|
|
return .{
|
|
.fd = fd,
|
|
.sq = sq,
|
|
.cq = cq,
|
|
.flags = flags,
|
|
.features = features,
|
|
};
|
|
}
|
|
|
|
pub fn deinit(self: *IoUring) void {
|
|
assert(self.fd >= 0);
|
|
// The mmaps depend on the fd, so the order of these calls is important:
|
|
self.cq.deinit();
|
|
self.sq.deinit();
|
|
posix.close(self.fd);
|
|
self.fd = -1;
|
|
}
|
|
|
|
/// Returns a pointer to a vacant SQE, or an error if the submission queue is full.
|
|
/// We follow the implementation (and atomics) of liburing's `io_uring_get_sqe()` exactly.
|
|
/// However, instead of a null we return an error to force safe handling.
|
|
/// Any situation where the submission queue is full tends more towards a control flow error,
|
|
/// and the null return in liburing is more a C idiom than anything else, for lack of a better
|
|
/// alternative. In Zig, we have first-class error handling... so let's use it.
|
|
/// Matches the implementation of io_uring_get_sqe() in liburing.
|
|
pub fn get_sqe(self: *IoUring) !*Sqe {
|
|
const head = @atomicLoad(u32, self.sq.head, .acquire);
|
|
// Remember that these head and tail offsets wrap around every four billion operations.
|
|
// We must therefore use wrapping addition and subtraction to avoid a runtime crash.
|
|
const next = self.sq.sqe_tail +% 1;
|
|
if (next -% head > self.sq.sqes.len) return error.SubmissionQueueFull;
|
|
const sqe = &self.sq.sqes[self.sq.sqe_tail & self.sq.mask];
|
|
self.sq.sqe_tail = next;
|
|
return sqe;
|
|
}
|
|
|
|
/// Submits the SQEs acquired via get_sqe() to the kernel. You can call this once after you have
|
|
/// called get_sqe() multiple times to setup multiple I/O requests.
|
|
/// Returns the number of SQEs submitted, if not used alongside IORING_SETUP_SQPOLL.
|
|
/// If the io_uring instance uses IORING_SETUP_SQPOLL, the value returned on success is not
|
|
/// guaranteed to match the amount of actually submitted sqes during this call. A value higher
|
|
/// or lower, including 0, may be returned.
|
|
/// Matches the implementation of io_uring_submit() in liburing.
|
|
pub fn submit(self: *IoUring) !u32 {
|
|
return self.submit_and_wait(0);
|
|
}
|
|
|
|
/// Like submit(), but allows waiting for events as well.
|
|
/// Returns the number of SQEs submitted.
|
|
/// Matches the implementation of io_uring_submit_and_wait() in liburing.
|
|
pub fn submit_and_wait(self: *IoUring, wait_nr: u32) !u32 {
|
|
const submitted = self.flush_sq();
|
|
var flags: uflags.Enter = .{};
|
|
if (self.sq_ring_needs_enter(&flags) or wait_nr > 0) {
|
|
if (wait_nr > 0 or self.flags.IOPOLL) {
|
|
flags.GETEVENTS = true;
|
|
}
|
|
return try self.enter(submitted, wait_nr, flags);
|
|
}
|
|
return submitted;
|
|
}
|
|
|
|
/// Tell the kernel we have submitted SQEs and/or want to wait for CQEs.
|
|
/// Returns the number of SQEs submitted.
|
|
pub fn enter(self: *IoUring, to_submit: u32, min_complete: u32, flags: uflags.Enter) !u32 {
|
|
assert(self.fd >= 0);
|
|
const res = linux.io_uring_enter(self.fd, to_submit, min_complete, flags, null);
|
|
switch (linux.errno(res)) {
|
|
.SUCCESS => {},
|
|
// The kernel was unable to allocate memory or ran out of resources for the request.
|
|
// The application should wait for some completions and try again:
|
|
.AGAIN => return error.SystemResources,
|
|
// The SQE `fd` is invalid, or IOSQE_FIXED_FILE was set but no files were registered:
|
|
.BADF => return error.FileDescriptorInvalid,
|
|
// The file descriptor is valid, but the ring is not in the right state.
|
|
// See io_uring_register(2) for how to enable the ring.
|
|
.BADFD => return error.FileDescriptorInBadState,
|
|
// The application attempted to overcommit the number of requests it can have pending.
|
|
// The application should wait for some completions and try again:
|
|
.BUSY => return error.CompletionQueueOvercommitted,
|
|
// The SQE is invalid, or valid but the ring was set up with IORING_SETUP_IOPOLL:
|
|
.INVAL => return error.SubmissionQueueEntryInvalid,
|
|
// The buffer is outside the process' accessible address space, or IORING_OP_READ_FIXED
|
|
// or IORING_OP_WRITE_FIXED was specified but no buffers were registered, or the range
|
|
// described by `addr` and `len` is not within the buffer registered at `buf_index`:
|
|
.FAULT => return error.BufferInvalid,
|
|
.NXIO => return error.RingShuttingDown,
|
|
// The kernel believes our `self.fd` does not refer to an io_uring instance,
|
|
// or the opcode is valid but not supported by this kernel (more likely):
|
|
.OPNOTSUPP => return error.OpcodeNotSupported,
|
|
// The operation was interrupted by a delivery of a signal before it could complete.
|
|
// This can happen while waiting for events with IORING_ENTER_GETEVENTS:
|
|
.INTR => return error.SignalInterrupt,
|
|
else => |errno| return posix.unexpectedErrno(errno),
|
|
}
|
|
return @as(u32, @intCast(res));
|
|
}
|
|
|
|
/// Sync internal state with kernel ring state on the SQ side.
|
|
/// Returns the number of all pending events in the SQ ring, for the shared ring.
|
|
/// This return value includes previously flushed SQEs, as per liburing.
|
|
/// The rationale is to suggest that an io_uring_enter() call is needed rather than not.
|
|
/// Matches the implementation of __io_uring_flush_sq() in liburing.
|
|
pub fn flush_sq(self: *IoUring) u32 {
|
|
if (self.sq.sqe_head != self.sq.sqe_tail) {
|
|
// Fill in SQEs that we have queued up, adding them to the kernel ring.
|
|
const to_submit = self.sq.sqe_tail -% self.sq.sqe_head;
|
|
var tail = self.sq.tail.*;
|
|
var i: usize = 0;
|
|
while (i < to_submit) : (i += 1) {
|
|
self.sq.array[tail & self.sq.mask] = self.sq.sqe_head & self.sq.mask;
|
|
tail +%= 1;
|
|
self.sq.sqe_head +%= 1;
|
|
}
|
|
// Ensure that the kernel can actually see the SQE updates when it sees the tail update.
|
|
@atomicStore(u32, self.sq.tail, tail, .release);
|
|
}
|
|
return self.sq_ready();
|
|
}
|
|
|
|
/// Returns true if we are not using an SQ thread (thus nobody submits but us),
|
|
/// or if IORING_SQ_NEED_WAKEUP is set and the SQ thread must be explicitly awakened.
|
|
/// For the latter case, we set the SQ thread wakeup flag.
|
|
/// Matches the implementation of sq_ring_needs_enter() in liburing.
|
|
pub fn sq_ring_needs_enter(self: *IoUring, flags: *uflags.Enter) bool {
|
|
assert(@as(u32, @bitCast(flags.*)) == 0);
|
|
if (!self.flags.SQPOLL) return true;
|
|
if (@atomicLoad(Sq.Flags, self.sq.flags, .unordered).NEED_WAKEUP) {
|
|
flags.*.SQ_WAKEUP = true;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/// Returns the number of flushed and unflushed SQEs pending in the submission queue.
|
|
/// In other words, this is the number of SQEs in the submission queue, i.e. its length.
|
|
/// These are SQEs that the kernel is yet to consume.
|
|
/// Matches the implementation of io_uring_sq_ready in liburing.
|
|
pub fn sq_ready(self: *IoUring) u32 {
|
|
// Always use the shared ring state (i.e. head and not sqe_head) to avoid going out of sync,
|
|
// see https://github.com/axboe/liburing/issues/92.
|
|
return self.sq.sqe_tail -% @atomicLoad(u32, self.sq.head, .acquire);
|
|
}
|
|
|
|
/// Returns the number of CQEs in the completion queue, i.e. its length.
|
|
/// These are CQEs that the application is yet to consume.
|
|
/// Matches the implementation of io_uring_cq_ready in liburing.
|
|
pub fn cq_ready(self: *IoUring) u32 {
|
|
return @atomicLoad(u32, self.cq.tail, .acquire) -% self.cq.head.*;
|
|
}
|
|
|
|
/// Copies as many CQEs as are ready, and that can fit into the destination `cqes` slice.
|
|
/// If none are available, enters into the kernel to wait for at most `wait_nr` CQEs.
|
|
/// Returns the number of CQEs copied, advancing the CQ ring.
|
|
/// Provides all the wait/peek methods found in liburing, but with batching and a single method.
|
|
/// The rationale for copying CQEs rather than copying pointers is that pointers are 8 bytes
|
|
/// whereas CQEs are not much more at only 16 bytes, and this provides a safer, faster interface.
|
|
/// Safer, because you no longer need to call cqe_seen(), avoiding idempotency bugs.
|
|
/// Faster, because we can now amortize the atomic store release to `cq.head` across the batch.
|
|
/// See https://github.com/axboe/liburing/issues/103#issuecomment-686665007.
|
|
/// Matches the implementation of io_uring_peek_batch_cqe() in liburing, but supports waiting.
|
|
pub fn copy_cqes(self: *IoUring, cqes: []Cqe, wait_nr: u32) !u32 {
|
|
const count = self.copy_cqes_ready(cqes);
|
|
if (count > 0) return count;
|
|
if (self.cq_ring_needs_flush() or wait_nr > 0) {
|
|
_ = try self.enter(0, wait_nr, .{ .GETEVENTS = true });
|
|
return self.copy_cqes_ready(cqes);
|
|
}
|
|
return 0;
|
|
}
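// A minimal sketch of draining completions in batches, assuming `ring` is an initialized
// IoUring; copy_cqes() advances the CQ ring for you, so no cqe_seen() call is needed.
//
//     var cqes: [16]Cqe = undefined;
//     const n = try ring.copy_cqes(&cqes, 1); // wait for at least one CQE
//     for (cqes[0..n]) |cqe| {
//         // Dispatch on cqe.user_data; cqe.res holds the result or a negative errno.
//     }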
|
|
|
|
fn copy_cqes_ready(self: *IoUring, cqes: []Cqe) u32 {
|
|
const ready = self.cq_ready();
|
|
const count = @min(cqes.len, ready);
|
|
const head = self.cq.head.* & self.cq.mask;
|
|
|
|
// before wrapping
|
|
const n = @min(self.cq.cqes.len - head, count);
|
|
@memcpy(cqes[0..n], self.cq.cqes[head..][0..n]);
|
|
|
|
if (count > n) {
|
|
// wrap self.cq.cqes
|
|
const w = count - n;
|
|
@memcpy(cqes[n..][0..w], self.cq.cqes[0..w]);
|
|
}
|
|
|
|
self.cq_advance(count);
|
|
return count;
|
|
}
|
|
|
|
/// Returns a copy of an I/O completion, waiting for it if necessary, and advancing the CQ ring.
|
|
/// A convenience method for `copy_cqes()` for when you don't need to batch or peek.
|
|
pub fn copy_cqe(ring: *IoUring) !Cqe {
|
|
var cqes: [1]Cqe = undefined;
|
|
while (true) {
|
|
const count = try ring.copy_cqes(&cqes, 1);
|
|
if (count > 0) return cqes[0];
|
|
}
|
|
}
|
|
|
|
/// Matches the implementation of cq_ring_needs_flush() in liburing.
|
|
pub fn cq_ring_needs_flush(self: *IoUring) bool {
|
|
return @atomicLoad(Sq.Flags, self.sq.flags, .unordered).CQ_OVERFLOW;
|
|
}
|
|
|
|
/// Only for advanced use cases that implement custom completion queue methods.
|
|
/// If you use copy_cqes() or copy_cqe() you must not call cqe_seen() or cq_advance().
|
|
/// Must be called exactly once after a zero-copy CQE has been processed by your application.
|
|
/// Not idempotent, calling more than once will result in other CQEs being lost.
|
|
/// Matches the implementation of cqe_seen() in liburing.
|
|
pub fn cqe_seen(self: *IoUring, cqe: *Cqe) void {
|
|
_ = cqe;
|
|
self.cq_advance(1);
|
|
}
|
|
|
|
/// Only for advanced use cases that implement custom completion queue methods.
|
|
/// Matches the implementation of cq_advance() in liburing.
|
|
pub fn cq_advance(self: *IoUring, count: u32) void {
|
|
if (count > 0) {
|
|
// Ensure the kernel only sees the new head value after the CQEs have been read.
|
|
@atomicStore(u32, self.cq.head, self.cq.head.* +% count, .release);
|
|
}
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform an `fsync(2)`.
|
|
/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
|
|
/// For example, for `fdatasync()` you can set `IORING_FSYNC_DATASYNC` in the SQE's `rw_flags`.
|
|
/// N.B. While SQEs are initiated in the order in which they appear in the submission queue,
|
|
/// operations execute in parallel and completions are unordered. Therefore, an application that
|
|
/// submits a write followed by an fsync in the submission queue cannot expect the fsync to
|
|
/// apply to the write, since the fsync may complete before the write is issued to the disk.
|
|
/// You should preferably use `link_with_next_sqe()` on a write's SQE to link it with an fsync,
|
|
/// or else insert a full write barrier using `drain_previous_sqes()` when queueing an fsync.
|
|
pub fn fsync(self: *IoUring, user_data: u64, fd: posix.fd_t, flags: uflags.Fsync) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_fsync(fd, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
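// Sketch of ordering a write before an fsync, as recommended above. It assumes
// `link_with_next_sqe()` marks the write's SQE as linked to the fsync that follows it,
// and that an empty `.{}` is the default Fsync flags value.
//
//     const write_sqe = try ring.write(0x1, fd, data, 0);
//     write_sqe.link_with_next_sqe();
//     _ = try ring.fsync(0x2, fd, .{});
//     _ = try ring.submit();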
|
|
|
|
/// Queues (but does not submit) an SQE to perform a no-op.
|
|
/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
|
|
/// A no-op is more useful than it may appear at first glance.
|
|
/// For example, you could call `drain_previous_sqes()` on the returned SQE, to use the no-op to
|
|
/// know when the ring is idle before acting on a kill signal.
|
|
pub fn nop(self: *IoUring, user_data: u64) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_nop();
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Used to select how the read should be handled.
|
|
pub const ReadBuffer = union(enum) {
|
|
/// io_uring will read directly into this buffer
|
|
buffer: []u8,
|
|
|
|
/// io_uring will read directly into these buffers using readv.
|
|
iovecs: []const posix.iovec,
|
|
|
|
/// io_uring will select a buffer that has previously been provided with `provide_buffers`.
|
|
/// The buffer group referenced by `group_id` must contain at least one buffer for the read to work.
|
|
/// `len` controls the number of bytes to read into the selected buffer.
|
|
buffer_selection: struct {
|
|
group_id: u16,
|
|
len: usize,
|
|
},
|
|
};
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `read(2)` or `preadv(2)` depending on the buffer type.
|
|
/// * Reading into a `ReadBuffer.buffer` uses `read(2)`
|
|
/// * Reading into a `ReadBuffer.iovecs` uses `preadv(2)`
|
|
/// If you want to do a `preadv2(2)` then set `rw_flags` on the returned SQE. See https://man7.org/linux/man-pages/man2/preadv2.2.html
|
|
///
|
|
/// Returns a pointer to the SQE.
|
|
pub fn read(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
buffer: ReadBuffer,
|
|
offset: u64,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
switch (buffer) {
|
|
.buffer => |slice| sqe.prep_read(fd, slice, offset),
|
|
.iovecs => |vecs| sqe.prep_readv(fd, vecs, offset),
|
|
.buffer_selection => |selection| {
|
|
sqe.prep_rw(.READ, fd, 0, selection.len, offset);
|
|
sqe.flags.BUFFER_SELECT = true;
|
|
sqe.buf_index = selection.group_id;
|
|
},
|
|
}
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
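// Sketch of the three ReadBuffer variants; fds, buffers and sizes are illustrative only.
//
//     _ = try ring.read(0x1, fd, .{ .buffer = buf }, 0); // read(2)
//     _ = try ring.read(0x2, fd, .{ .iovecs = &iovecs }, 0); // preadv(2)
//     _ = try ring.read(0x3, fd, .{ .buffer_selection = .{ .group_id = 1, .len = 4096 } }, 0);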
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `write(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn write(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
buffer: []const u8,
|
|
offset: u64,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_write(fd, buffer, offset);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `splice(2)`
|
|
/// Either `fd_in` or `fd_out` must be a pipe.
|
|
/// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to std.math.maxInt(u64).
|
|
/// If `fd_in` does not refer to a pipe and `off_in` is maxInt(u64), then `len` bytes are read
|
|
/// from `fd_in` starting from the file offset, which is incremented by the number of bytes read.
|
|
/// If `fd_in` does not refer to a pipe and `off_in` is not maxInt(u64), then the starting offset of `fd_in` will be `off_in`.
|
|
/// This splice operation can be used to implement sendfile by splicing to an intermediate pipe first,
|
|
/// then splicing to the final destination. In fact, the implementation of sendfile in the kernel uses splice internally.
|
|
///
|
|
/// NOTE that even if fd_in or fd_out refers to a pipe, the splice operation can still fail with EINVAL if one of the
|
|
/// fds doesn't explicitly support the splice operation, e.g. reading from a terminal is unsupported from kernel 5.7 to 5.11.
|
|
/// See https://github.com/axboe/liburing/issues/291
|
|
///
|
|
/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
|
|
pub fn splice(self: *IoUring, user_data: u64, fd_in: posix.fd_t, off_in: u64, fd_out: posix.fd_t, off_out: u64, len: usize) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_splice(fd_in, off_in, fd_out, off_out, len);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
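// Sketch of emulating sendfile with two linked splices through an intermediate pipe, as
// described above. `pipe_fds`, `file_fd` and `sock_fd` are hypothetical, and
// `link_with_next_sqe()` is assumed to link the first splice to the second.
//
//     const max = std.math.maxInt(u64);
//     const first = try ring.splice(0x1, file_fd, 0, pipe_fds[1], max, len);
//     first.link_with_next_sqe();
//     _ = try ring.splice(0x2, pipe_fds[0], max, sock_fd, max, len);
//     _ = try ring.submit();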
|
|
|
|
/// Queues (but does not submit) an SQE to perform a IORING_OP_READ_FIXED.
|
|
/// The `buffer` provided must be registered with the kernel by calling `register_buffers` first.
|
|
/// The `buffer_index` must be the same as its index in the array provided to `register_buffers`.
|
|
///
|
|
/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
|
|
pub fn read_fixed(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
buffer: *posix.iovec,
|
|
offset: u64,
|
|
buffer_index: u16,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_read_fixed(fd, buffer, offset, buffer_index);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `pwritev()`.
|
|
/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
|
|
/// For example, if you want to do a `pwritev2()` then set `rw_flags` on the returned SQE.
|
|
/// See https://linux.die.net/man/2/pwritev.
|
|
pub fn writev(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
iovecs: []const posix.iovec_const,
|
|
offset: u64,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_writev(fd, iovecs, offset);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a IORING_OP_WRITE_FIXED.
|
|
/// The `buffer` provided must be registered with the kernel by calling `register_buffers` first.
|
|
/// The `buffer_index` must be the same as its index in the array provided to `register_buffers`.
|
|
///
|
|
/// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases.
|
|
pub fn write_fixed(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
buffer: *posix.iovec,
|
|
offset: u64,
|
|
buffer_index: u16,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_write_fixed(fd, buffer, offset, buffer_index);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform an `accept4(2)` on a socket.
|
|
/// Returns a pointer to the SQE.
|
|
/// Available since 5.5
|
|
pub fn accept(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
addr: ?*posix.sockaddr,
|
|
addrlen: ?*posix.socklen_t,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_accept(fd, addr, addrlen, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues a multishot accept on a socket.
|
|
///
|
|
/// Multishot variant allows an application to issue a single accept request,
|
|
/// which will repeatedly trigger a CQE when a connection request comes in.
|
|
/// While the IORING_CQE_F_MORE flag is set in the CQE flags, accept will generate
|
|
/// further CQEs.
|
|
///
|
|
/// Available since 5.19
|
|
pub fn accept_multishot(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
addr: ?*posix.sockaddr,
|
|
addrlen: ?*posix.socklen_t,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_multishot_accept(fd, addr, addrlen, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
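// Sketch of a multishot accept loop: a single SQE keeps producing CQEs while the kernel
// sets the "more" flag; once a CQE arrives without it, the accept has to be re-armed.
//
//     _ = try ring.accept_multishot(0x1, listener_fd, null, null, 0);
//     _ = try ring.submit();
//     // For each CQE: cqe.res is the accepted socket (or a negative errno). If the CQE
//     // no longer carries IORING_CQE_F_MORE, queue a new multishot accept.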
|
|
|
|
/// Queues an accept using direct (registered) file descriptors.
|
|
///
|
|
/// To use an accept direct variant, the application must first have registered
|
|
/// a file table (with register_files). An unused table index will be
|
|
/// dynamically chosen and returned in the CQE res field.
|
|
///
|
|
/// After creation, they can be used by setting IOSQE_FIXED_FILE in the SQE
|
|
/// flags member, and setting the SQE fd field to the direct descriptor value
|
|
/// rather than the regular file descriptor.
|
|
///
|
|
/// Available since 5.19
|
|
pub fn accept_direct(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
addr: ?*posix.sockaddr,
|
|
addrlen: ?*posix.socklen_t,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_accept_direct(fd, addr, addrlen, flags, constants.FILE_INDEX_ALLOC);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues a multishot accept using direct (registered) file descriptors.
|
|
/// Available since 5.19
|
|
pub fn accept_multishot_direct(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
addr: ?*posix.sockaddr,
|
|
addrlen: ?*posix.socklen_t,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_multishot_accept_direct(fd, addr, addrlen, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `connect(2)` on a socket.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn connect(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
addr: *const posix.sockaddr,
|
|
addrlen: posix.socklen_t,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_connect(fd, addr, addrlen);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform an `epoll_ctl(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn epoll_ctl(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
epfd: linux.fd_t,
|
|
fd: linux.fd_t,
|
|
op: u32,
|
|
ev: ?*linux.epoll_event,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_epoll_ctl(epfd, fd, op, ev);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Used to select how the recv call should be handled.
|
|
pub const RecvBuffer = union(enum) {
|
|
/// io_uring will recv directly into this buffer
|
|
buffer: []u8,
|
|
|
|
/// io_uring will select a buffer that has previously been provided with `provide_buffers`.
|
|
/// The buffer group referenced by `group_id` must contain at least one buffer for the recv call to work.
|
|
/// `len` controls the number of bytes to read into the selected buffer.
|
|
buffer_selection: struct {
|
|
group_id: u16,
|
|
len: usize,
|
|
},
|
|
};
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `recv(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
/// Available since 5.6
|
|
pub fn recv(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
buffer: RecvBuffer,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
switch (buffer) {
|
|
.buffer => |slice| sqe.prep_recv(fd, slice, flags),
|
|
.buffer_selection => |selection| {
|
|
sqe.prep_rw(.RECV, fd, 0, selection.len, 0);
|
|
sqe.rw_flags = flags;
|
|
sqe.flags.BUFFER_SELECT = true;
|
|
sqe.buf_index = selection.group_id;
|
|
},
|
|
}
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `send(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
/// Available since 5.6
|
|
pub fn send(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
buffer: []const u8,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_send(fd, buffer, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`.
|
|
///
|
|
/// This operation will most likely produce two CQEs. The flags field of the
|
|
/// first cqe will likely contain IORING_CQE_F_MORE, which means that there will
|
|
/// be a second cqe with the user_data field set to the same value. The user
|
|
/// must not modify the data buffer until the notification is posted. The first
|
|
/// cqe follows the usual rules and so its res field will contain the number of
|
|
/// bytes sent or a negative error code. The notification's res field will be
|
|
/// set to zero and the flags field will contain IORING_CQE_F_NOTIF. The two
|
|
/// step model is needed because the kernel may hold on to buffers for a long
|
|
/// time, e.g. waiting for a TCP ACK. Notifications are responsible for controlling
|
|
/// the lifetime of the buffers. Even errored requests may generate a
|
|
/// notification.
|
|
///
|
|
/// Available since 6.0
|
|
pub fn send_zc(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
buffer: []const u8,
|
|
send_flags: u32,
|
|
zc_flags: u16,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_send_zc(fd, buffer, send_flags, zc_flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
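// Sketch of the two-CQE lifecycle of a zero-copy send described above: keep `data` alive
// until the notification CQE for the same user_data arrives. `sock_fd` is hypothetical.
//
//     _ = try ring.send_zc(0x1, sock_fd, data, 0, 0);
//     _ = try ring.submit();
//     // First CQE: number of bytes sent (or a negative errno), usually with IORING_CQE_F_MORE.
//     // Second CQE: res == 0 with IORING_CQE_F_NOTIF set; only then may `data` be reused.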
|
|
|
|
/// Queues (but does not submit) an SQE to perform an async zerocopy `send(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
/// Available since 6.0
|
|
pub fn send_zc_fixed(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
buffer: []const u8,
|
|
send_flags: u32,
|
|
zc_flags: u16,
|
|
buf_index: u16,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_send_zc_fixed(fd, buffer, send_flags, zc_flags, buf_index);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `recvmsg(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
/// Available since 5.3
|
|
pub fn recvmsg(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
msg: *linux.msghdr,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_recvmsg(fd, msg, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `sendmsg(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
/// Available since 5.3
|
|
pub fn sendmsg(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
msg: *const linux.msghdr_const,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_sendmsg(fd, msg, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform an async zerocopy `sendmsg(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
/// Available since 6.1
|
|
pub fn sendmsg_zc(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
msg: *const linux.msghdr_const,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_sendmsg_zc(fd, msg, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform an `openat(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
/// Available since 5.6.
|
|
pub fn openat(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
path: [*:0]const u8,
|
|
flags: linux.O,
|
|
mode: posix.mode_t,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_openat(fd, path, flags, mode);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues an openat using direct (registered) file descriptors.
|
|
///
|
|
/// To use the openat direct variant, the application must first have registered
|
|
/// a file table (with register_files). An unused table index will be
|
|
/// dynamically chosen and returned in the CQE res field.
|
|
///
|
|
/// After creation, they can be used by setting IOSQE_FIXED_FILE in the SQE
|
|
/// flags member, and setting the SQE fd field to the direct descriptor value
|
|
/// rather than the regular file descriptor.
|
|
///
|
|
/// Available since 5.15
|
|
pub fn openat_direct(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
path: [*:0]const u8,
|
|
flags: linux.O,
|
|
mode: posix.mode_t,
|
|
file_index: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_openat_direct(fd, path, flags, mode, file_index);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `close(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
/// Available since 5.6.
|
|
pub fn close(self: *IoUring, user_data: u64, fd: posix.fd_t) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_close(fd);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues close of registered file descriptor.
|
|
/// Available since 5.15
|
|
pub fn close_direct(self: *IoUring, user_data: u64, file_index: u32) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_close_direct(file_index);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to register a timeout operation.
|
|
/// Returns a pointer to the SQE.
|
|
///
|
|
/// The timeout will complete when either the timeout expires, or after the specified number of
|
|
/// events complete (if `count` is greater than `0`).
|
|
///
|
|
/// `flags` may be `0` for a relative timeout, or `IORING_TIMEOUT_ABS` for an absolute timeout.
|
|
///
|
|
/// The completion event result will be `-ETIME` if the timeout completed through expiration,
|
|
/// `0` if the timeout completed after the specified number of events, or `-ECANCELED` if the
|
|
/// timeout was removed before it expired.
|
|
///
|
|
/// io_uring timeouts use the `CLOCK.MONOTONIC` clock source.
|
|
pub fn timeout(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
ts: *const linux.kernel_timespec,
|
|
count: u32,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_timeout(ts, count, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to remove an existing timeout operation.
|
|
/// Returns a pointer to the SQE.
|
|
///
|
|
/// The timeout is identified by its `user_data`.
|
|
///
|
|
/// The completion event result will be `0` if the timeout was found and canceled successfully,
|
|
/// `-EBUSY` if the timeout was found but expiration was already in progress, or
|
|
/// `-ENOENT` if the timeout was not found.
|
|
pub fn timeout_remove(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
timeout_user_data: u64,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_timeout_remove(timeout_user_data, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to add a link timeout operation.
|
|
/// Returns a pointer to the SQE.
|
|
///
|
|
/// You need to set linux.IOSQE_IO_LINK in the flags of the target operation
|
|
/// and then call this method right after the target operation.
|
|
/// See https://lwn.net/Articles/803932/ for detail.
|
|
///
|
|
/// If the dependent request finishes before the linked timeout, the timeout
|
|
/// is canceled. If the timeout finishes before the dependent request, the
|
|
/// dependent request will be canceled.
|
|
///
|
|
/// The completion event result of the link_timeout will be
|
|
/// `-ETIME` if the timeout finishes before the dependent request
|
|
/// (in this case, the completion event result of the dependent request will
|
|
/// be `-ECANCELED`), or
|
|
/// `-EALREADY` if the dependent request finishes before the linked timeout.
|
|
pub fn link_timeout(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
ts: *const linux.kernel_timespec,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_link_timeout(ts, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
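// Sketch of bounding a read with a linked timeout: the read's SQE is linked to the timeout
// queued immediately after it, as the doc comment above requires. The kernel_timespec field
// names `sec`/`nsec` are assumptions.
//
//     const read_sqe = try ring.read(0x1, fd, .{ .buffer = buf }, 0);
//     read_sqe.link_with_next_sqe();
//     const ts: linux.kernel_timespec = .{ .sec = 1, .nsec = 0 };
//     _ = try ring.link_timeout(0x2, &ts, 0);
//     _ = try ring.submit();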
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `poll(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn poll_add(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
poll_mask: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_poll_add(fd, poll_mask);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to remove an existing poll operation.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn poll_remove(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
target_user_data: u64,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_poll_remove(target_user_data);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to update the user data of an existing poll
|
|
/// operation. Returns a pointer to the SQE.
|
|
pub fn poll_update(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
old_user_data: u64,
|
|
new_user_data: u64,
|
|
poll_mask: u32,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_poll_update(old_user_data, new_user_data, poll_mask, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform an `fallocate(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn fallocate(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
mode: i32,
|
|
offset: u64,
|
|
len: u64,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_fallocate(fd, mode, offset, len);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `statx(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn statx(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
path: [:0]const u8,
|
|
flags: u32,
|
|
mask: u32,
|
|
buf: *linux.Statx,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_statx(fd, path, flags, mask, buf);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to remove an existing operation.
|
|
/// Returns a pointer to the SQE.
|
|
///
|
|
/// The operation is identified by its `user_data`.
|
|
///
|
|
/// The completion event result will be `0` if the operation was found and canceled successfully,
|
|
/// `-EALREADY` if the operation was found but was already in progress, or
|
|
/// `-ENOENT` if the operation was not found.
|
|
pub fn cancel(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
cancel_user_data: u64,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_cancel(cancel_user_data, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `shutdown(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
///
|
|
/// The operation is identified by its `user_data`.
|
|
pub fn shutdown(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
sockfd: posix.socket_t,
|
|
how: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_shutdown(sockfd, how);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `renameat2(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn renameat(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
old_dir_fd: linux.fd_t,
|
|
old_path: [*:0]const u8,
|
|
new_dir_fd: linux.fd_t,
|
|
new_path: [*:0]const u8,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_renameat(old_dir_fd, old_path, new_dir_fd, new_path, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `unlinkat(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn unlinkat(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
dir_fd: linux.fd_t,
|
|
path: [*:0]const u8,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_unlinkat(dir_fd, path, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `mkdirat(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn mkdirat(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
dir_fd: linux.fd_t,
|
|
path: [*:0]const u8,
|
|
mode: posix.mode_t,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_mkdirat(dir_fd, path, mode);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `symlinkat(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn symlinkat(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
target: [*:0]const u8,
|
|
new_dir_fd: linux.fd_t,
|
|
link_path: [*:0]const u8,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_symlinkat(target, new_dir_fd, link_path);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `linkat(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn linkat(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
old_dir_fd: linux.fd_t,
|
|
old_path: [*:0]const u8,
|
|
new_dir_fd: linux.fd_t,
|
|
new_path: [*:0]const u8,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_linkat(old_dir_fd, old_path, new_dir_fd, new_path, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to provide a group of buffers used for commands that read/receive data.
|
|
/// Returns a pointer to the SQE.
|
|
///
|
|
/// Provided buffers can be used in `read`, `recv` or `recvmsg` commands via .buffer_selection.
|
|
///
|
|
/// The kernel expects a contiguous block of memory of size (buffers_count * buffer_size).
|
|
pub fn provide_buffers(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
buffers: [*]u8,
|
|
buffer_size: usize,
|
|
buffers_count: usize,
|
|
group_id: usize,
|
|
buffer_id: usize,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_provide_buffers(buffers, buffer_size, buffers_count, group_id, buffer_id);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
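// Sketch of providing a buffer group from one contiguous allocation, matching the
// (buffers_count * buffer_size) layout the kernel expects. `allocator` and `group_id`
// are hypothetical.
//
//     const buffer_size: usize = 4096;
//     const buffers_count: usize = 8;
//     const pool = try allocator.alloc(u8, buffer_size * buffers_count);
//     _ = try ring.provide_buffers(0x1, pool.ptr, buffer_size, buffers_count, group_id, 0);
//     _ = try ring.submit();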
|
|
|
|
/// Queues (but does not submit) an SQE to remove a group of provided buffers.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn remove_buffers(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
buffers_count: usize,
|
|
group_id: usize,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_remove_buffers(buffers_count, group_id);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `waitid(2)`.
|
|
/// Returns a pointer to the SQE.
|
|
pub fn waitid(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
id_type: linux.P,
|
|
id: i32,
|
|
infop: *linux.siginfo_t,
|
|
options: u32,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_waitid(id_type, id, infop, options, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Registers an array of file descriptors.
|
|
/// Every time a file descriptor is put in an SQE and submitted to the kernel, the kernel must
|
|
/// retrieve a reference to the file, and once I/O has completed the file reference must be
|
|
/// dropped. The atomic nature of this file reference can be a slowdown for high IOPS workloads.
|
|
/// This slowdown can be avoided by pre-registering file descriptors.
|
|
/// To refer to a registered file descriptor, IOSQE_FIXED_FILE must be set in the SQE's flags,
|
|
/// and the SQE's fd must be set to the index of the file descriptor in the registered array.
|
|
/// Registering file descriptors will wait for the ring to idle.
|
|
/// Files are automatically unregistered by the kernel when the ring is torn down.
|
|
/// An application need only unregister if it wants to register a new array of file descriptors.
|
|
pub fn register_files(self: *IoUring, fds: []const linux.fd_t) !void {
|
|
assert(self.fd >= 0);
|
|
const res = linux.io_uring_register(
|
|
self.fd,
|
|
.REGISTER_FILES,
|
|
@as(*const anyopaque, @ptrCast(fds.ptr)),
|
|
@as(u32, @intCast(fds.len)),
|
|
);
|
|
try handle_registration_result(res);
|
|
}
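// Sketch of registering file descriptors and then addressing one by its table index.
// The packed-struct field name FIXED_FILE is an assumption mirroring IOSQE_FIXED_FILE.
//
//     try ring.register_files(&[_]linux.fd_t{ fd_a, fd_b });
//     const sqe = try ring.read(0x1, 0, .{ .buffer = buf }, 0); // fd 0 == index of fd_a
//     sqe.flags.FIXED_FILE = true; // assumed flag name
//     _ = try ring.submit();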
|
|
|
|
/// Updates registered file descriptors.
|
|
///
|
|
/// Updates are applied starting at the provided offset in the original file descriptors slice.
|
|
/// There are three kinds of updates:
|
|
/// * turning a sparse entry (where the fd is -1) into a real one
|
|
/// * removing an existing entry (set the fd to -1)
|
|
/// * replacing an existing entry with a new fd
|
|
/// Adding new file descriptors must be done with `register_files`.
|
|
pub fn register_files_update(self: *IoUring, offset: u32, fds: []const linux.fd_t) !void {
|
|
assert(self.fd >= 0);
|
|
|
|
const FilesUpdate = extern struct {
|
|
offset: u32,
|
|
resv: u32,
|
|
fds: u64 align(8),
|
|
};
|
|
var update: FilesUpdate = .{
|
|
.offset = offset,
|
|
.resv = @as(u32, 0),
|
|
.fds = @as(u64, @intFromPtr(fds.ptr)),
|
|
};
|
|
|
|
const res = linux.io_uring_register(
|
|
self.fd,
|
|
.REGISTER_FILES_UPDATE,
|
|
@as(*const anyopaque, @ptrCast(&update)),
|
|
@as(u32, @intCast(fds.len)),
|
|
);
|
|
try handle_registration_result(res);
|
|
}
|
|
|
|
/// Registers an empty (-1) file table of `nr_files` number of file descriptors.
|
|
pub fn register_files_sparse(self: *IoUring, nr_files: u32) !void {
|
|
assert(self.fd >= 0);
|
|
|
|
const reg: RsrcRegister = .{
|
|
.nr = nr_files,
|
|
.flags = constants.RSRC_REGISTER_SPARSE,
|
|
.resv2 = 0,
|
|
.data = 0,
|
|
.tags = 0,
|
|
};
|
|
|
|
const res = linux.io_uring_register(
|
|
self.fd,
|
|
.REGISTER_FILES2,
|
|
@ptrCast(®),
|
|
@as(u32, @sizeOf(RsrcRegister)),
|
|
);
|
|
|
|
return handle_registration_result(res);
|
|
}
|
|
|
|
/// Registers a range for fixed file allocations.
/// Available since 6.0
|
|
pub fn register_file_alloc_range(self: *IoUring, offset: u32, len: u32) !void {
|
|
assert(self.fd >= 0);
|
|
|
|
const range: FileIndexRange = .{
|
|
.off = offset,
|
|
.len = len,
|
|
.resv = 0,
|
|
};
|
|
|
|
const res = linux.io_uring_register(
|
|
self.fd,
|
|
.REGISTER_FILE_ALLOC_RANGE,
|
|
@ptrCast(&range),
|
|
@as(u32, @sizeOf(FileIndexRange)),
|
|
);
|
|
|
|
return handle_registration_result(res);
|
|
}
|
|
|
|
/// Registers the file descriptor for an eventfd that will be notified of completion events on
|
|
/// an io_uring instance.
|
|
/// Only a single eventfd can be registered at any given point in time.
|
|
pub fn register_eventfd(self: *IoUring, fd: linux.fd_t) !void {
|
|
assert(self.fd >= 0);
|
|
const res = linux.io_uring_register(
|
|
self.fd,
|
|
.REGISTER_EVENTFD,
|
|
@as(*const anyopaque, @ptrCast(&fd)),
|
|
1,
|
|
);
|
|
try handle_registration_result(res);
|
|
}
|
|
|
|
/// Registers the file descriptor for an eventfd that will be notified of completion events on
|
|
/// an io_uring instance. Notifications are only posted for events that complete in an async manner.
|
|
/// This means that events that complete inline while being submitted do not trigger a notification event.
|
|
/// Only a single eventfd can be registered at any given point in time.
|
|
pub fn register_eventfd_async(self: *IoUring, fd: linux.fd_t) !void {
|
|
assert(self.fd >= 0);
|
|
const res = linux.io_uring_register(
|
|
self.fd,
|
|
.REGISTER_EVENTFD_ASYNC,
|
|
@as(*const anyopaque, @ptrCast(&fd)),
|
|
1,
|
|
);
|
|
try handle_registration_result(res);
|
|
}
|
|
|
|
/// Unregister the registered eventfd file descriptor.
|
|
pub fn unregister_eventfd(self: *IoUring) !void {
|
|
assert(self.fd >= 0);
|
|
const res = linux.io_uring_register(
|
|
self.fd,
|
|
.UNREGISTER_EVENTFD,
|
|
null,
|
|
0,
|
|
);
|
|
try handle_registration_result(res);
|
|
}
|
|
|
|
pub fn register_napi(self: *IoUring, napi: *Napi) !void {
|
|
assert(self.fd >= 0);
|
|
const res = linux.io_uring_register(self.fd, .REGISTER_NAPI, napi, 1);
|
|
try handle_registration_result(res);
|
|
}
|
|
|
|
pub fn unregister_napi(self: *IoUring, napi: *Napi) !void {
|
|
assert(self.fd >= 0);
|
|
const res = linux.io_uring_register(self.fd, .UNREGISTER_NAPI, napi, 1);
|
|
try handle_registration_result(res);
|
|
}
|
|
|
|
/// Registers an array of buffers for use with `read_fixed` and `write_fixed`.
|
|
pub fn register_buffers(self: *IoUring, buffers: []const posix.iovec) !void {
|
|
assert(self.fd >= 0);
|
|
const res = linux.io_uring_register(
|
|
self.fd,
|
|
.REGISTER_BUFFERS,
|
|
buffers.ptr,
|
|
@as(u32, @intCast(buffers.len)),
|
|
);
|
|
try handle_registration_result(res);
|
|
}
|
|
|
|
/// Unregister the registered buffers.
|
|
pub fn unregister_buffers(self: *IoUring) !void {
|
|
assert(self.fd >= 0);
|
|
const res = linux.io_uring_register(self.fd, .UNREGISTER_BUFFERS, null, 0);
|
|
switch (linux.errno(res)) {
|
|
.SUCCESS => {},
|
|
.NXIO => return error.BuffersNotRegistered,
|
|
else => |errno| return posix.unexpectedErrno(errno),
|
|
}
|
|
}
|
|
|
|
/// Returns a Probe which is used to probe the capabilities of the
|
|
/// io_uring subsystem of the running kernel. The Probe contains the
|
|
/// list of supported operations.
|
|
pub fn get_probe(self: *IoUring) !Probe {
|
|
var probe = mem.zeroInit(Probe, .{});
|
|
const res = linux.io_uring_register(self.fd, .REGISTER_PROBE, &probe, probe.ops.len);
|
|
try handle_register_buf_ring_result(res);
|
|
return probe;
|
|
}
|
|
|
|
fn handle_registration_result(res: usize) !void {
|
|
switch (linux.errno(res)) {
|
|
.SUCCESS => {},
|
|
// One or more fds in the array are invalid, or the kernel does not support sparse sets:
|
|
.BADF => return error.FileDescriptorInvalid,
|
|
.BUSY => return error.FilesAlreadyRegistered,
|
|
.INVAL => return error.FilesEmpty,
|
|
// Adding `nr_args` file references would exceed the maximum allowed number of files the
|
|
// user is allowed to have according to the per-user RLIMIT_NOFILE resource limit and
|
|
// the CAP_SYS_RESOURCE capability is not set, or `nr_args` exceeds the maximum allowed
|
|
// for a fixed file set (older kernels have a limit of 1024 files vs 64K files):
|
|
.MFILE => return error.UserFdQuotaExceeded,
|
|
// Insufficient kernel resources, or the caller had a non-zero RLIMIT_MEMLOCK soft
|
|
// resource limit but tried to lock more memory than the limit permitted (not enforced
|
|
// when the process is privileged with CAP_IPC_LOCK):
|
|
.NOMEM => return error.SystemResources,
|
|
// Attempt to register files on a ring already registering files or being torn down:
|
|
.NXIO => return error.RingShuttingDownOrAlreadyRegisteringFiles,
|
|
else => |errno| return posix.unexpectedErrno(errno),
|
|
}
|
|
}
|
|
|
|
/// Unregisters all registered file descriptors previously associated with the ring.
|
|
pub fn unregister_files(self: *IoUring) !void {
|
|
assert(self.fd >= 0);
|
|
const res = linux.io_uring_register(self.fd, .UNREGISTER_FILES, null, 0);
|
|
switch (linux.errno(res)) {
|
|
.SUCCESS => {},
|
|
.NXIO => return error.FilesNotRegistered,
|
|
else => |errno| return posix.unexpectedErrno(errno),
|
|
}
|
|
}
|
|
|
|
/// Prepares a socket creation request.
|
|
/// The new socket fd will be returned in the completion result.
|
|
/// Available since 5.19
|
|
pub fn socket(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
domain: u32,
|
|
socket_type: u32,
|
|
protocol: u32,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_socket(domain, socket_type, protocol, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Prepares a socket creation request for the registered file at index `file_index`.
|
|
/// Available since 5.19
|
|
pub fn socket_direct(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
domain: u32,
|
|
socket_type: u32,
|
|
protocol: u32,
|
|
flags: u32,
|
|
file_index: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_socket_direct(domain, socket_type, protocol, flags, file_index);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Prepares a socket creation request for a registered file, with the index chosen by the kernel (file index alloc).
|
|
/// File index will be returned in CQE res field.
|
|
/// Available since 5.19
|
|
pub fn socket_direct_alloc(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
domain: u32,
|
|
socket_type: u32,
|
|
protocol: u32,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_socket_direct_alloc(domain, socket_type, protocol, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `bind(2)` on a socket.
|
|
/// Returns a pointer to the SQE.
|
|
/// Available since 6.11
|
|
pub fn bind(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
addr: *const posix.sockaddr,
|
|
addrlen: posix.socklen_t,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_bind(fd, addr, addrlen, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Queues (but does not submit) an SQE to perform a `listen(2)` on a socket.
|
|
/// Returns a pointer to the SQE.
|
|
/// Available since 6.11
|
|
pub fn listen(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
backlog: usize,
|
|
flags: u32,
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_listen(fd, backlog, flags);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Prepares a cmd request for a socket.
|
|
/// See: https://man7.org/linux/man-pages/man3/io_uring_prep_cmd.3.html
|
|
/// Available since 6.7.
|
|
pub fn cmd_sock(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
cmd_op: SocketOp,
|
|
fd: linux.fd_t,
|
|
level: u32, // linux.SOL
|
|
optname: u32, // linux.SO
|
|
optval: u64, // pointer to the option value
|
|
optlen: u32, // size of the option value
|
|
) !*Sqe {
|
|
const sqe = try self.get_sqe();
|
|
sqe.prep_cmd_sock(cmd_op, fd, level, optname, optval, optlen);
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Prepares a set socket option request for the optname argument, at the protocol
|
|
/// level specified by the level argument.
|
|
/// Available since 6.7.
|
|
pub fn setsockopt(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
level: u32, // linux.SOL
|
|
optname: u32, // linux.SO
|
|
opt: []const u8,
|
|
) !*Sqe {
|
|
return try self.cmd_sock(
|
|
user_data,
|
|
.SETSOCKOPT,
|
|
fd,
|
|
level,
|
|
optname,
|
|
@intFromPtr(opt.ptr),
|
|
@intCast(opt.len),
|
|
);
|
|
}
|
|
|
|
/// Prepares a get socket option request to retrieve the value for the option specified by
|
|
/// the option_name argument for the socket specified by the fd argument.
|
|
/// Available since 6.7.
|
|
pub fn getsockopt(
|
|
self: *IoUring,
|
|
user_data: u64,
|
|
fd: linux.fd_t,
|
|
level: u32, // linux.SOL
|
|
optname: u32, // linux.SO
|
|
opt: []u8,
|
|
) !*Sqe {
|
|
return try self.cmd_sock(
|
|
user_data,
|
|
.GETSOCKOPT,
|
|
fd,
|
|
level,
|
|
optname,
|
|
@intFromPtr(opt.ptr),
|
|
@intCast(opt.len),
|
|
);
|
|
}
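// Sketch of setting SO_REUSEADDR through the ring; the completion's res is 0 on success
// or a negative errno on failure. `sock_fd` is hypothetical.
//
//     const enable: u32 = 1;
//     _ = try ring.setsockopt(0x1, sock_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&enable));
//     _ = try ring.submit();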
|
|
|
|
/// matches io_uring_sq in liburing
|
|
pub const Sq = struct {
|
|
head: *u32,
|
|
tail: *u32,
|
|
mask: u32,
|
|
flags: *Flags,
|
|
dropped: *u32,
|
|
array: []u32,
|
|
sqes: []Sqe,
|
|
mmap: []align(page_size_min) u8,
|
|
mmap_sqes: []align(page_size_min) u8,
|
|
|
|
// We use `sqe_head` and `sqe_tail` in the same way as liburing:
|
|
// We increment `sqe_tail` (but not `tail`) for each call to `get_sqe()`.
|
|
// We then set `tail` to `sqe_tail` once, only when these events are actually submitted.
|
|
// This allows us to amortize the cost of the @atomicStore to `tail` across multiple SQEs.
|
|
sqe_head: u32 = 0,
|
|
sqe_tail: u32 = 0,
|
|
|
|
/// sq_ring.flags
|
|
pub const Flags = packed struct(u32) {
|
|
/// needs io_uring_enter wakeup
|
|
NEED_WAKEUP: bool = false,
|
|
/// CQ ring is overflown
|
|
CQ_OVERFLOW: bool = false,
|
|
/// task should enter the kernel
|
|
TASKRUN: bool = false,
|
|
_unused: u29 = 0,
|
|
};
|
|
|
|
pub fn init(fd: posix.fd_t, p: Params) !Sq {
|
|
assert(fd >= 0);
|
|
assert(p.features.SINGLE_MMAP);
|
|
const size = @max(
|
|
p.sq_off.array + p.sq_entries * @sizeOf(u32),
|
|
p.cq_off.cqes + p.cq_entries * @sizeOf(Cqe),
|
|
);
|
|
const mmap = try posix.mmap(
|
|
null,
|
|
size,
|
|
posix.PROT.READ | posix.PROT.WRITE,
|
|
.{ .TYPE = .SHARED, .POPULATE = true },
|
|
fd,
|
|
constants.OFF_SQ_RING,
|
|
);
|
|
errdefer posix.munmap(mmap);
|
|
assert(mmap.len == size);
|
|
|
|
// The motivation for the `sqes` and `array` indirection is to make it possible for the
|
|
// application to preallocate static io_uring_sqe entries and then replay them when needed.
|
|
const size_sqes = p.sq_entries * @sizeOf(Sqe);
|
|
const mmap_sqes = try posix.mmap(
|
|
null,
|
|
size_sqes,
|
|
posix.PROT.READ | posix.PROT.WRITE,
|
|
.{ .TYPE = .SHARED, .POPULATE = true },
|
|
fd,
|
|
constants.OFF_SQES,
|
|
);
|
|
errdefer posix.munmap(mmap_sqes);
|
|
assert(mmap_sqes.len == size_sqes);
|
|
|
|
const array: [*]u32 = @ptrCast(@alignCast(&mmap[p.sq_off.array]));
|
|
const sqes: [*]Sqe = @ptrCast(@alignCast(&mmap_sqes[0]));
|
|
// We expect the kernel copies p.sq_entries to the u32 pointed to by p.sq_off.ring_entries,
|
|
// see https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L7843-L7844.
|
|
assert(p.sq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_entries]))).*);
|
|
return .{
|
|
.head = @ptrCast(@alignCast(&mmap[p.sq_off.head])),
|
|
.tail = @ptrCast(@alignCast(&mmap[p.sq_off.tail])),
|
|
.mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.sq_off.ring_mask]))).*,
|
|
.flags = @ptrCast(@alignCast(&mmap[p.sq_off.flags])),
|
|
.dropped = @ptrCast(@alignCast(&mmap[p.sq_off.dropped])),
|
|
.array = array[0..p.sq_entries],
|
|
.sqes = sqes[0..p.sq_entries],
|
|
.mmap = mmap,
|
|
.mmap_sqes = mmap_sqes,
|
|
};
|
|
}
|
|
|
|
pub fn deinit(self: *Sq) void {
|
|
posix.munmap(self.mmap_sqes);
|
|
posix.munmap(self.mmap);
|
|
}
|
|
};
|
|
|
|
/// matches io_uring_cq in liburing
|
|
pub const Cq = struct {
|
|
head: *u32,
|
|
tail: *u32,
|
|
mask: u32,
|
|
overflow: *u32,
|
|
cqes: []Cqe,
|
|
|
|
/// cq_ring.flags
|
|
pub const Flags = packed struct(u32) {
|
|
/// disable eventfd notifications
|
|
EVENTFD_DISABLED: bool = false,
|
|
_unused: u31 = 0,
|
|
};
|
|
|
|
pub fn init(fd: posix.fd_t, p: Params, sq: Sq) !Cq {
|
|
assert(fd >= 0);
|
|
const features: uflags.Features = @bitCast(p.features);
|
|
assert(features.SINGLE_MMAP);
|
|
const mmap = sq.mmap;
|
|
const cqes: [*]Cqe = @ptrCast(@alignCast(&mmap[p.cq_off.cqes]));
|
|
assert(p.cq_entries == @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_entries]))).*);
|
|
return .{
|
|
.head = @ptrCast(@alignCast(&mmap[p.cq_off.head])),
|
|
.tail = @ptrCast(@alignCast(&mmap[p.cq_off.tail])),
|
|
.mask = @as(*u32, @ptrCast(@alignCast(&mmap[p.cq_off.ring_mask]))).*,
|
|
.overflow = @ptrCast(@alignCast(&mmap[p.cq_off.overflow])),
|
|
.cqes = cqes[0..p.cq_entries],
|
|
};
|
|
}
|
|
|
|
pub fn deinit(self: *Cq) void {
|
|
_ = self;
|
|
// A no-op since we now share the mmap with the submission queue.
|
|
// Here for symmetry with the submission queue, and for any future feature support.
|
|
}
|
|
};
|
|
|
|
/// Group of application provided buffers. Uses the newer type, called ring mapped
/// buffers, supported since kernel 5.19. Buffers are identified by a buffer
/// group ID, and within that group, a buffer ID. An IoUring can have multiple
/// buffer groups, each with a unique group ID.
|
|
///
|
|
/// In `init` application provides contiguous block of memory `buffers` for
|
|
/// `buffers_count` buffers of size `buffers_size`. Application can then submit
|
|
/// `recv` operation without providing buffer upfront. Once the operation is
|
|
/// ready to receive data, a buffer is picked automatically and the resulting
|
|
/// CQE will contain the buffer ID in `cqe.buffer_id()`. Use `get` method to get
|
|
/// buffer for buffer ID identified by CQE. Once the application has processed
|
|
/// the buffer, it may hand ownership back to the kernel by calling `put`,
|
|
/// allowing the cycle to repeat.
|
|
///
|
|
/// Depending on the rate of arrival of data, it is possible that a given buffer
|
|
/// group will run out of buffers before those in CQEs can be put back to the
|
|
/// kernel. If this happens, a `cqe.err()` will have ENOBUFS as the error value.
|
|
///
|
|
pub const BufferGroup = struct {
|
|
/// Parent ring for which this group is registered.
|
|
ring: *IoUring,
|
|
/// Pointer to the memory shared by the kernel.
|
|
/// `buffers_count` of `io_uring_buf` structures are shared by the kernel.
|
|
/// The first `io_uring_buf` is overlaid by the `io_uring_buf_ring` struct.
|
|
br: *align(page_size_min) BufferRing,
|
|
/// Contiguous block of memory of size (buffers_count * buffer_size).
|
|
buffers: []u8,
|
|
/// Size of each buffer in buffers.
|
|
buffer_size: u32,
|
|
/// Number of buffers in `buffers`, i.e. the number of `io_uring_buf` structures in `br`.
|
|
buffers_count: u16,
|
|
/// Head of unconsumed part of each buffer, if incremental consumption is enabled
|
|
heads: []u32,
|
|
/// ID of this group, must be unique in ring.
|
|
group_id: u16,
|
|
|
|
pub fn init(
|
|
ring: *IoUring,
|
|
allocator: mem.Allocator,
|
|
group_id: u16,
|
|
buffer_size: u32,
|
|
buffers_count: u16,
|
|
) !BufferGroup {
|
|
const buffers = try allocator.alloc(u8, buffer_size * buffers_count);
|
|
errdefer allocator.free(buffers);
|
|
const heads = try allocator.alloc(u32, buffers_count);
|
|
errdefer allocator.free(heads);
|
|
|
|
const br = try setup_buf_ring(ring.fd, buffers_count, group_id, .{ .inc = true });
|
|
buf_ring_init(br);
|
|
|
|
const mask = buf_ring_mask(buffers_count);
|
|
var i: u16 = 0;
|
|
while (i < buffers_count) : (i += 1) {
|
|
const pos = buffer_size * i;
|
|
const buf = buffers[pos .. pos + buffer_size];
|
|
heads[i] = 0;
|
|
buf_ring_add(br, buf, i, mask, i);
|
|
}
|
|
buf_ring_advance(br, buffers_count);
|
|
|
|
return BufferGroup{
|
|
.ring = ring,
|
|
.group_id = group_id,
|
|
.br = br,
|
|
.buffers = buffers,
|
|
.heads = heads,
|
|
.buffer_size = buffer_size,
|
|
.buffers_count = buffers_count,
|
|
};
|
|
}
|
|
|
|
pub fn deinit(self: *BufferGroup, allocator: mem.Allocator) void {
|
|
free_buf_ring(self.ring.fd, self.br, self.buffers_count, self.group_id);
|
|
allocator.free(self.buffers);
|
|
allocator.free(self.heads);
|
|
}
|
|
|
|
/// Prepares a recv operation which will select a buffer from this group.
|
|
pub fn recv(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe {
|
|
var sqe = try self.ring.get_sqe();
|
|
sqe.prep_rw(.RECV, fd, 0, 0, 0);
|
|
sqe.rw_flags = flags;
|
|
sqe.flags.BUFFER_SELECT = true;
|
|
sqe.buf_index = self.group_id;
|
|
sqe.user_data = user_data;
|
|
return sqe;
|
|
}
|
|
|
|
/// Prepares a multishot recv operation which will select a buffer from this group.
|
|
pub fn recv_multishot(self: *BufferGroup, user_data: u64, fd: posix.fd_t, flags: u32) !*Sqe {
|
|
var sqe = try self.recv(user_data, fd, flags);
|
|
sqe.ioprio.send_recv.RECV_MULTISHOT = true;
|
|
return sqe;
|
|
}
|
|
|
|
// Get buffer by id.
|
|
fn get_by_id(self: *BufferGroup, buffer_id: u16) []u8 {
|
|
const pos = self.buffer_size * buffer_id;
|
|
return self.buffers[pos .. pos + self.buffer_size][self.heads[buffer_id]..];
|
|
}
|
|
|
|
/// Get the buffer identified by a CQE.
|
|
pub fn get(self: *BufferGroup, cqe: Cqe) ![]u8 {
|
|
const buffer_id = try cqe.buffer_id();
|
|
const used_len = @as(usize, @intCast(cqe.res));
|
|
return self.get_by_id(buffer_id)[0..used_len];
|
|
}
|
|
|
|
/// Release the buffer identified by a CQE back to the kernel.
|
|
pub fn put(self: *BufferGroup, cqe: Cqe) !void {
|
|
const buffer_id = try cqe.buffer_id();
|
|
if (cqe.flags.F_BUF_MORE) {
|
|
// Incremental consumption is active, the kernel will write to this buffer again.
|
|
const used_len = @as(u32, @intCast(cqe.res));
|
|
// Track what part of the buffer is used
|
|
self.heads[buffer_id] += used_len;
|
|
return;
|
|
}
|
|
self.heads[buffer_id] = 0;
|
|
|
|
// Release buffer to the kernel.
|
|
const mask = buf_ring_mask(self.buffers_count);
|
|
buf_ring_add(self.br, self.get_by_id(buffer_id), buffer_id, mask, 0);
|
|
buf_ring_advance(self.br, 1);
|
|
}
|
|
};
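// Sketch of the BufferGroup lifecycle described above: init, receive into a
// kernel-selected buffer, then hand the buffer back with `put`. `allocator` and
// `sock_fd` are hypothetical.
//
//     var group = try BufferGroup.init(&ring, allocator, 1, 4096, 16);
//     defer group.deinit(allocator);
//     _ = try group.recv(0x1, sock_fd, 0);
//     _ = try ring.submit();
//     // For the resulting CQE: const data = try group.get(cqe);
//     // ... process data, then: try group.put(cqe);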
|
|
|
|
/// Registers a shared buffer ring to be used with provided buffers.
|
|
/// `entries` number of `io_uring_buf` structures are memory mapped and shared with the kernel.
|
|
/// `fd` is the IoUring.fd for which the provided buffer ring is being registered.
|
|
/// `entries` is the number of entries requested in the buffer ring; it must be a power of 2.
|
|
/// `group_id` is the chosen buffer group ID, unique within the IoUring.
|
|
pub fn setup_buf_ring(
|
|
fd: linux.fd_t,
|
|
entries: u16,
|
|
group_id: u16,
|
|
flags: BufferRegister.Flags,
|
|
) !*align(page_size_min) BufferRing {
|
|
if (entries == 0 or entries > 1 << 15) return error.EntriesNotInRange;
|
|
if (!std.math.isPowerOfTwo(entries)) return error.EntriesNotPowerOfTwo;
|
|
|
|
const mmap_size = @as(usize, entries) * @sizeOf(Buffer);
|
|
const mmap = try posix.mmap(
|
|
null,
|
|
mmap_size,
|
|
posix.PROT.READ | posix.PROT.WRITE,
|
|
.{ .TYPE = .PRIVATE, .ANONYMOUS = true },
|
|
-1,
|
|
0,
|
|
);
|
|
errdefer posix.munmap(mmap);
|
|
assert(mmap.len == mmap_size);
|
|
|
|
const br: *align(page_size_min) BufferRing = @ptrCast(mmap.ptr);
|
|
try register_buf_ring(fd, @intFromPtr(br), entries, group_id, flags);
|
|
return br;
|
|
}
|
|
|
|
fn register_buf_ring(
|
|
fd: linux.fd_t,
|
|
addr: u64,
|
|
entries: u32,
|
|
group_id: u16,
|
|
flags: BufferRegister.Flags,
|
|
) !void {
|
|
var reg = mem.zeroInit(BufferRegister, .{
|
|
.ring_addr = addr,
|
|
.ring_entries = entries,
|
|
.bgid = group_id,
|
|
.flags = flags,
|
|
});
|
|
var res = linux.io_uring_register(fd, .REGISTER_PBUF_RING, @as(*const anyopaque, @ptrCast(®)), 1);
|
|
if (linux.errno(res) == .INVAL and reg.flags.inc) {
|
|
// Retry without incremental buffer consumption.
|
|
// It is available since kernel 6.12; older kernels return INVAL.
|
|
reg.flags.inc = false;
|
|
res = linux.io_uring_register(fd, .REGISTER_PBUF_RING, @as(*const anyopaque, @ptrCast(®)), 1);
|
|
}
|
|
try handle_register_buf_ring_result(res);
|
|
}
|
|
|
|
fn unregister_buf_ring(fd: posix.fd_t, group_id: u16) !void {
|
|
var reg = mem.zeroInit(BufferRegister, .{
|
|
.bgid = group_id,
|
|
});
|
|
const res = linux.io_uring_register(
|
|
fd,
|
|
.UNREGISTER_PBUF_RING,
|
|
@as(*const anyopaque, @ptrCast(®)),
|
|
1,
|
|
);
|
|
try handle_register_buf_ring_result(res);
|
|
}
|
|
|
|
fn handle_register_buf_ring_result(res: usize) !void {
|
|
switch (linux.errno(res)) {
|
|
.SUCCESS => {},
|
|
.INVAL => return error.ArgumentsInvalid,
|
|
else => |errno| return posix.unexpectedErrno(errno),
|
|
}
|
|
}
|
|
|
|
/// Unregisters and unmaps a previously registered shared buffer ring returned from `setup_buf_ring`.
|
|
pub fn free_buf_ring(fd: posix.fd_t, br: *align(page_size_min) BufferRing, entries: u32, group_id: u16) void {
|
|
unregister_buf_ring(fd, group_id) catch {};
|
|
var mmap: []align(page_size_min) u8 = undefined;
|
|
mmap.ptr = @ptrCast(br);
|
|
mmap.len = entries * @sizeOf(Buffer);
|
|
posix.munmap(mmap);
|
|
}
|
|
|
|
/// Initialises `br` so that it is ready to be used.
|
|
pub fn buf_ring_init(br: *BufferRing) void {
|
|
br.tail = 0;
|
|
}
|
|
|
|
/// Calculates the appropriate size mask for a buffer ring.
|
|
/// `entries` is the ring entries as specified in io_uring_register_buf_ring.
|
|
pub fn buf_ring_mask(entries: u16) u16 {
|
|
return entries - 1;
|
|
}
|
|
|
|
/// Assigns `buffer` with the `br` buffer ring.
|
|
/// `buffer_id` is identifier which will be returned in the CQE.
|
|
/// `buffer_offset` is the offset to insert at from the current tail.
|
|
/// If just one buffer is provided before the ring tail is committed with advance then offset should be 0.
|
|
/// If buffers are provided in a loop before being committed, the offset must be incremented by one for each buffer added.
|
|
pub fn buf_ring_add(
|
|
br: *BufferRing,
|
|
buffer: []u8,
|
|
buffer_id: u16,
|
|
mask: u16,
|
|
buffer_offset: u16,
|
|
) void {
|
|
const bufs: [*]Buffer = @ptrCast(br);
|
|
const buf: *Buffer = &bufs[(br.tail +% buffer_offset) & mask];
|
|
|
|
buf.addr = @intFromPtr(buffer.ptr);
|
|
buf.len = @intCast(buffer.len);
|
|
buf.bid = buffer_id;
|
|
}
|
|
|
|
/// Make `count` new buffers visible to the kernel. Called after
|
|
/// `io_uring_buf_ring_add` has been called `count` times to fill in new buffers.
|
|
pub fn buf_ring_advance(br: *BufferRing, count: u16) void {
|
|
const tail: u16 = br.tail +% count;
|
|
@atomicStore(u16, &br.tail, tail, .release);
|
|
}
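// Sketch of the add-then-advance pattern described above (illustrative only;
// assumes `br` came from `setup_buf_ring` with `entries` slots and that `bufs`
// is a caller-owned array of `entries` equally sized byte buffers):
//
//     const mask = buf_ring_mask(entries);
//     var offset: u16 = 0;
//     for (&bufs, 0..) |*buf, id| {
//         buf_ring_add(br, buf, @intCast(id), mask, offset);
//         offset += 1;
//     }
//     buf_ring_advance(br, entries); // publish all buffers to the kernel at once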
|
|
|
|
test "structs/offsets/entries" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
try testing.expectEqual(@as(usize, 120), @sizeOf(Params));
|
|
try testing.expectEqual(@as(usize, 64), @sizeOf(Sqe));
|
|
try testing.expectEqual(@as(usize, 16), @sizeOf(Cqe));
|
|
|
|
try testing.expectEqual(0, constants.OFF_SQ_RING);
|
|
try testing.expectEqual(0x8000000, constants.OFF_CQ_RING);
|
|
try testing.expectEqual(0x10000000, constants.OFF_SQES);
|
|
|
|
try testing.expectError(error.EntriesZero, IoUring.init(0, .{}));
|
|
try testing.expectError(error.EntriesNotPowerOfTwo, IoUring.init(3, .{}));
|
|
}
|
|
|
|
test "nop" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer {
|
|
ring.deinit();
|
|
testing.expectEqual(@as(linux.fd_t, -1), ring.fd) catch @panic("test failed");
|
|
}
|
|
|
|
const sqe = try ring.nop(0xaaaaaaaa);
|
|
try testing.expectEqual(Sqe{
|
|
.opcode = .NOP,
|
|
.flags = 0,
|
|
.ioprio = 0,
|
|
.fd = 0,
|
|
.off = 0,
|
|
.addr = 0,
|
|
.len = 0,
|
|
.rw_flags = 0,
|
|
.user_data = 0xaaaaaaaa,
|
|
.buf_index = 0,
|
|
.personality = 0,
|
|
.splice_fd_in = 0,
|
|
.addr3 = 0,
|
|
.resv = 0,
|
|
}, sqe.*);
|
|
|
|
try testing.expectEqual(@as(u32, 0), ring.sq.sqe_head);
|
|
try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail);
|
|
try testing.expectEqual(@as(u32, 0), ring.sq.tail.*);
|
|
try testing.expectEqual(@as(u32, 0), ring.cq.head.*);
|
|
try testing.expectEqual(@as(u32, 1), ring.sq_ready());
|
|
try testing.expectEqual(@as(u32, 0), ring.cq_ready());
|
|
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
try testing.expectEqual(@as(u32, 1), ring.sq.sqe_head);
|
|
try testing.expectEqual(@as(u32, 1), ring.sq.sqe_tail);
|
|
try testing.expectEqual(@as(u32, 1), ring.sq.tail.*);
|
|
try testing.expectEqual(@as(u32, 0), ring.cq.head.*);
|
|
try testing.expectEqual(@as(u32, 0), ring.sq_ready());
|
|
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xaaaaaaaa,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, try ring.copy_cqe());
|
|
try testing.expectEqual(@as(u32, 1), ring.cq.head.*);
|
|
try testing.expectEqual(@as(u32, 0), ring.cq_ready());
|
|
|
|
const sqe_barrier = try ring.nop(0xbbbbbbbb);
|
|
sqe_barrier.flags.IO_DRAIN = true;
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xbbbbbbbb,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, try ring.copy_cqe());
|
|
try testing.expectEqual(@as(u32, 2), ring.sq.sqe_head);
|
|
try testing.expectEqual(@as(u32, 2), ring.sq.sqe_tail);
|
|
try testing.expectEqual(@as(u32, 2), ring.sq.tail.*);
|
|
try testing.expectEqual(@as(u32, 2), ring.cq.head.*);
|
|
}
|
|
|
|
test "readv" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
|
|
defer posix.close(fd);
|
|
|
|
// Linux Kernel 5.4 supports IORING_REGISTER_FILES but not sparse fd sets (i.e. an fd of -1).
|
|
// Linux Kernel 5.5 adds support for sparse fd sets.
|
|
// Compare:
|
|
// https://github.com/torvalds/linux/blob/v5.4/fs/io_uring.c#L3119-L3124 vs
|
|
// https://github.com/torvalds/linux/blob/v5.8/fs/io_uring.c#L6687-L6691
|
|
// We therefore avoid stressing sparse fd sets here:
|
|
var registered_fds = [_]linux.fd_t{0} ** 1;
|
|
const fd_index = 0;
|
|
registered_fds[fd_index] = fd;
|
|
try ring.register_files(registered_fds[0..]);
|
|
|
|
var buffer = [_]u8{42} ** 128;
|
|
var iovecs = [_]posix.iovec{posix.iovec{ .base = &buffer, .len = buffer.len }};
|
|
const sqe = try ring.read(0xcccccccc, fd_index, .{ .iovecs = iovecs[0..] }, 0);
|
|
try testing.expectEqual(Op.READV, sqe.opcode);
|
|
sqe.flags.FIXED_FILE = true;
|
|
|
|
try testing.expectError(error.SubmissionQueueFull, ring.nop(0));
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xcccccccc,
|
|
.res = buffer.len,
|
|
.flags = 0,
|
|
}, try ring.copy_cqe());
|
|
try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]);
|
|
|
|
try ring.unregister_files();
|
|
}
|
|
|
|
test "writev/fsync/readv" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(4, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
defer tmp.cleanup();
|
|
|
|
const path = "test_io_uring_writev_fsync_readv";
|
|
const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true });
|
|
defer file.close();
|
|
const fd = file.handle;
|
|
|
|
const buffer_write = [_]u8{42} ** 128;
|
|
const iovecs_write = [_]posix.iovec_const{
|
|
posix.iovec_const{ .base = &buffer_write, .len = buffer_write.len },
|
|
};
|
|
var buffer_read = [_]u8{0} ** 128;
|
|
var iovecs_read = [_]posix.iovec{
|
|
posix.iovec{ .base = &buffer_read, .len = buffer_read.len },
|
|
};
|
|
|
|
const sqe_writev = try ring.writev(0xdddddddd, fd, iovecs_write[0..], 17);
|
|
try testing.expectEqual(Op.WRITEV, sqe_writev.opcode);
|
|
try testing.expectEqual(@as(u64, 17), sqe_writev.off);
|
|
sqe_writev.flags.IO_LINK = true;
|
|
|
|
const sqe_fsync = try ring.fsync(0xeeeeeeee, fd, 0);
|
|
try testing.expectEqual(Op.FSYNC, sqe_fsync.opcode);
|
|
try testing.expectEqual(fd, sqe_fsync.fd);
|
|
sqe_fsync.flags.IO_LINK = true;
|
|
|
|
const sqe_readv = try ring.read(0xffffffff, fd, .{ .iovecs = iovecs_read[0..] }, 17);
|
|
try testing.expectEqual(Op.READV, sqe_readv.opcode);
|
|
try testing.expectEqual(@as(u64, 17), sqe_readv.off);
|
|
|
|
try testing.expectEqual(@as(u32, 3), ring.sq_ready());
|
|
try testing.expectEqual(@as(u32, 3), try ring.submit_and_wait(3));
|
|
try testing.expectEqual(@as(u32, 0), ring.sq_ready());
|
|
try testing.expectEqual(@as(u32, 3), ring.cq_ready());
|
|
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xdddddddd,
|
|
.res = buffer_write.len,
|
|
.flags = 0,
|
|
}, try ring.copy_cqe());
|
|
try testing.expectEqual(@as(u32, 2), ring.cq_ready());
|
|
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xeeeeeeee,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, try ring.copy_cqe());
|
|
try testing.expectEqual(@as(u32, 1), ring.cq_ready());
|
|
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xffffffff,
|
|
.res = buffer_read.len,
|
|
.flags = 0,
|
|
}, try ring.copy_cqe());
|
|
try testing.expectEqual(@as(u32, 0), ring.cq_ready());
|
|
|
|
try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]);
|
|
}
|
|
|
|
test "write/read" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(2, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
defer tmp.cleanup();
|
|
const path = "test_io_uring_write_read";
|
|
const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true });
|
|
defer file.close();
|
|
const fd = file.handle;
|
|
|
|
const buffer_write = [_]u8{97} ** 20;
|
|
var buffer_read = [_]u8{98} ** 20;
|
|
const sqe_write = try ring.write(0x11111111, fd, buffer_write[0..], 10);
|
|
try testing.expectEqual(Op.WRITE, sqe_write.opcode);
|
|
try testing.expectEqual(@as(u64, 10), sqe_write.off);
|
|
sqe_write.flags.IO_LINK = true;
|
|
const sqe_read = try ring.read(0x22222222, fd, .{ .buffer = buffer_read[0..] }, 10);
|
|
try testing.expectEqual(Op.READ, sqe_read.opcode);
|
|
try testing.expectEqual(@as(u64, 10), sqe_read.off);
|
|
try testing.expectEqual(@as(u32, 2), try ring.submit());
|
|
|
|
const cqe_write = try ring.copy_cqe();
|
|
const cqe_read = try ring.copy_cqe();
|
|
// Prior to Linux Kernel 5.6 this is the only way to test for read/write support:
|
|
// https://lwn.net/Articles/809820/
|
|
if (cqe_write.err() == .INVAL) return error.SkipZigTest;
|
|
if (cqe_read.err() == .INVAL) return error.SkipZigTest;
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x11111111,
|
|
.res = buffer_write.len,
|
|
.flags = 0,
|
|
}, cqe_write);
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x22222222,
|
|
.res = buffer_read.len,
|
|
.flags = 0,
|
|
}, cqe_read);
|
|
try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]);
|
|
}
|
|
|
|
test "splice/read" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(4, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
const path_src = "test_io_uring_splice_src";
|
|
const file_src = try tmp.dir.createFile(path_src, .{ .read = true, .truncate = true });
|
|
defer file_src.close();
|
|
const fd_src = file_src.handle;
|
|
|
|
const path_dst = "test_io_uring_splice_dst";
|
|
const file_dst = try tmp.dir.createFile(path_dst, .{ .read = true, .truncate = true });
|
|
defer file_dst.close();
|
|
const fd_dst = file_dst.handle;
|
|
|
|
const buffer_write = [_]u8{97} ** 20;
|
|
var buffer_read = [_]u8{98} ** 20;
|
|
_ = try file_src.write(&buffer_write);
|
|
|
|
const fds = try posix.pipe();
|
|
const pipe_offset: u64 = std.math.maxInt(u64);
|
|
|
|
const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len);
|
|
try testing.expectEqual(Op.SPLICE, sqe_splice_to_pipe.opcode);
|
|
try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr);
|
|
try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off);
|
|
sqe_splice_to_pipe.flags.IO_LINK = true;
|
|
|
|
const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len);
|
|
try testing.expectEqual(Op.SPLICE, sqe_splice_from_pipe.opcode);
|
|
try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr);
|
|
try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off);
|
|
sqe_splice_from_pipe.flags.IO_LINK = true;
|
|
|
|
const sqe_read = try ring.read(0x33333333, fd_dst, .{ .buffer = buffer_read[0..] }, 10);
|
|
try testing.expectEqual(Op.READ, sqe_read.opcode);
|
|
try testing.expectEqual(@as(u64, 10), sqe_read.off);
|
|
try testing.expectEqual(@as(u32, 3), try ring.submit());
|
|
|
|
const cqe_splice_to_pipe = try ring.copy_cqe();
|
|
const cqe_splice_from_pipe = try ring.copy_cqe();
|
|
const cqe_read = try ring.copy_cqe();
|
|
// Prior to Linux Kernel 5.6 this is the only way to test for splice/read support:
|
|
// https://lwn.net/Articles/809820/
|
|
if (cqe_splice_to_pipe.err() == .INVAL) return error.SkipZigTest;
|
|
if (cqe_splice_from_pipe.err() == .INVAL) return error.SkipZigTest;
|
|
if (cqe_read.err() == .INVAL) return error.SkipZigTest;
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x11111111,
|
|
.res = buffer_write.len,
|
|
.flags = 0,
|
|
}, cqe_splice_to_pipe);
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x22222222,
|
|
.res = buffer_write.len,
|
|
.flags = 0,
|
|
}, cqe_splice_from_pipe);
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x33333333,
|
|
.res = buffer_read.len,
|
|
.flags = 0,
|
|
}, cqe_read);
|
|
try testing.expectEqualSlices(u8, buffer_write[0..], buffer_read[0..]);
|
|
}
|
|
|
|
test "write_fixed/read_fixed" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(2, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
defer tmp.cleanup();
|
|
|
|
const path = "test_io_uring_write_read_fixed";
|
|
const file = try tmp.dir.createFile(path, .{ .read = true, .truncate = true });
|
|
defer file.close();
|
|
const fd = file.handle;
|
|
|
|
var raw_buffers: [2][11]u8 = undefined;
|
|
// First buffer will be written to the file.
|
|
@memset(&raw_buffers[0], 'z');
|
|
raw_buffers[0][0.."foobar".len].* = "foobar".*;
|
|
|
|
var buffers = [2]posix.iovec{
|
|
.{ .base = &raw_buffers[0], .len = raw_buffers[0].len },
|
|
.{ .base = &raw_buffers[1], .len = raw_buffers[1].len },
|
|
};
|
|
ring.register_buffers(&buffers) catch |err| switch (err) {
|
|
error.SystemResources => {
|
|
// See https://github.com/ziglang/zig/issues/15362
|
|
return error.SkipZigTest;
|
|
},
|
|
else => |e| return e,
|
|
};
|
|
|
|
const sqe_write = try ring.write_fixed(0x45454545, fd, &buffers[0], 3, 0);
|
|
try testing.expectEqual(Op.WRITE_FIXED, sqe_write.opcode);
|
|
try testing.expectEqual(@as(u64, 3), sqe_write.off);
|
|
sqe_write.flags.IO_LINK = true;
|
|
|
|
const sqe_read = try ring.read_fixed(0x12121212, fd, &buffers[1], 0, 1);
|
|
try testing.expectEqual(Op.READ_FIXED, sqe_read.opcode);
|
|
try testing.expectEqual(@as(u64, 0), sqe_read.off);
|
|
|
|
try testing.expectEqual(@as(u32, 2), try ring.submit());
|
|
|
|
const cqe_write = try ring.copy_cqe();
|
|
const cqe_read = try ring.copy_cqe();
|
|
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x45454545,
|
|
.res = @as(i32, @intCast(buffers[0].len)),
|
|
.flags = 0,
|
|
}, cqe_write);
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x12121212,
|
|
.res = @as(i32, @intCast(buffers[1].len)),
|
|
.flags = 0,
|
|
}, cqe_read);
|
|
|
|
try testing.expectEqualSlices(u8, "\x00\x00\x00", buffers[1].base[0..3]);
|
|
try testing.expectEqualSlices(u8, "foobar", buffers[1].base[3..9]);
|
|
try testing.expectEqualSlices(u8, "zz", buffers[1].base[9..11]);
|
|
}
|
|
|
|
test "openat" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
defer tmp.cleanup();
|
|
|
|
const path = "test_io_uring_openat";
|
|
|
|
// Workaround for LLVM bug: https://github.com/ziglang/zig/issues/12014
|
|
const path_addr = if (builtin.zig_backend == .stage2_llvm) p: {
|
|
var workaround = path;
|
|
_ = &workaround;
|
|
break :p @intFromPtr(workaround);
|
|
} else @intFromPtr(path);
|
|
|
|
const flags: linux.O = .{ .CLOEXEC = true, .ACCMODE = .RDWR, .CREAT = true };
|
|
const mode: posix.mode_t = 0o666;
|
|
const sqe_openat = try ring.openat(0x33333333, tmp.dir.fd, path, flags, mode);
|
|
try testing.expectEqual(Sqe{
|
|
.opcode = .OPENAT,
|
|
.flags = 0,
|
|
.ioprio = 0,
|
|
.fd = tmp.dir.fd,
|
|
.off = 0,
|
|
.addr = path_addr,
|
|
.len = mode,
|
|
.rw_flags = @bitCast(flags),
|
|
.user_data = 0x33333333,
|
|
.buf_index = 0,
|
|
.personality = 0,
|
|
.splice_fd_in = 0,
|
|
.addr3 = 0,
|
|
.resv = 0,
|
|
}, sqe_openat.*);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe_openat = try ring.copy_cqe();
|
|
try testing.expectEqual(@as(u64, 0x33333333), cqe_openat.user_data);
|
|
if (cqe_openat.err() == .INVAL) return error.SkipZigTest;
|
|
if (cqe_openat.err() == .BADF) return error.SkipZigTest;
|
|
if (cqe_openat.res <= 0) std.debug.print("\ncqe_openat.res={}\n", .{cqe_openat.res});
|
|
try testing.expect(cqe_openat.res > 0);
|
|
try testing.expectEqual(@as(u32, 0), cqe_openat.flags);
|
|
|
|
posix.close(cqe_openat.res);
|
|
}
|
|
|
|
test "close" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
defer tmp.cleanup();
|
|
|
|
const path = "test_io_uring_close";
|
|
const file = try tmp.dir.createFile(path, .{});
|
|
errdefer file.close();
|
|
|
|
const sqe_close = try ring.close(0x44444444, file.handle);
|
|
try testing.expectEqual(Op.CLOSE, sqe_close.opcode);
|
|
try testing.expectEqual(file.handle, sqe_close.fd);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe_close = try ring.copy_cqe();
|
|
if (cqe_close.err() == .INVAL) return error.SkipZigTest;
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x44444444,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, cqe_close);
|
|
}
|
|
|
|
test "accept/connect/send/recv" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(16, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const socket_test_harness = try createSocketTestHarness(&ring);
|
|
defer socket_test_harness.close();
|
|
|
|
const buffer_send = [_]u8{ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 };
|
|
var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 };
|
|
|
|
const sqe_send = try ring.send(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0);
|
|
sqe_send.flags.IO_LINK = true;
|
|
_ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0);
|
|
try testing.expectEqual(@as(u32, 2), try ring.submit());
|
|
|
|
const cqe_send = try ring.copy_cqe();
|
|
if (cqe_send.err() == .INVAL) return error.SkipZigTest;
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xeeeeeeee,
|
|
.res = buffer_send.len,
|
|
.flags = 0,
|
|
}, cqe_send);
|
|
|
|
const cqe_recv = try ring.copy_cqe();
|
|
if (cqe_recv.err() == .INVAL) return error.SkipZigTest;
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xffffffff,
|
|
.res = buffer_recv.len,
|
|
// ignore IORING_CQE_F_SOCK_NONEMPTY since it is only set on some systems
|
|
.flags = .{ .F_SOCK_NONEMPTY = cqe_recv.flags.F_SOCK_NONEMPTY },
|
|
}, cqe_recv);
|
|
|
|
try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]);
|
|
}
|
|
|
|
test "sendmsg/recvmsg" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(2, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var address_server: linux.sockaddr.in = .{
|
|
.port = 0,
|
|
.addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
|
|
};
|
|
|
|
const server = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0);
|
|
defer posix.close(server);
|
|
try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEPORT, &mem.toBytes(@as(c_int, 1)));
|
|
try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1)));
|
|
try posix.bind(server, addrAny(&address_server), @sizeOf(linux.sockaddr.in));
|
|
|
|
// set address_server to the OS-chosen IP/port.
|
|
var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in);
|
|
try posix.getsockname(server, addrAny(&address_server), &slen);
|
|
|
|
const client = try posix.socket(address_server.family, posix.SOCK.DGRAM, 0);
|
|
defer posix.close(client);
|
|
|
|
const buffer_send = [_]u8{42} ** 128;
|
|
const iovecs_send = [_]posix.iovec_const{
|
|
posix.iovec_const{ .base = &buffer_send, .len = buffer_send.len },
|
|
};
|
|
const msg_send: linux.msghdr_const = .{
|
|
.name = addrAny(&address_server),
|
|
.namelen = @sizeOf(linux.sockaddr.in),
|
|
.iov = &iovecs_send,
|
|
.iovlen = 1,
|
|
.control = null,
|
|
.controllen = 0,
|
|
.flags = 0,
|
|
};
|
|
const sqe_sendmsg = try ring.sendmsg(0x11111111, client, &msg_send, 0);
|
|
sqe_sendmsg.flags.IO_LINK = true;
|
|
try testing.expectEqual(Op.SENDMSG, sqe_sendmsg.opcode);
|
|
try testing.expectEqual(client, sqe_sendmsg.fd);
|
|
|
|
var buffer_recv = [_]u8{0} ** 128;
|
|
var iovecs_recv = [_]posix.iovec{
|
|
posix.iovec{ .base = &buffer_recv, .len = buffer_recv.len },
|
|
};
|
|
var address_recv: linux.sockaddr.in = .{
|
|
.port = 0,
|
|
.addr = 0,
|
|
};
|
|
var msg_recv: linux.msghdr = .{
|
|
.name = addrAny(&address_recv),
|
|
.namelen = @sizeOf(linux.sockaddr.in),
|
|
.iov = &iovecs_recv,
|
|
.iovlen = 1,
|
|
.control = null,
|
|
.controllen = 0,
|
|
.flags = 0,
|
|
};
|
|
const sqe_recvmsg = try ring.recvmsg(0x22222222, server, &msg_recv, 0);
|
|
try testing.expectEqual(Op.RECVMSG, sqe_recvmsg.opcode);
|
|
try testing.expectEqual(server, sqe_recvmsg.fd);
|
|
|
|
try testing.expectEqual(@as(u32, 2), ring.sq_ready());
|
|
try testing.expectEqual(@as(u32, 2), try ring.submit_and_wait(2));
|
|
try testing.expectEqual(@as(u32, 0), ring.sq_ready());
|
|
try testing.expectEqual(@as(u32, 2), ring.cq_ready());
|
|
|
|
const cqe_sendmsg = try ring.copy_cqe();
|
|
if (cqe_sendmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest;
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x11111111,
|
|
.res = buffer_send.len,
|
|
.flags = 0,
|
|
}, cqe_sendmsg);
|
|
|
|
const cqe_recvmsg = try ring.copy_cqe();
|
|
if (cqe_recvmsg.res == -@as(i32, @intFromEnum(linux.E.INVAL))) return error.SkipZigTest;
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x22222222,
|
|
.res = buffer_recv.len,
|
|
// ignore IORING_CQE_F_SOCK_NONEMPTY since it is set non-deterministically
|
|
.flags = .{ .F_SOCK_NONEMPTY = cqe_recvmsg.flags.F_SOCK_NONEMPTY },
|
|
}, cqe_recvmsg);
|
|
|
|
try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]);
|
|
}
|
|
|
|
test "timeout (after a relative time)" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
const io = std.testing.io;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const ms = 10;
|
|
const margin = 5;
|
|
const ts: linux.kernel_timespec = .{ .sec = 0, .nsec = ms * 1000000 };
|
|
|
|
const started = try std.Io.Clock.awake.now(io);
|
|
const sqe = try ring.timeout(0x55555555, &ts, 0, 0);
|
|
try testing.expectEqual(Op.TIMEOUT, sqe.opcode);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
const cqe = try ring.copy_cqe();
|
|
const stopped = try std.Io.Clock.awake.now(io);
|
|
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x55555555,
|
|
.res = -@as(i32, @intFromEnum(linux.E.TIME)),
|
|
.flags = 0,
|
|
}, cqe);
|
|
|
|
// Tests should not depend on timings: skip test if outside margin.
|
|
const ms_elapsed = started.durationTo(stopped).toMilliseconds();
|
|
if (ms_elapsed < ms - margin or ms_elapsed > ms + margin) return error.SkipZigTest;
|
|
}
|
|
|
|
test "timeout (after a number of completions)" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(2, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 };
|
|
const count_completions: u64 = 1;
|
|
const sqe_timeout = try ring.timeout(0x66666666, &ts, count_completions, 0);
|
|
try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode);
|
|
try testing.expectEqual(count_completions, sqe_timeout.off);
|
|
_ = try ring.nop(0x77777777);
|
|
try testing.expectEqual(@as(u32, 2), try ring.submit());
|
|
|
|
const cqe_nop = try ring.copy_cqe();
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x77777777,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, cqe_nop);
|
|
|
|
const cqe_timeout = try ring.copy_cqe();
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x66666666,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, cqe_timeout);
|
|
}
|
|
|
|
test "timeout_remove" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(2, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const ts: linux.kernel_timespec = .{ .sec = 3, .nsec = 0 };
|
|
const sqe_timeout = try ring.timeout(0x88888888, &ts, 0, 0);
|
|
try testing.expectEqual(Op.TIMEOUT, sqe_timeout.opcode);
|
|
try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout.user_data);
|
|
|
|
const sqe_timeout_remove = try ring.timeout_remove(0x99999999, 0x88888888, 0);
|
|
try testing.expectEqual(Op.TIMEOUT_REMOVE, sqe_timeout_remove.opcode);
|
|
try testing.expectEqual(@as(u64, 0x88888888), sqe_timeout_remove.addr);
|
|
try testing.expectEqual(@as(u64, 0x99999999), sqe_timeout_remove.user_data);
|
|
|
|
try testing.expectEqual(@as(u32, 2), try ring.submit());
|
|
|
|
// The order in which the CQEs arrive is not clearly documented and it changed with kernel 5.18:
|
|
// * kernel 5.10 gives user data 0x88888888 first, 0x99999999 second
|
|
// * kernel 5.18 gives user data 0x99999999 first, 0x88888888 second
|
|
|
|
var cqes: [2]Cqe = undefined;
|
|
cqes[0] = try ring.copy_cqe();
|
|
cqes[1] = try ring.copy_cqe();
|
|
|
|
for (cqes) |cqe| {
|
|
// IORING_OP_TIMEOUT_REMOVE is not supported by this kernel version:
|
|
// Timeout remove operations set the fd to -1, which results in EBADF before EINVAL.
|
|
// We use IORING_FEAT_RW_CUR_POS as a safety check here to make sure we are at least pre-5.6.
|
|
// We don't want to skip this test for newer kernels.
|
|
if (cqe.user_data == 0x99999999 and
|
|
cqe.err() == .BADF and
|
|
!ring.features.RW_CUR_POS)
|
|
{
|
|
return error.SkipZigTest;
|
|
}
|
|
|
|
try testing.expect(cqe.user_data == 0x88888888 or cqe.user_data == 0x99999999);
|
|
|
|
if (cqe.user_data == 0x88888888) {
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x88888888,
|
|
.res = -@as(i32, @intFromEnum(linux.E.CANCELED)),
|
|
.flags = 0,
|
|
}, cqe);
|
|
} else if (cqe.user_data == 0x99999999) {
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x99999999,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, cqe);
|
|
}
|
|
}
|
|
}
|
|
|
|
test "accept/connect/recv/link_timeout" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(16, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const socket_test_harness = try createSocketTestHarness(&ring);
|
|
defer socket_test_harness.close();
|
|
|
|
var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 };
|
|
|
|
const sqe_recv = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0);
|
|
sqe_recv.flags.IO_LINK = true;
|
|
|
|
const ts = linux.kernel_timespec{ .sec = 0, .nsec = 1000000 };
|
|
_ = try ring.link_timeout(0x22222222, &ts, 0);
|
|
|
|
const nr_wait = try ring.submit();
|
|
try testing.expectEqual(@as(u32, 2), nr_wait);
|
|
|
|
var i: usize = 0;
|
|
while (i < nr_wait) : (i += 1) {
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.user_data) {
|
|
0xffffffff => {
|
|
if (cqe.res != -@as(i32, @intFromEnum(linux.E.INTR)) and
|
|
cqe.res != -@as(i32, @intFromEnum(linux.E.CANCELED)))
|
|
{
|
|
std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res });
|
|
try testing.expect(false);
|
|
}
|
|
},
|
|
0x22222222 => {
|
|
if (cqe.res != -@as(i32, @intFromEnum(linux.E.ALREADY)) and
|
|
cqe.res != -@as(i32, @intFromEnum(linux.E.TIME)))
|
|
{
|
|
std.debug.print("Req 0x{x} got {d}\n", .{ cqe.user_data, cqe.res });
|
|
try testing.expect(false);
|
|
}
|
|
},
|
|
else => @panic("should not happen"),
|
|
}
|
|
}
|
|
}
|
|
|
|
test "fallocate" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
defer tmp.cleanup();
|
|
|
|
const path = "test_io_uring_fallocate";
|
|
const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 });
|
|
defer file.close();
|
|
|
|
try testing.expectEqual(@as(u64, 0), (try file.stat()).size);
|
|
|
|
const len: u64 = 65536;
|
|
const sqe = try ring.fallocate(0xaaaaaaaa, file.handle, 0, 0, len);
|
|
try testing.expectEqual(Op.FALLOCATE, sqe.opcode);
|
|
try testing.expectEqual(file.handle, sqe.fd);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
// This kernel's io_uring does not yet implement fallocate():
|
|
.INVAL => return error.SkipZigTest,
|
|
// This kernel does not implement fallocate():
|
|
.NOSYS => return error.SkipZigTest,
|
|
// The filesystem containing the file referred to by fd does not support this operation;
|
|
// or the mode is not supported by the filesystem containing the file referred to by fd:
|
|
.OPNOTSUPP => return error.SkipZigTest,
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xaaaaaaaa,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, cqe);
|
|
|
|
try testing.expectEqual(len, (try file.stat()).size);
|
|
}
|
|
|
|
test "statx" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
defer tmp.cleanup();
|
|
const path = "test_io_uring_statx";
|
|
const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 });
|
|
defer file.close();
|
|
|
|
try testing.expectEqual(@as(u64, 0), (try file.stat()).size);
|
|
|
|
try file.writeAll("foobar");
|
|
|
|
var buf: linux.Statx = undefined;
|
|
const sqe = try ring.statx(
|
|
0xaaaaaaaa,
|
|
tmp.dir.fd,
|
|
path,
|
|
0,
|
|
linux.STATX_SIZE,
|
|
&buf,
|
|
);
|
|
try testing.expectEqual(Op.STATX, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
// This kernel's io_uring does not yet implement statx():
|
|
.INVAL => return error.SkipZigTest,
|
|
// This kernel does not implement statx():
|
|
.NOSYS => return error.SkipZigTest,
|
|
// The filesystem containing the file referred to by fd does not support this operation;
|
|
// or the mode is not supported by the filesystem containing the file referred to by fd:
|
|
.OPNOTSUPP => return error.SkipZigTest,
|
|
// not supported on older kernels (5.4)
|
|
.BADF => return error.SkipZigTest,
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xaaaaaaaa,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, cqe);
|
|
|
|
try testing.expect(buf.mask & linux.STATX_SIZE == linux.STATX_SIZE);
|
|
try testing.expectEqual(@as(u64, 6), buf.size);
|
|
}
|
|
|
|
test "accept/connect/recv/cancel" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(16, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const socket_test_harness = try createSocketTestHarness(&ring);
|
|
defer socket_test_harness.close();
|
|
|
|
var buffer_recv = [_]u8{ 0, 1, 0, 1, 0 };
|
|
|
|
_ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const sqe_cancel = try ring.cancel(0x99999999, 0xffffffff, 0);
|
|
try testing.expectEqual(Op.ASYNC_CANCEL, sqe_cancel.opcode);
|
|
try testing.expectEqual(@as(u64, 0xffffffff), sqe_cancel.addr);
|
|
try testing.expectEqual(@as(u64, 0x99999999), sqe_cancel.user_data);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
var cqe_recv = try ring.copy_cqe();
|
|
if (cqe_recv.err() == .INVAL) return error.SkipZigTest;
|
|
var cqe_cancel = try ring.copy_cqe();
|
|
if (cqe_cancel.err() == .INVAL) return error.SkipZigTest;
|
|
|
|
// The recv/cancel CQEs may arrive in any order; the recv CQE will sometimes come first:
|
|
if (cqe_recv.user_data == 0x99999999 and cqe_cancel.user_data == 0xffffffff) {
|
|
const a = cqe_recv;
|
|
const b = cqe_cancel;
|
|
cqe_recv = b;
|
|
cqe_cancel = a;
|
|
}
|
|
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xffffffff,
|
|
.res = -@as(i32, @intFromEnum(linux.E.CANCELED)),
|
|
.flags = 0,
|
|
}, cqe_recv);
|
|
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x99999999,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, cqe_cancel);
|
|
}
|
|
|
|
test "register_files_update" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
|
|
defer posix.close(fd);
|
|
|
|
var registered_fds = [_]linux.fd_t{0} ** 2;
|
|
const fd_index = 0;
|
|
const fd_index2 = 1;
|
|
registered_fds[fd_index] = fd;
|
|
registered_fds[fd_index2] = -1;
|
|
|
|
ring.register_files(registered_fds[0..]) catch |err| switch (err) {
|
|
// Happens when the kernel doesn't support sparse entry (-1) in the file descriptors array.
|
|
error.FileDescriptorInvalid => return error.SkipZigTest,
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
};
|
|
|
|
// Test IORING_REGISTER_FILES_UPDATE
|
|
// Only available since Linux 5.5
|
|
|
|
const fd2 = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
|
|
defer posix.close(fd2);
|
|
|
|
registered_fds[fd_index] = fd2;
|
|
registered_fds[fd_index2] = -1;
|
|
try ring.register_files_update(0, registered_fds[0..]);
|
|
|
|
var buffer = [_]u8{42} ** 128;
|
|
{
|
|
const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0);
|
|
try testing.expectEqual(Op.READ, sqe.opcode);
|
|
sqe.flags.FIXED_FILE = true;
|
|
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xcccccccc,
|
|
.res = buffer.len,
|
|
.flags = 0,
|
|
}, try ring.copy_cqe());
|
|
try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]);
|
|
}
|
|
|
|
// Test with a non-zero offset
|
|
|
|
registered_fds[fd_index] = -1;
|
|
registered_fds[fd_index2] = -1;
|
|
try ring.register_files_update(1, registered_fds[1..]);
|
|
|
|
{
|
|
// Next read should still work since fd_index in the registered file descriptors hasn't been updated yet.
|
|
const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0);
|
|
try testing.expectEqual(Op.READ, sqe.opcode);
|
|
sqe.flags.FIXED_FILE = true;
|
|
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xcccccccc,
|
|
.res = buffer.len,
|
|
.flags = 0,
|
|
}, try ring.copy_cqe());
|
|
try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer.len), buffer[0..]);
|
|
}
|
|
|
|
try ring.register_files_update(0, registered_fds[0..]);
|
|
|
|
{
|
|
// Now this should fail since both fds are sparse (-1)
|
|
const sqe = try ring.read(0xcccccccc, fd_index, .{ .buffer = &buffer }, 0);
|
|
try testing.expectEqual(Op.READ, sqe.opcode);
|
|
sqe.flags.FIXED_FILE = true;
|
|
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
const cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(linux.E.BADF, cqe.err());
|
|
}
|
|
|
|
try ring.unregister_files();
|
|
}
|
|
|
|
test "shutdown" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(16, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var address: linux.sockaddr.in = .{
|
|
.port = 0,
|
|
.addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
|
|
};
|
|
|
|
// Socket bound, expect shutdown to work
|
|
{
|
|
const server = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
|
|
defer posix.close(server);
|
|
try posix.setsockopt(server, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1)));
|
|
try posix.bind(server, addrAny(&address), @sizeOf(linux.sockaddr.in));
|
|
try posix.listen(server, 1);
|
|
|
|
// set address to the OS-chosen IP/port.
|
|
var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in);
|
|
try posix.getsockname(server, addrAny(&address), &slen);
|
|
|
|
const shutdown_sqe = try ring.shutdown(0x445445445, server, linux.SHUT.RD);
|
|
try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode);
|
|
try testing.expectEqual(@as(i32, server), shutdown_sqe.fd);
|
|
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
// This kernel's io_uring does not yet implement shutdown (kernel version < 5.11)
|
|
.INVAL => return error.SkipZigTest,
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x445445445,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, cqe);
|
|
}
|
|
|
|
// Socket not bound, expect to fail with ENOTCONN
|
|
{
|
|
const server = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
|
|
defer posix.close(server);
|
|
|
|
const shutdown_sqe = ring.shutdown(0x445445445, server, linux.SHUT.RD) catch |err| switch (err) {
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
};
|
|
try testing.expectEqual(Op.SHUTDOWN, shutdown_sqe.opcode);
|
|
try testing.expectEqual(@as(i32, server), shutdown_sqe.fd);
|
|
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(@as(u64, 0x445445445), cqe.user_data);
|
|
try testing.expectEqual(linux.E.NOTCONN, cqe.err());
|
|
}
|
|
}
|
|
|
|
test "renameat" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const old_path = "test_io_uring_renameat_old";
|
|
const new_path = "test_io_uring_renameat_new";
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
defer tmp.cleanup();
|
|
|
|
// Write old file with data
|
|
|
|
const old_file = try tmp.dir.createFile(old_path, .{ .truncate = true, .mode = 0o666 });
|
|
defer old_file.close();
|
|
try old_file.writeAll("hello");
|
|
|
|
// Submit renameat
|
|
|
|
const sqe = try ring.renameat(
|
|
0x12121212,
|
|
tmp.dir.fd,
|
|
old_path,
|
|
tmp.dir.fd,
|
|
new_path,
|
|
0,
|
|
);
|
|
try testing.expectEqual(Op.RENAMEAT, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
|
|
try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len)));
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
// This kernel's io_uring does not yet implement renameat (kernel version < 5.11)
|
|
.BADF, .INVAL => return error.SkipZigTest,
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x12121212,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, cqe);
|
|
|
|
// Validate that the old file doesn't exist anymore
|
|
try testing.expectError(error.FileNotFound, tmp.dir.openFile(old_path, .{}));
|
|
|
|
// Validate that the new file exists with the proper content
|
|
var new_file_data: [16]u8 = undefined;
|
|
try testing.expectEqualStrings("hello", try tmp.dir.readFile(new_path, &new_file_data));
|
|
}
|
|
|
|
test "unlinkat" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const path = "test_io_uring_unlinkat";
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
defer tmp.cleanup();
|
|
|
|
// Create the file to be unlinked
|
|
|
|
const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 });
|
|
defer file.close();
|
|
|
|
// Submit unlinkat
|
|
|
|
const sqe = try ring.unlinkat(
|
|
0x12121212,
|
|
tmp.dir.fd,
|
|
path,
|
|
0,
|
|
);
|
|
try testing.expectEqual(Op.UNLINKAT, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
// This kernel's io_uring does not yet implement unlinkat (kernel version < 5.11)
|
|
.BADF, .INVAL => return error.SkipZigTest,
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x12121212,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, cqe);
|
|
|
|
// Validate that the file doesn't exist anymore
|
|
_ = tmp.dir.openFile(path, .{}) catch |err| switch (err) {
|
|
error.FileNotFound => {},
|
|
else => std.debug.panic("unexpected error: {}", .{err}),
|
|
};
|
|
}
|
|
|
|
test "mkdirat" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
defer tmp.cleanup();
|
|
|
|
const path = "test_io_uring_mkdirat";
|
|
|
|
// Submit mkdirat
|
|
|
|
const sqe = try ring.mkdirat(
|
|
0x12121212,
|
|
tmp.dir.fd,
|
|
path,
|
|
0o0755,
|
|
);
|
|
try testing.expectEqual(Op.MKDIRAT, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
// This kernel's io_uring does not yet implement mkdirat (kernel version < 5.15)
|
|
.BADF, .INVAL => return error.SkipZigTest,
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x12121212,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, cqe);
|
|
|
|
// Validate that the directory exists
|
|
_ = try tmp.dir.openDir(path, .{});
|
|
}
|
|
|
|
test "symlinkat" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
defer tmp.cleanup();
|
|
|
|
const path = "test_io_uring_symlinkat";
|
|
const link_path = "test_io_uring_symlinkat_link";
|
|
|
|
const file = try tmp.dir.createFile(path, .{ .truncate = true, .mode = 0o666 });
|
|
defer file.close();
|
|
|
|
// Submit symlinkat
|
|
|
|
const sqe = try ring.symlinkat(
|
|
0x12121212,
|
|
path,
|
|
tmp.dir.fd,
|
|
link_path,
|
|
);
|
|
try testing.expectEqual(Op.SYMLINKAT, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
// This kernel's io_uring does not yet implement symlinkat (kernel version < 5.15)
|
|
.BADF, .INVAL => return error.SkipZigTest,
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x12121212,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, cqe);
|
|
|
|
// Validate that the symlink exists
|
|
_ = try tmp.dir.openFile(link_path, .{});
|
|
}
|
|
|
|
test "linkat" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
defer tmp.cleanup();
|
|
|
|
const first_path = "test_io_uring_linkat_first";
|
|
const second_path = "test_io_uring_linkat_second";
|
|
|
|
// Write file with data
|
|
|
|
const first_file = try tmp.dir.createFile(first_path, .{ .truncate = true, .mode = 0o666 });
|
|
defer first_file.close();
|
|
try first_file.writeAll("hello");
|
|
|
|
// Submit linkat
|
|
|
|
const sqe = try ring.linkat(
|
|
0x12121212,
|
|
tmp.dir.fd,
|
|
first_path,
|
|
tmp.dir.fd,
|
|
second_path,
|
|
0,
|
|
);
|
|
try testing.expectEqual(Op.LINKAT, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, tmp.dir.fd), sqe.fd);
|
|
try testing.expectEqual(@as(i32, tmp.dir.fd), @as(i32, @bitCast(sqe.len)));
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
// This kernel's io_uring does not yet implement linkat (kernel version < 5.15)
|
|
.BADF, .INVAL => return error.SkipZigTest,
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0x12121212,
|
|
.res = 0,
|
|
.flags = 0,
|
|
}, cqe);
|
|
|
|
// Validate the second file
|
|
var second_file_data: [16]u8 = undefined;
|
|
try testing.expectEqualStrings("hello", try tmp.dir.readFile(second_path, &second_file_data));
|
|
}
|
|
|
|
test "provide_buffers: read" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
|
|
defer posix.close(fd);
|
|
|
|
const group_id = 1337;
|
|
const buffer_id = 0;
|
|
|
|
const buffer_len = 128;
|
|
|
|
var buffers: [4][buffer_len]u8 = undefined;
|
|
|
|
// Provide 4 buffers
|
|
|
|
{
|
|
const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id);
|
|
try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, buffers.len), sqe.fd);
|
|
try testing.expectEqual(@as(u32, buffers[0].len), sqe.len);
|
|
try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
// Happens when the kernel is < 5.7
|
|
.INVAL, .BADF => return error.SkipZigTest,
|
|
.SUCCESS => {},
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data);
|
|
}
|
|
|
|
// Do 4 reads which should consume all buffers
|
|
|
|
var i: usize = 0;
|
|
while (i < buffers.len) : (i += 1) {
|
|
const sqe = try ring.read(0xdededede, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0);
|
|
try testing.expectEqual(Op.READ, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, fd), sqe.fd);
|
|
try testing.expectEqual(@as(u64, 0), sqe.addr);
|
|
try testing.expectEqual(@as(u32, buffer_len), sqe.len);
|
|
try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
|
|
try testing.expect(cqe.flags.F_BUFFER);
|
|
const used_buffer_id = try cqe.buffer_id();
|
|
try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3);
|
|
try testing.expectEqual(@as(i32, buffer_len), cqe.res);
|
|
|
|
try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data);
|
|
try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]);
|
|
}
|
|
|
|
// This read should fail
|
|
|
|
{
|
|
const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0);
|
|
try testing.expectEqual(Op.READ, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, fd), sqe.fd);
|
|
try testing.expectEqual(@as(u64, 0), sqe.addr);
|
|
try testing.expectEqual(@as(u32, buffer_len), sqe.len);
|
|
try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
// Expected
|
|
.NOBUFS => {},
|
|
.SUCCESS => std.debug.panic("unexpected success", .{}),
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data);
|
|
}
|
|
|
|
// Provide 1 buffer again
|
|
|
|
// Deliberately put something we don't expect in the buffers
|
|
@memset(mem.sliceAsBytes(&buffers), 42);
|
|
|
|
const reprovided_buffer_id = 2;
|
|
|
|
{
|
|
_ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
}
|
|
|
|
// Final read which should work
|
|
|
|
{
|
|
const sqe = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0);
|
|
try testing.expectEqual(Op.READ, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, fd), sqe.fd);
|
|
try testing.expectEqual(@as(u64, 0), sqe.addr);
|
|
try testing.expectEqual(@as(u32, buffer_len), sqe.len);
|
|
try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
|
|
try testing.expect(cqe.flags.F_BUFFER);
|
|
const used_buffer_id = try cqe.buffer_id();
|
|
try testing.expectEqual(used_buffer_id, reprovided_buffer_id);
|
|
try testing.expectEqual(@as(i32, buffer_len), cqe.res);
|
|
try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data);
|
|
try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]);
|
|
}
|
|
}
|
|
|
|
test "remove_buffers" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const fd = try posix.openZ("/dev/zero", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0);
|
|
defer posix.close(fd);
|
|
|
|
const group_id = 1337;
|
|
const buffer_id = 0;
|
|
|
|
const buffer_len = 128;
|
|
|
|
var buffers: [4][buffer_len]u8 = undefined;
|
|
|
|
// Provide 4 buffers
|
|
|
|
{
|
|
_ = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.INVAL, .BADF => return error.SkipZigTest,
|
|
.SUCCESS => {},
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data);
|
|
}
|
|
|
|
// Remove 3 buffers
|
|
|
|
{
|
|
const sqe = try ring.remove_buffers(0xbababababa, 3, group_id);
|
|
try testing.expectEqual(Op.REMOVE_BUFFERS, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, 3), sqe.fd);
|
|
try testing.expectEqual(@as(u64, 0), sqe.addr);
|
|
try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
try testing.expectEqual(@as(u64, 0xbababababa), cqe.user_data);
|
|
}
|
|
|
|
// This read should work
|
|
|
|
{
|
|
_ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
|
|
try testing.expect(cqe.flags.F_BUFFER);
|
|
const used_buffer_id = try cqe.buffer_id();
|
|
try testing.expect(used_buffer_id >= 0 and used_buffer_id < 4);
|
|
try testing.expectEqual(@as(i32, buffer_len), cqe.res);
|
|
try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data);
|
|
try testing.expectEqualSlices(u8, &([_]u8{0} ** buffer_len), buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))]);
|
|
}
|
|
|
|
// Final read should _not_ work
|
|
|
|
{
|
|
_ = try ring.read(0xdfdfdfdf, fd, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
// Expected
|
|
.NOBUFS => {},
|
|
.SUCCESS => std.debug.panic("unexpected success", .{}),
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
}
|
|
}
|
|
|
|
test "provide_buffers: accept/connect/send/recv" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(16, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const group_id = 1337;
|
|
const buffer_id = 0;
|
|
|
|
const buffer_len = 128;
|
|
var buffers: [4][buffer_len]u8 = undefined;
|
|
|
|
// Provide 4 buffers
|
|
|
|
{
|
|
const sqe = try ring.provide_buffers(0xcccccccc, @as([*]u8, @ptrCast(&buffers)), buffer_len, buffers.len, group_id, buffer_id);
|
|
try testing.expectEqual(Op.PROVIDE_BUFFERS, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, buffers.len), sqe.fd);
|
|
try testing.expectEqual(@as(u32, buffer_len), sqe.len);
|
|
try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
// Happens when the kernel is < 5.7
|
|
.INVAL => return error.SkipZigTest,
|
|
// Happens on kernel 5.4
|
|
.BADF => return error.SkipZigTest,
|
|
.SUCCESS => {},
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
try testing.expectEqual(@as(u64, 0xcccccccc), cqe.user_data);
|
|
}
|
|
|
|
const socket_test_harness = try createSocketTestHarness(&ring);
|
|
defer socket_test_harness.close();
|
|
|
|
// Do 4 send on the socket
|
|
|
|
{
|
|
var i: usize = 0;
|
|
while (i < buffers.len) : (i += 1) {
|
|
_ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'z'} ** buffer_len), 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
}
|
|
|
|
var cqes: [4]Cqe = undefined;
|
|
try testing.expectEqual(@as(u32, 4), try ring.copy_cqes(&cqes, 4));
|
|
}
|
|
|
|
// Do 4 recv which should consume all buffers
|
|
|
|
// Deliberately put something we don't expect in the buffers
|
|
@memset(mem.sliceAsBytes(&buffers), 1);
|
|
|
|
var i: usize = 0;
|
|
while (i < buffers.len) : (i += 1) {
|
|
const sqe = try ring.recv(0xdededede, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0);
|
|
try testing.expectEqual(Op.RECV, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd);
|
|
try testing.expectEqual(@as(u64, 0), sqe.addr);
|
|
try testing.expectEqual(@as(u32, buffer_len), sqe.len);
|
|
try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
|
|
try testing.expectEqual(@as(u32, 0), sqe.rw_flags);
|
|
try testing.expectEqual(.{ .BUFFER_SELECT = true }, sqe.flags);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
|
|
try testing.expect(cqe.flags.F_BUFFER);
|
|
const used_buffer_id = try cqe.buffer_id();
|
|
try testing.expect(used_buffer_id >= 0 and used_buffer_id <= 3);
|
|
try testing.expectEqual(@as(i32, buffer_len), cqe.res);
|
|
|
|
try testing.expectEqual(@as(u64, 0xdededede), cqe.user_data);
|
|
const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))];
|
|
try testing.expectEqualSlices(u8, &([_]u8{'z'} ** buffer_len), buffer);
|
|
}
|
|
|
|
// This recv should fail
|
|
|
|
{
|
|
const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0);
|
|
try testing.expectEqual(Op.RECV, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd);
|
|
try testing.expectEqual(@as(u64, 0), sqe.addr);
|
|
try testing.expectEqual(@as(u32, buffer_len), sqe.len);
|
|
try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
|
|
try testing.expectEqual(@as(u32, 0), sqe.rw_flags);
|
|
try testing.expectEqual(.{ .BUFFER_SELECT = true }, sqe.flags);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
// Expected
|
|
.NOBUFS => {},
|
|
.SUCCESS => std.debug.panic("unexpected success", .{}),
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data);
|
|
}
|
|
|
|
// Provide 1 buffer again
|
|
|
|
const reprovided_buffer_id = 2;
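// Reproviding a single buffer with this id makes it the only selectable buffer in the group,
// so the recv below must pick exactly that one.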
|
|
|
|
{
|
|
_ = try ring.provide_buffers(0xabababab, @as([*]u8, @ptrCast(&buffers[reprovided_buffer_id])), buffer_len, 1, group_id, reprovided_buffer_id);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
}
|
|
|
|
// Redo 1 send on the server socket
|
|
|
|
{
|
|
_ = try ring.send(0xdeaddead, socket_test_harness.server, &([_]u8{'w'} ** buffer_len), 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
_ = try ring.copy_cqe();
|
|
}
|
|
|
|
// Final recv which should work
|
|
|
|
// Deliberately put something we don't expect in the buffers
|
|
@memset(mem.sliceAsBytes(&buffers), 1);
|
|
|
|
{
|
|
const sqe = try ring.recv(0xdfdfdfdf, socket_test_harness.client, .{ .buffer_selection = .{ .group_id = group_id, .len = buffer_len } }, 0);
|
|
try testing.expectEqual(Op.RECV, sqe.opcode);
|
|
try testing.expectEqual(@as(i32, socket_test_harness.client), sqe.fd);
|
|
try testing.expectEqual(@as(u64, 0), sqe.addr);
|
|
try testing.expectEqual(@as(u32, buffer_len), sqe.len);
|
|
try testing.expectEqual(@as(u16, group_id), sqe.buf_index);
|
|
try testing.expectEqual(@as(u32, 0), sqe.rw_flags);
|
|
try testing.expectEqual(.{ .BUFFER_SELECT = true }, sqe.flags);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
const cqe = try ring.copy_cqe();
|
|
switch (cqe.err()) {
|
|
.SUCCESS => {},
|
|
else => |errno| std.debug.panic("unhandled errno: {}", .{errno}),
|
|
}
|
|
|
|
try testing.expect(cqe.flags.F_BUFFER);
|
|
const used_buffer_id = try cqe.buffer_id();
|
|
try testing.expectEqual(used_buffer_id, reprovided_buffer_id);
|
|
try testing.expectEqual(@as(i32, buffer_len), cqe.res);
|
|
try testing.expectEqual(@as(u64, 0xdfdfdfdf), cqe.user_data);
|
|
const buffer = buffers[used_buffer_id][0..@as(usize, @intCast(cqe.res))];
|
|
try testing.expectEqualSlices(u8, &([_]u8{'w'} ** buffer_len), buffer);
|
|
}
|
|
}
|
|
|
|
/// Used for testing server/client interactions.
|
|
const SocketTestHarness = struct {
|
|
listener: posix.socket_t,
|
|
server: posix.socket_t,
|
|
client: posix.socket_t,
|
|
|
|
fn close(self: SocketTestHarness) void {
|
|
posix.close(self.client);
|
|
posix.close(self.listener);
|
|
}
|
|
};
|
|
|
|
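// Creates a loopback listener, submits an accept and a connect through the ring,
// and returns the listener, the accepted (server) socket, and the connected client.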
fn createSocketTestHarness(ring: *IoUring) !SocketTestHarness {
|
|
// Create a TCP server socket
|
|
var address: linux.sockaddr.in = .{
|
|
.port = 0,
|
|
.addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
|
|
};
|
|
const listener_socket = try createListenerSocket(&address);
|
|
errdefer posix.close(listener_socket);
|
|
|
|
// Submit 1 accept
|
|
var accept_addr: posix.sockaddr = undefined;
|
|
var accept_addr_len: posix.socklen_t = @sizeOf(@TypeOf(accept_addr));
|
|
_ = try ring.accept(0xaaaaaaaa, listener_socket, &accept_addr, &accept_addr_len, 0);
|
|
|
|
// Create a TCP client socket
|
|
const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
|
|
errdefer posix.close(client);
|
|
_ = try ring.connect(0xcccccccc, client, addrAny(&address), @sizeOf(linux.sockaddr.in));
|
|
|
|
try testing.expectEqual(@as(u32, 2), try ring.submit());
|
|
|
|
var cqe_accept = try ring.copy_cqe();
|
|
if (cqe_accept.err() == .INVAL) return error.SkipZigTest;
|
|
var cqe_connect = try ring.copy_cqe();
|
|
if (cqe_connect.err() == .INVAL) return error.SkipZigTest;
|
|
|
|
// The accept/connect CQEs may arrive in any order; the connect CQE will sometimes come first:
|
|
if (cqe_accept.user_data == 0xcccccccc and cqe_connect.user_data == 0xaaaaaaaa) {
|
|
const a = cqe_accept;
|
|
const b = cqe_connect;
|
|
cqe_accept = b;
|
|
cqe_connect = a;
|
|
}
|
|
|
|
try testing.expectEqual(@as(u64, 0xaaaaaaaa), cqe_accept.user_data);
|
|
if (cqe_accept.res <= 0) std.debug.print("\ncqe_accept.res={}\n", .{cqe_accept.res});
|
|
try testing.expect(cqe_accept.res > 0);
|
|
try testing.expectEqual(.{}, cqe_accept.flags);
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xcccccccc,
|
|
.res = 0,
|
|
.flags = .{},
|
|
}, cqe_connect);
|
|
|
|
// All good
|
|
|
|
return SocketTestHarness{
|
|
.listener = listener_socket,
|
|
.server = cqe_accept.res,
|
|
.client = client,
|
|
};
|
|
}
|
|
|
|
fn createListenerSocket(address: *linux.sockaddr.in) !posix.socket_t {
|
|
const kernel_backlog = 1;
|
|
const listener_socket = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
|
|
errdefer posix.close(listener_socket);
|
|
|
|
try posix.setsockopt(listener_socket, posix.SOL.SOCKET, posix.SO.REUSEADDR, &mem.toBytes(@as(c_int, 1)));
|
|
try posix.bind(listener_socket, addrAny(address), @sizeOf(linux.sockaddr.in));
|
|
try posix.listen(listener_socket, kernel_backlog);
|
|
|
|
// set address to the OS-chosen IP/port.
|
|
var slen: posix.socklen_t = @sizeOf(linux.sockaddr.in);
|
|
try posix.getsockname(listener_socket, addrAny(address), &slen);
|
|
|
|
return listener_socket;
|
|
}
|
|
|
|
test "accept multishot" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(16, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var address: linux.sockaddr.in = .{
|
|
.port = 0,
|
|
.addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
|
|
};
|
|
const listener_socket = try createListenerSocket(&address);
|
|
defer posix.close(listener_socket);
|
|
|
|
// submit multishot accept operation
|
|
var addr: posix.sockaddr = undefined;
|
|
var addr_len: posix.socklen_t = @sizeOf(@TypeOf(addr));
|
|
const userdata: u64 = 0xaaaaaaaa;
|
|
_ = try ring.accept_multishot(userdata, listener_socket, &addr, &addr_len, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
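// A single multishot accept SQE stays armed: each incoming connection below produces its own CQE
// with the more flag set, until the operation fails or is cancelled.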
|
|
|
|
var nr: usize = 4; // number of clients to connect
|
|
while (nr > 0) : (nr -= 1) {
|
|
// connect client
|
|
const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
|
|
errdefer posix.close(client);
|
|
try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in));
|
|
|
|
// test accept completion
|
|
var cqe = try ring.copy_cqe();
|
|
if (cqe.err() == .INVAL) return error.SkipZigTest;
|
|
try testing.expect(cqe.res > 0);
|
|
try testing.expect(cqe.user_data == userdata);
|
|
try testing.expect(cqe.flags.F_MORE); // more flag is set
|
|
|
|
posix.close(client);
|
|
}
|
|
}
|
|
|
|
test "accept/connect/send_zc/recv" {
|
|
try skipKernelLessThan(.{ .major = 6, .minor = 0, .patch = 0 });
|
|
|
|
var ring = IoUring.init(16, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const socket_test_harness = try createSocketTestHarness(&ring);
|
|
defer socket_test_harness.close();
|
|
|
|
const buffer_send = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe };
|
|
var buffer_recv = [_]u8{0} ** 10;
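// The recv buffer is deliberately shorter than the send buffer; only the first buffer_recv.len bytes are compared below.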
|
|
|
|
// zero-copy send
|
|
const sqe_send = try ring.send_zc(0xeeeeeeee, socket_test_harness.client, buffer_send[0..], 0, 0);
|
|
sqe_send.flags.IO_LINK = true;
|
|
_ = try ring.recv(0xffffffff, socket_test_harness.server, .{ .buffer = buffer_recv[0..] }, 0);
|
|
try testing.expectEqual(@as(u32, 2), try ring.submit());
|
|
|
|
var cqe_send = try ring.copy_cqe();
|
|
// First completion of zero-copy send.
|
|
// IORING_CQE_F_MORE means that there
|
|
// will be a second completion event / notification for the
|
|
// request, with the user_data field set to the same value.
|
|
// buffer_send must be kept alive until the second cqe.
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xeeeeeeee,
|
|
.res = buffer_send.len,
|
|
.flags = .{ .F_MORE = true },
|
|
}, cqe_send);
|
|
|
|
cqe_send, const cqe_recv = brk: {
|
|
const cqe1 = try ring.copy_cqe();
|
|
const cqe2 = try ring.copy_cqe();
|
|
break :brk if (cqe1.user_data == 0xeeeeeeee) .{ cqe1, cqe2 } else .{ cqe2, cqe1 };
|
|
};
|
|
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xffffffff,
|
|
.res = buffer_recv.len,
|
|
.flags = .{ .F_SOCK_NONEMPTY = cqe_recv.flags.F_SOCK_NONEMPTY },
|
|
}, cqe_recv);
|
|
try testing.expectEqualSlices(u8, buffer_send[0..buffer_recv.len], buffer_recv[0..]);
|
|
|
|
// Second completion of zero-copy send.
|
|
// IORING_CQE_F_NOTIF in flags signals that kernel is done with send_buffer
|
|
try testing.expectEqual(Cqe{
|
|
.user_data = 0xeeeeeeee,
|
|
.res = 0,
|
|
.flags = .{ .F_NOTIF = true },
|
|
}, cqe_send);
|
|
}
|
|
|
|
test "accept_direct" {
|
|
try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 });
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
var address: linux.sockaddr.in = .{
|
|
.port = 0,
|
|
.addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
|
|
};
|
|
|
|
// register direct file descriptors
|
|
var registered_fds = [_]linux.fd_t{-1} ** 2;
|
|
try ring.register_files(registered_fds[0..]);
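// The table starts with -1 entries (empty slots); accept_direct fills a free slot and reports
// the slot index in cqe.res instead of a plain fd.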
|
|
|
|
const listener_socket = try createListenerSocket(&address);
|
|
defer posix.close(listener_socket);
|
|
|
|
const accept_userdata: u64 = 0xaaaaaaaa;
|
|
const read_userdata: u64 = 0xbbbbbbbb;
|
|
const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe };
|
|
|
|
for (0..2) |_| {
|
|
for (registered_fds, 0..) |_, i| {
|
|
var buffer_recv = [_]u8{0} ** 16;
|
|
const buffer_send: []const u8 = data[0 .. data.len - i]; // make it different at each loop
|
|
|
|
// submit accept, which will choose a registered fd and return its index in the cqe
|
|
_ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
// connect
|
|
const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
|
|
try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in));
|
|
defer posix.close(client);
|
|
|
|
// accept completion
|
|
const cqe_accept = try ring.copy_cqe();
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err());
|
|
const fd_index = cqe_accept.res;
|
|
try testing.expect(fd_index < registered_fds.len);
|
|
try testing.expect(cqe_accept.user_data == accept_userdata);
|
|
|
|
// send data
|
|
_ = try posix.send(client, buffer_send, 0);
|
|
|
|
// Example of how to use registered fd:
|
|
// Submit receive to fixed file returned by accept (fd_index).
|
|
// Fd field is set to registered file index, returned by accept.
|
|
// Flag linux.IOSQE_FIXED_FILE must be set.
|
|
const recv_sqe = try ring.recv(read_userdata, fd_index, .{ .buffer = &buffer_recv }, 0);
|
|
recv_sqe.flags.FIXED_FILE = true;
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
// receive completion
|
|
const recv_cqe = try ring.copy_cqe();
|
|
try testing.expect(recv_cqe.user_data == read_userdata);
|
|
try testing.expect(recv_cqe.res == buffer_send.len);
|
|
try testing.expectEqualSlices(u8, buffer_send, buffer_recv[0..buffer_send.len]);
|
|
}
|
|
// no more available fds, accept will get NFILE error
|
|
{
|
|
// submit accept
|
|
_ = try ring.accept_direct(accept_userdata, listener_socket, null, null, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
// connect
|
|
const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
|
|
try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in));
|
|
defer posix.close(client);
|
|
// completion with error
|
|
const cqe_accept = try ring.copy_cqe();
|
|
try testing.expect(cqe_accept.user_data == accept_userdata);
|
|
try testing.expectEqual(posix.E.NFILE, cqe_accept.err());
|
|
}
|
|
// return file descriptors to kernel
|
|
try ring.register_files_update(0, registered_fds[0..]);
|
|
}
|
|
try ring.unregister_files();
|
|
}
|
|
|
|
test "accept_multishot_direct" {
|
|
try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 });
|
|
|
|
if (builtin.cpu.arch == .riscv64) {
|
|
// https://github.com/ziglang/zig/issues/25734
|
|
return error.SkipZigTest;
|
|
}
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var address: linux.sockaddr.in = .{
|
|
.port = 0,
|
|
.addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
|
|
};
|
|
|
|
var registered_fds = [_]linux.fd_t{-1} ** 2;
|
|
try ring.register_files(registered_fds[0..]);
|
|
|
|
const listener_socket = try createListenerSocket(&address);
|
|
defer posix.close(listener_socket);
|
|
|
|
const accept_userdata: u64 = 0xaaaaaaaa;
|
|
|
|
for (0..2) |_| {
|
|
// submit multishot accept
|
|
// Will choose a registered fd and return the index of the selected registered file in the cqe.
|
|
_ = try ring.accept_multishot_direct(accept_userdata, listener_socket, null, null, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
for (registered_fds) |_| {
|
|
// connect
|
|
const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
|
|
try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in));
|
|
defer posix.close(client);
|
|
|
|
// accept completion
|
|
const cqe_accept = try ring.copy_cqe();
|
|
const fd_index = cqe_accept.res;
|
|
try testing.expect(fd_index < registered_fds.len);
|
|
try testing.expect(cqe_accept.user_data == accept_userdata);
|
|
try testing.expect(cqe_accept.flags.F_MORE); // has more is set
|
|
}
|
|
// No more available fds, accept will get NFILE error.
|
|
// Multishot is terminated (more flag is not set).
|
|
{
|
|
// connect
|
|
const client = try posix.socket(address.family, posix.SOCK.STREAM | posix.SOCK.CLOEXEC, 0);
|
|
try posix.connect(client, addrAny(&address), @sizeOf(linux.sockaddr.in));
|
|
defer posix.close(client);
|
|
// completion with error
|
|
const cqe_accept = try ring.copy_cqe();
|
|
try testing.expect(cqe_accept.user_data == accept_userdata);
|
|
try testing.expectEqual(posix.E.NFILE, cqe_accept.err());
|
|
try testing.expect(!cqe_accept.flags.F_MORE); // has more is not set
|
|
}
|
|
// return file descriptors to kernel
|
|
try ring.register_files_update(0, registered_fds[0..]);
|
|
}
|
|
try ring.unregister_files();
|
|
}
|
|
|
|
test "socket" {
|
|
try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 });
|
|
|
|
var ring = IoUring.init(1, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
// prepare, submit socket operation
|
|
_ = try ring.socket(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
// test completion
|
|
var cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
const fd: linux.fd_t = @intCast(cqe.res);
|
|
try testing.expect(fd > 2);
|
|
|
|
posix.close(fd);
|
|
}
|
|
|
|
test "socket_direct/socket_direct_alloc/close_direct" {
|
|
try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 });
|
|
|
|
var ring = IoUring.init(2, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var registered_fds = [_]linux.fd_t{-1} ** 3;
|
|
try ring.register_files(registered_fds[0..]);
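// socket_direct installs the new socket straight into the registered file table: cqe.res is 0
// when an explicit index is passed, and the allocated index for socket_direct_alloc.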
|
|
|
|
// create socket in registered file descriptor at index 0 (last param)
|
|
_ = try ring.socket_direct(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
var cqe_socket = try ring.copy_cqe();
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err());
|
|
try testing.expect(cqe_socket.res == 0);
|
|
|
|
// create socket in registered file descriptor at index 1 (last param)
|
|
_ = try ring.socket_direct(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0, 1);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
cqe_socket = try ring.copy_cqe();
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err());
|
|
try testing.expect(cqe_socket.res == 0); // res is 0 when index is specified
|
|
|
|
// create socket at a kernel-chosen registered file index (_alloc version)
|
|
// completion res has index from registered files
|
|
_ = try ring.socket_direct_alloc(0, linux.AF.INET, posix.SOCK.STREAM, 0, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
cqe_socket = try ring.copy_cqe();
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe_socket.err());
|
|
try testing.expect(cqe_socket.res == 2); // returns registered file index
|
|
|
|
// use sockets from registered_fds in connect operation
|
|
var address: linux.sockaddr.in = .{
|
|
.port = 0,
|
|
.addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
|
|
};
|
|
const listener_socket = try createListenerSocket(&address);
|
|
defer posix.close(listener_socket);
|
|
const accept_userdata: u64 = 0xaaaaaaaa;
|
|
const connect_userdata: u64 = 0xbbbbbbbb;
|
|
const close_userdata: u64 = 0xcccccccc;
|
|
for (registered_fds, 0..) |_, fd_index| {
|
|
// prepare accept
|
|
_ = try ring.accept(accept_userdata, listener_socket, null, null, 0);
|
|
// prepare connect with fixed socket
|
|
const connect_sqe = try ring.connect(connect_userdata, @intCast(fd_index), addrAny(&address), @sizeOf(linux.sockaddr.in));
|
|
connect_sqe.flags.FIXED_FILE = true; // fd is fixed file index
|
|
// submit both
|
|
try testing.expectEqual(@as(u32, 2), try ring.submit());
|
|
// get completions
|
|
var cqe_connect = try ring.copy_cqe();
|
|
var cqe_accept = try ring.copy_cqe();
|
|
// ignore order
|
|
if (cqe_connect.user_data == accept_userdata and cqe_accept.user_data == connect_userdata) {
|
|
const a = cqe_accept;
|
|
const b = cqe_connect;
|
|
cqe_accept = b;
|
|
cqe_connect = a;
|
|
}
|
|
// test connect completion
|
|
try testing.expect(cqe_connect.user_data == connect_userdata);
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe_connect.err());
|
|
// test accept completion
|
|
try testing.expect(cqe_accept.user_data == accept_userdata);
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe_accept.err());
|
|
|
|
// submit and test close_direct
|
|
_ = try ring.close_direct(close_userdata, @intCast(fd_index));
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
var cqe_close = try ring.copy_cqe();
|
|
try testing.expect(cqe_close.user_data == close_userdata);
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe_close.err());
|
|
}
|
|
|
|
try ring.unregister_files();
|
|
}
|
|
|
|
test "openat_direct/close_direct" {
|
|
try skipKernelLessThan(.{ .major = 5, .minor = 19, .patch = 0 });
|
|
|
|
var ring = IoUring.init(2, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
var registered_fds = [_]linux.fd_t{-1} ** 3;
|
|
try ring.register_files(registered_fds[0..]);
|
|
|
|
var tmp = std.testing.tmpDir(.{});
|
|
defer tmp.cleanup();
|
|
const path = "test_io_uring_close_direct";
|
|
const flags: linux.O = .{ .ACCMODE = .RDWR, .CREAT = true };
|
|
const mode: posix.mode_t = 0o666;
|
|
const user_data: u64 = 0;
|
|
|
|
// use registered file at index 0 (last param)
|
|
_ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
var cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
try testing.expect(cqe.res == 0);
|
|
|
|
// use registered file at index 1
|
|
_ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, 1);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
try testing.expect(cqe.res == 0); // res is 0 when we specify index
|
|
|
|
// let kernel choose registered file index
|
|
_ = try ring.openat_direct(user_data, tmp.dir.fd, path, flags, mode, constants.FILE_INDEX_ALLOC);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
try testing.expect(cqe.res == 2); // chosen index is in res
|
|
|
|
// close all open file descriptors
|
|
for (registered_fds, 0..) |_, fd_index| {
|
|
_ = try ring.close_direct(user_data, @intCast(fd_index));
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
var cqe_close = try ring.copy_cqe();
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe_close.err());
|
|
}
|
|
try ring.unregister_files();
|
|
}
|
|
|
|
test "waitid" {
|
|
try skipKernelLessThan(.{ .major = 6, .minor = 7, .patch = 0 });
|
|
|
|
var ring = IoUring.init(16, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const pid = try posix.fork();
|
|
if (pid == 0) {
|
|
posix.exit(7);
|
|
}
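// Parent process: ask the ring to wait for the child and check that the exit status (7) is reported through siginfo.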
|
|
|
|
var siginfo: posix.siginfo_t = undefined;
|
|
_ = try ring.waitid(0, .PID, pid, &siginfo, posix.W.EXITED, 0);
|
|
|
|
try testing.expectEqual(1, try ring.submit());
|
|
|
|
const cqe_waitid = try ring.copy_cqe();
|
|
try testing.expectEqual(0, cqe_waitid.res);
|
|
try testing.expectEqual(pid, siginfo.fields.common.first.piduid.pid);
|
|
try testing.expectEqual(7, siginfo.fields.common.second.sigchld.status);
|
|
}
|
|
|
|
/// For use in tests. Returns SkipZigTest if kernel version is less than required.
|
|
inline fn skipKernelLessThan(required: std.SemanticVersion) !void {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var uts: linux.utsname = undefined;
|
|
const res = linux.uname(&uts);
|
|
switch (linux.errno(res)) {
|
|
.SUCCESS => {},
|
|
else => |errno| return posix.unexpectedErrno(errno),
|
|
}
|
|
|
|
const release = mem.sliceTo(&uts.release, 0);
|
|
// Strip any extra suffix, as the kernel version might not be semver compliant, e.g. "6.8.9-300.fc40.x86_64"
|
|
const extra_index = std.mem.indexOfAny(u8, release, "-+");
|
|
const stripped = release[0..(extra_index orelse release.len)];
|
|
// Make sure the input doesn't rely on the extra we just stripped
|
|
try testing.expect(required.pre == null and required.build == null);
|
|
|
|
var current = try std.SemanticVersion.parse(stripped);
|
|
current.pre = null; // don't check pre field
|
|
if (required.order(current) == .gt) return error.SkipZigTest;
|
|
}
|
|
|
|
test BufferGroup {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
// Init IoUring
|
|
var ring = IoUring.init(16, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
// Init buffer group for ring
|
|
const group_id: u16 = 1; // buffers group id
|
|
const buffers_count: u16 = 1; // number of buffers in buffer group
|
|
const buffer_size: usize = 128; // size of each buffer in group
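// BufferGroup wraps a ring mapped provided buffer ring: recv operations pick one of these buffers
// and report the chosen buffer id through the CQE.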
|
|
var buf_grp = BufferGroup.init(
|
|
&ring,
|
|
testing.allocator,
|
|
group_id,
|
|
buffer_size,
|
|
buffers_count,
|
|
) catch |err| switch (err) {
|
|
// kernel older than 5.19
|
|
error.ArgumentsInvalid => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer buf_grp.deinit(testing.allocator);
|
|
|
|
// Create client/server fds
|
|
const fds = try createSocketTestHarness(&ring);
|
|
defer fds.close();
|
|
const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe };
|
|
|
|
// Client sends data
|
|
{
|
|
_ = try ring.send(1, fds.client, data[0..], 0);
|
|
const submitted = try ring.submit();
|
|
try testing.expectEqual(1, submitted);
|
|
const cqe_send = try ring.copy_cqe();
|
|
if (cqe_send.err() == .INVAL) return error.SkipZigTest;
|
|
try testing.expectEqual(Cqe{ .user_data = 1, .res = data.len, .flags = .{} }, cqe_send);
|
|
}
|
|
|
|
// Server uses buffer group receive
|
|
{
|
|
// Submit recv operation, buffer will be chosen from buffer group
|
|
_ = try buf_grp.recv(2, fds.server, 0);
|
|
const submitted = try ring.submit();
|
|
try testing.expectEqual(1, submitted);
|
|
|
|
// ... when we have completion for recv operation
|
|
const cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(2, cqe.user_data); // matches submitted user_data
|
|
try testing.expect(cqe.res >= 0); // success
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
try testing.expectEqual(data.len, @as(usize, @intCast(cqe.res))); // cqe.res holds received data len
|
|
|
|
// Get buffer from pool
|
|
const buf = try buf_grp.get(cqe);
|
|
try testing.expectEqualSlices(u8, &data, buf);
|
|
// Release buffer to the kernel when application is done with it
|
|
try buf_grp.put(cqe);
|
|
}
|
|
}
|
|
|
|
test "ring mapped buffers recv" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(16, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
// init buffer group
|
|
const group_id: u16 = 1; // buffers group id
|
|
const buffers_count: u16 = 2; // number of buffers in buffer group
|
|
const buffer_size: usize = 4; // size of each buffer in group
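// Buffers are much smaller than the payload, so each recv completes with at most 4 bytes and the data arrives in chunks.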
|
|
var buf_grp = BufferGroup.init(
|
|
&ring,
|
|
testing.allocator,
|
|
group_id,
|
|
buffer_size,
|
|
buffers_count,
|
|
) catch |err| switch (err) {
|
|
// kernel older than 5.19
|
|
error.ArgumentsInvalid => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer buf_grp.deinit(testing.allocator);
|
|
|
|
// create client/server fds
|
|
const fds = try createSocketTestHarness(&ring);
|
|
defer fds.close();
|
|
|
|
// for random user_data in sqe/cqe
|
|
var Rnd = std.Random.DefaultPrng.init(std.testing.random_seed);
|
|
var rnd = Rnd.random();
|
|
|
|
var round: usize = 4; // repeat send/recv cycle round times
|
|
while (round > 0) : (round -= 1) {
|
|
// client sends data
|
|
const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe };
|
|
{
|
|
const user_data = rnd.int(u64);
|
|
_ = try ring.send(user_data, fds.client, data[0..], 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
const cqe_send = try ring.copy_cqe();
|
|
if (cqe_send.err() == .INVAL) return error.SkipZigTest;
|
|
try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send);
|
|
}
|
|
var pos: usize = 0;
|
|
|
|
// read first chunk
|
|
const cqe1 = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64));
|
|
var buf = try buf_grp.get(cqe1);
|
|
try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf);
|
|
pos += buf.len;
|
|
// second chunk
|
|
const cqe2 = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64));
|
|
buf = try buf_grp.get(cqe2);
|
|
try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf);
|
|
pos += buf.len;
|
|
|
|
// both buffers provided to the kernel are in use, so we get the error
|
|
// 'no more buffers' until we return the buffers to the kernel
|
|
{
|
|
const user_data = rnd.int(u64);
|
|
_ = try buf_grp.recv(user_data, fds.server, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
const cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(user_data, cqe.user_data);
|
|
try testing.expect(cqe.res < 0); // fail
|
|
try testing.expectEqual(posix.E.NOBUFS, cqe.err());
|
|
try testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set on success only
|
|
try testing.expectError(error.NoBufferSelected, cqe.buffer_id());
|
|
}
|
|
|
|
// put buffers back to the kernel
|
|
try buf_grp.put(cqe1);
|
|
try buf_grp.put(cqe2);
|
|
|
|
// read remaining data
|
|
while (pos < data.len) {
|
|
const cqe = try buf_grp_recv_submit_get_cqe(&ring, &buf_grp, fds.server, rnd.int(u64));
|
|
buf = try buf_grp.get(cqe);
|
|
try testing.expectEqualSlices(u8, data[pos..][0..buf.len], buf);
|
|
pos += buf.len;
|
|
try buf_grp.put(cqe);
|
|
}
|
|
}
|
|
}
|
|
|
|
test "ring mapped buffers multishot recv" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(16, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
// init buffer group
|
|
const group_id: u16 = 1; // buffers group id
|
|
const buffers_count: u16 = 2; // number of buffers in buffer group
|
|
const buffer_size: usize = 4; // size of each buffer in group
|
|
var buf_grp = BufferGroup.init(
|
|
&ring,
|
|
testing.allocator,
|
|
group_id,
|
|
buffer_size,
|
|
buffers_count,
|
|
) catch |err| switch (err) {
|
|
// kernel older than 5.19
|
|
error.ArgumentsInvalid => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer buf_grp.deinit(testing.allocator);
|
|
|
|
// create client/server fds
|
|
const fds = try createSocketTestHarness(&ring);
|
|
defer fds.close();
|
|
|
|
// for random user_data in sqe/cqe
|
|
var Rnd = std.Random.DefaultPrng.init(std.testing.random_seed);
|
|
var rnd = Rnd.random();
|
|
|
|
var round: usize = 4; // repeat send/recv cycle round times
|
|
while (round > 0) : (round -= 1) {
|
|
// client sends data
|
|
const data = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf };
|
|
{
|
|
const user_data = rnd.int(u64);
|
|
_ = try ring.send(user_data, fds.client, data[0..], 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
const cqe_send = try ring.copy_cqe();
|
|
if (cqe_send.err() == .INVAL) return error.SkipZigTest;
|
|
try testing.expectEqual(Cqe{ .user_data = user_data, .res = data.len, .flags = .{} }, cqe_send);
|
|
}
|
|
|
|
// start multishot recv
|
|
var recv_user_data = rnd.int(u64);
|
|
_ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit
|
|
|
|
// server reads data into provided buffers
|
|
// there are 2 buffers of size 4, so each read gets only chunk of data
|
|
// we read four chunks of 4, 4, 4, 4 bytes each
|
|
var chunk: []const u8 = data[0..buffer_size]; // first chunk
|
|
const cqe1 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk);
|
|
try testing.expect(cqe1.flags.F_MORE);
|
|
|
|
chunk = data[buffer_size .. buffer_size * 2]; // second chunk
|
|
const cqe2 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk);
|
|
try testing.expect(cqe2.flags.F_MORE);
|
|
|
|
// both buffers provided to the kernel are in use, so we get the error
|
|
// 'no more buffers' until we return the buffers to the kernel
|
|
{
|
|
const cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(recv_user_data, cqe.user_data);
|
|
try testing.expect(cqe.res < 0); // fail
|
|
try testing.expectEqual(posix.E.NOBUFS, cqe.err());
|
|
try testing.expect(!cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set on success only
|
|
// has more is not set
|
|
// indicates that multishot is finished
|
|
try testing.expect(!cqe.flags.F_MORE);
|
|
try testing.expectError(error.NoBufferSelected, cqe.buffer_id());
|
|
}
|
|
|
|
// put buffers back to the kernel
|
|
try buf_grp.put(cqe1);
|
|
try buf_grp.put(cqe2);
|
|
|
|
// restart multishot
|
|
recv_user_data = rnd.int(u64);
|
|
_ = try buf_grp.recv_multishot(recv_user_data, fds.server, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit
|
|
|
|
chunk = data[buffer_size * 2 .. buffer_size * 3]; // third chunk
|
|
const cqe3 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk);
|
|
try testing.expect(cqe3.flags.F_MORE);
|
|
try buf_grp.put(cqe3);
|
|
|
|
chunk = data[buffer_size * 3 ..]; // last chunk
|
|
const cqe4 = try expect_buf_grp_cqe(&ring, &buf_grp, recv_user_data, chunk);
|
|
try testing.expect(cqe4.flags.F_MORE);
|
|
try buf_grp.put(cqe4);
|
|
|
|
// cancel pending multishot recv operation
|
|
{
|
|
const cancel_user_data = rnd.int(u64);
|
|
_ = try ring.cancel(cancel_user_data, recv_user_data, 0);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit());
|
|
|
|
// expect completion of cancel operation and completion of recv operation
|
|
var cqe_cancel = try ring.copy_cqe();
|
|
if (cqe_cancel.err() == .INVAL) return error.SkipZigTest;
|
|
var cqe_recv = try ring.copy_cqe();
|
|
if (cqe_recv.err() == .INVAL) return error.SkipZigTest;
|
|
|
|
// don't depend on order of completions
|
|
if (cqe_cancel.user_data == recv_user_data and cqe_recv.user_data == cancel_user_data) {
|
|
const a = cqe_cancel;
|
|
const b = cqe_recv;
|
|
cqe_cancel = b;
|
|
cqe_recv = a;
|
|
}
|
|
|
|
// Note on different kernel results:
|
|
// on older kernel (tested with v6.0.16, v6.1.57, v6.2.12, v6.4.16)
|
|
// cqe_cancel.err() == .NOENT
|
|
// cqe_recv.err() == .NOBUFS
|
|
// on kernel (tested with v6.5.0, v6.5.7)
|
|
// cqe_cancel.err() == .SUCCESS
|
|
// cqe_recv.err() == .CANCELED
|
|
// Upstream reference: https://github.com/axboe/liburing/issues/984
|
|
|
|
// cancel operation is success (or NOENT on older kernels)
|
|
try testing.expectEqual(cancel_user_data, cqe_cancel.user_data);
|
|
try testing.expect(cqe_cancel.err() == .NOENT or cqe_cancel.err() == .SUCCESS);
|
|
|
|
// recv operation is failed with err CANCELED (or NOBUFS on older kernels)
|
|
try testing.expectEqual(recv_user_data, cqe_recv.user_data);
|
|
try testing.expect(cqe_recv.res < 0);
|
|
try testing.expect(cqe_recv.err() == .NOBUFS or cqe_recv.err() == .CANCELED);
|
|
try testing.expect(!cqe_recv.flags.F_MORE);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Prepare, submit recv and get cqe using buffer group.
|
|
fn buf_grp_recv_submit_get_cqe(
|
|
ring: *IoUring,
|
|
buf_grp: *BufferGroup,
|
|
fd: linux.fd_t,
|
|
user_data: u64,
|
|
) !Cqe {
|
|
// prepare and submit recv
|
|
const sqe = try buf_grp.recv(user_data, fd, 0);
|
|
try testing.expect(sqe.flags.BUFFER_SELECT);
|
|
try testing.expect(sqe.buf_index == buf_grp.group_id);
|
|
try testing.expectEqual(@as(u32, 1), try ring.submit()); // submit
|
|
// get cqe, expect success
|
|
const cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(user_data, cqe.user_data);
|
|
try testing.expect(cqe.res >= 0); // success
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set
|
|
|
|
return cqe;
|
|
}
|
|
|
|
fn expect_buf_grp_cqe(
|
|
ring: *IoUring,
|
|
buf_grp: *BufferGroup,
|
|
user_data: u64,
|
|
expected: []const u8,
|
|
) !Cqe {
|
|
// get cqe
|
|
const cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(user_data, cqe.user_data);
|
|
try testing.expect(cqe.res >= 0); // success
|
|
try testing.expect(cqe.flags.F_BUFFER); // IORING_CQE_F_BUFFER flag is set
|
|
try testing.expectEqual(expected.len, @as(usize, @intCast(cqe.res)));
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
|
|
// get buffer from pool
|
|
const buffer_id = try cqe.buffer_id();
|
|
const len = @as(usize, @intCast(cqe.res));
|
|
const buf = buf_grp.get_by_id(buffer_id)[0..len];
|
|
try testing.expectEqualSlices(u8, expected, buf);
|
|
|
|
return cqe;
|
|
}
|
|
|
|
test "copy_cqes with wrapping sq.cqes buffer" {
|
|
if (!is_linux) return error.SkipZigTest;
|
|
|
|
var ring = IoUring.init(2, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
try testing.expectEqual(2, ring.sq.sqes.len);
|
|
try testing.expectEqual(4, ring.cq.cqes.len);
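// init(2) gives an SQ of 2 entries and, by default, a CQ twice that size.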
|
|
|
|
// submit 2 entries, receive 2 completions
|
|
var cqes: [8]Cqe = undefined;
|
|
{
|
|
for (0..2) |_| {
|
|
const sqe = try ring.get_sqe();
|
|
sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, 0);
|
|
try testing.expect(try ring.submit() == 1);
|
|
}
|
|
var cqe_count: u32 = 0;
|
|
while (cqe_count < 2) {
|
|
cqe_count += try ring.copy_cqes(&cqes, 2 - cqe_count);
|
|
}
|
|
}
|
|
|
|
try testing.expectEqual(2, ring.cq.head.*);
|
|
|
|
// cq.cqes len is 4, and we start at position 2
|
|
// every 4 submitted entries wrap the completion buffer
|
|
// we are reading ring.cq.cqes at indexes 2,3,0,1
|
|
for (1..1024) |i| {
|
|
for (0..4) |_| {
|
|
const sqe = try ring.get_sqe();
|
|
sqe.prep_timeout(&.{ .sec = 0, .nsec = 10000 }, 0, 0);
|
|
try testing.expect(try ring.submit() == 1);
|
|
}
|
|
var cqe_count: u32 = 0;
|
|
while (cqe_count < 4) {
|
|
cqe_count += try ring.copy_cqes(&cqes, 4 - cqe_count);
|
|
}
|
|
try testing.expectEqual(4, cqe_count);
|
|
try testing.expectEqual(2 + 4 * i, ring.cq.head.*);
|
|
}
|
|
}
|
|
|
|
test "bind/listen/connect" {
|
|
if (builtin.cpu.arch == .s390x) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/25956
|
|
|
|
var ring = IoUring.init(4, .{}) catch |err| switch (err) {
|
|
error.SystemOutdated => return error.SkipZigTest,
|
|
error.PermissionDenied => return error.SkipZigTest,
|
|
else => return err,
|
|
};
|
|
defer ring.deinit();
|
|
|
|
const probe = ring.get_probe() catch return error.SkipZigTest;
|
|
// LISTEN is the most recently added of the operations used here, so probe for it
|
|
if (!probe.is_supported(.LISTEN)) return error.SkipZigTest;
|
|
|
|
var addr: linux.sockaddr.in = .{
|
|
.port = 0,
|
|
.addr = @bitCast([4]u8{ 127, 0, 0, 1 }),
|
|
};
|
|
const proto: u32 = if (addr.family == linux.AF.UNIX) 0 else linux.IPPROTO.TCP;
|
|
|
|
const listen_fd = brk: {
|
|
// Create socket
|
|
_ = try ring.socket(1, addr.family, linux.SOCK.STREAM | linux.SOCK.CLOEXEC, proto, 0);
|
|
try testing.expectEqual(1, try ring.submit());
|
|
var cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(1, cqe.user_data);
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
const listen_fd: linux.fd_t = @intCast(cqe.res);
|
|
try testing.expect(listen_fd > 2);
|
|
|
|
// Prepare: set two socket options, bind, listen
|
|
var optval: u32 = 1;
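// link_next() chains each SQE to the one submitted after it, so the two setsockopt calls,
// bind and listen complete in order, as verified below.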
|
|
(try ring.setsockopt(2, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval))).link_next();
|
|
(try ring.setsockopt(3, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEPORT, mem.asBytes(&optval))).link_next();
|
|
(try ring.bind(4, listen_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in), 0)).link_next();
|
|
_ = try ring.listen(5, listen_fd, 1, 0);
|
|
// Submit 4 operations
|
|
try testing.expectEqual(4, try ring.submit());
|
|
// Expect all to succeed
|
|
for (2..6) |user_data| {
|
|
cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(user_data, cqe.user_data);
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
}
|
|
|
|
// Check that socket option is set
|
|
optval = 0;
|
|
_ = try ring.getsockopt(5, listen_fd, linux.SOL.SOCKET, linux.SO.REUSEADDR, mem.asBytes(&optval));
|
|
try testing.expectEqual(1, try ring.submit());
|
|
cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(5, cqe.user_data);
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
try testing.expectEqual(1, optval);
|
|
|
|
// Read system assigned port into addr
|
|
var addr_len: posix.socklen_t = @sizeOf(linux.sockaddr.in);
|
|
try posix.getsockname(listen_fd, addrAny(&addr), &addr_len);
|
|
|
|
break :brk listen_fd;
|
|
};
|
|
|
|
const connect_fd = brk: {
|
|
// Create connect socket
|
|
_ = try ring.socket(6, addr.family, linux.SOCK.STREAM | linux.SOCK.CLOEXEC, proto, 0);
|
|
try testing.expectEqual(1, try ring.submit());
|
|
const cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(6, cqe.user_data);
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
// Get connect socket fd
|
|
const connect_fd: linux.fd_t = @intCast(cqe.res);
|
|
try testing.expect(connect_fd > 2 and connect_fd != listen_fd);
|
|
break :brk connect_fd;
|
|
};
|
|
|
|
// Prepare accept/connect operations
|
|
_ = try ring.accept(7, listen_fd, null, null, 0);
|
|
_ = try ring.connect(8, connect_fd, addrAny(&addr), @sizeOf(linux.sockaddr.in));
|
|
try testing.expectEqual(2, try ring.submit());
|
|
// Get listener accepted socket
|
|
var accept_fd: posix.socket_t = 0;
|
|
for (0..2) |_| {
|
|
const cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
if (cqe.user_data == 7) {
|
|
accept_fd = @intCast(cqe.res);
|
|
} else {
|
|
try testing.expectEqual(8, cqe.user_data);
|
|
}
|
|
}
|
|
try testing.expect(accept_fd > 2 and accept_fd != listen_fd and accept_fd != connect_fd);
|
|
|
|
// Communicate
|
|
try testSendRecv(&ring, connect_fd, accept_fd);
|
|
try testSendRecv(&ring, accept_fd, connect_fd);
|
|
|
|
// Shutdown and close all sockets
|
|
for ([_]posix.socket_t{ connect_fd, accept_fd, listen_fd }) |fd| {
|
|
(try ring.shutdown(9, fd, posix.SHUT.RDWR)).link_next();
|
|
_ = try ring.close(10, fd);
|
|
try testing.expectEqual(2, try ring.submit());
|
|
for (0..2) |i| {
|
|
const cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
try testing.expectEqual(9 + i, cqe.user_data);
|
|
}
|
|
}
|
|
}
|
|
|
|
fn testSendRecv(ring: *IoUring, send_fd: posix.socket_t, recv_fd: posix.socket_t) !void {
|
|
const buffer_send = "0123456789abcdf" ** 10;
|
|
var buffer_recv: [buffer_send.len * 2]u8 = undefined;
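// The payload is sent twice; the receive loop below tolerates short reads and drains both copies into buffer_recv.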
|
|
|
|
// 2 sends
|
|
_ = try ring.send(1, send_fd, buffer_send, linux.MSG.WAITALL);
|
|
_ = try ring.send(2, send_fd, buffer_send, linux.MSG.WAITALL);
|
|
try testing.expectEqual(2, try ring.submit());
|
|
for (0..2) |i| {
|
|
const cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(1 + i, cqe.user_data);
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
try testing.expectEqual(buffer_send.len, @as(usize, @intCast(cqe.res)));
|
|
}
|
|
|
|
// receive
|
|
var recv_len: usize = 0;
|
|
while (recv_len < buffer_send.len * 2) {
|
|
_ = try ring.recv(3, recv_fd, .{ .buffer = buffer_recv[recv_len..] }, 0);
|
|
try testing.expectEqual(1, try ring.submit());
|
|
const cqe = try ring.copy_cqe();
|
|
try testing.expectEqual(3, cqe.user_data);
|
|
try testing.expectEqual(posix.E.SUCCESS, cqe.err());
|
|
recv_len += @intCast(cqe.res);
|
|
}
|
|
|
|
// inspect recv buffer
|
|
try testing.expectEqualSlices(u8, buffer_send, buffer_recv[0..buffer_send.len]);
|
|
try testing.expectEqualSlices(u8, buffer_send, buffer_recv[buffer_send.len..]);
|
|
}
|
|
|
|
fn addrAny(addr: *linux.sockaddr.in) *linux.sockaddr {
|
|
return @ptrCast(addr);
|
|
}
|