From 0005e7e70b96ecedee64b7259911ae857b8cf676 Mon Sep 17 00:00:00 2001 From: joadnacer Date: Fri, 27 Oct 2023 00:31:52 +0100 Subject: [PATCH 1/2] std.linux: Update io_uring structs and consts for kernel 6.3.8 --- lib/std/os/linux.zig | 343 +++++++++++++++++++++++++--------- lib/std/os/linux/io_uring.zig | 24 ++- 2 files changed, 267 insertions(+), 100 deletions(-) diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig index d2a6b5f55e..d1202dc2ed 100644 --- a/lib/std/os/linux.zig +++ b/lib/std/os/linux.zig @@ -3740,35 +3740,6 @@ else fields: siginfo_fields_union, }; -pub const io_uring_params = extern struct { - sq_entries: u32, - cq_entries: u32, - flags: u32, - sq_thread_cpu: u32, - sq_thread_idle: u32, - features: u32, - wq_fd: u32, - resv: [3]u32, - sq_off: io_sqring_offsets, - cq_off: io_cqring_offsets, -}; - -// io_uring_params.features flags - -pub const IORING_FEAT_SINGLE_MMAP = 1 << 0; -pub const IORING_FEAT_NODROP = 1 << 1; -pub const IORING_FEAT_SUBMIT_STABLE = 1 << 2; -pub const IORING_FEAT_RW_CUR_POS = 1 << 3; -pub const IORING_FEAT_CUR_PERSONALITY = 1 << 4; -pub const IORING_FEAT_FAST_POLL = 1 << 5; -pub const IORING_FEAT_POLL_32BITS = 1 << 6; -pub const IORING_FEAT_SQPOLL_NONFIXED = 1 << 7; -pub const IORING_FEAT_EXT_ARG = 1 << 8; -pub const IORING_FEAT_NATIVE_WORKERS = 1 << 9; -pub const IORING_FEAT_RSRC_TAGS = 1 << 10; -pub const IORING_FEAT_CQE_SKIP = 1 << 11; -pub const IORING_FEAT_LINKED_FILE = 1 << 12; - // io_uring_params.flags /// io_context is polled @@ -3812,53 +3783,15 @@ pub const IORING_SETUP_SQE128 = 1 << 10; /// CQEs are 32 byte pub const IORING_SETUP_CQE32 = 1 << 11; -pub const io_sqring_offsets = extern struct { - /// offset of ring head - head: u32, +/// Only one task is allowed to submit requests +pub const IORING_SETUP_SINGLE_ISSUER = 1 << 12; - /// offset of ring tail - tail: u32, - - /// ring mask value - ring_mask: u32, - - /// entries in ring - ring_entries: u32, - - /// ring flags - flags: u32, - - /// number of sqes not 
submitted - dropped: u32, - - /// sqe index array - array: u32, - - resv1: u32, - user_addr: u64, -}; - -// io_sqring_offsets.flags - -/// needs io_uring_enter wakeup -pub const IORING_SQ_NEED_WAKEUP = 1 << 0; -/// kernel has cqes waiting beyond the cq ring -pub const IORING_SQ_CQ_OVERFLOW = 1 << 1; -/// task should enter the kernel -pub const IORING_SQ_TASKRUN = 1 << 2; - -pub const io_cqring_offsets = extern struct { - head: u32, - tail: u32, - ring_mask: u32, - ring_entries: u32, - overflow: u32, - cqes: u32, - flags: u32, - resv: u32, - user_addr: u64, -}; +/// Defer running task work to get events. +/// Rather than running bits of task work whenever the task transitions +/// try to do it just before it is needed. +pub const IORING_SETUP_DEFER_TASKRUN = 1 << 13; +/// IO submission data structure (Submission Queue Entry) pub const io_uring_sqe = extern struct { opcode: IORING_OP, flags: u8, @@ -3872,9 +3805,18 @@ pub const io_uring_sqe = extern struct { buf_index: u16, personality: u16, splice_fd_in: i32, - __pad2: [2]u64, + addr3: u64, + resv: u64, }; +/// If sqe->file_index is set to this for opcodes that instantiate a new +/// direct descriptor (like openat/openat2/accept), then io_uring will allocate +/// an available direct descriptor instead of having the application pass one +/// in. The picked direct descriptor will be returned in cqe->res, or -ENFILE +/// if the space is full. +/// Available since Linux 5.19 +pub const IORING_FILE_INDEX_ALLOC = maxInt(u32); + pub const IOSQE_BIT = enum(u8) { FIXED_FILE, IO_DRAIN, @@ -3964,6 +3906,10 @@ pub const IORING_OP = enum(u8) { _, }; +// io_uring_sqe.uring_cmd_flags (rw_flags in the Zig struct) + +/// use registered buffer; pass this flag along with setting sqe->buf_index. 
+pub const IORING_URING_CMD_FIXED = 1 << 0; // io_uring_sqe.fsync_flags (rw_flags in the Zig struct) pub const IORING_FSYNC_DATASYNC = 1 << 0; @@ -3990,6 +3936,7 @@ pub const IORING_POLL_ADD_MULTI = 1 << 0; /// Update existing poll request, matching sqe->addr as the old user_data field. pub const IORING_POLL_UPDATE_EVENTS = 1 << 1; pub const IORING_POLL_UPDATE_USER_DATA = 1 << 2; +pub const IORING_POLL_ADD_LEVEL = 1 << 3; // ASYNC_CANCEL flags. @@ -3999,6 +3946,8 @@ pub const IORING_ASYNC_CANCEL_ALL = 1 << 0; pub const IORING_ASYNC_CANCEL_FD = 1 << 1; /// Match any request pub const IORING_ASYNC_CANCEL_ANY = 1 << 2; +/// 'fd' passed in is a fixed descriptor. Available since Linux 6.0 +pub const IORING_ASYNC_CANCEL_FD_FIXED = 1 << 3; // send/sendmsg and recv/recvmsg flags (sqe->ioprio) @@ -4007,10 +3956,32 @@ pub const IORING_ASYNC_CANCEL_ANY = 1 << 2; pub const IORING_RECVSEND_POLL_FIRST = 1 << 0; /// Multishot recv. Sets IORING_CQE_F_MORE if the handler will continue to report CQEs on behalf of the same SQE. pub const IORING_RECV_MULTISHOT = 1 << 1; +/// Use registered buffers, the index is stored in the buf_index field. +pub const IORING_RECVSEND_FIXED_BUF = 1 << 2; +/// If set, SEND[MSG]_ZC should report the zerocopy usage in cqe.res for the IORING_CQE_F_NOTIF cqe. +pub const IORING_SEND_ZC_REPORT_USAGE = 1 << 3; +/// cqe.res for IORING_CQE_F_NOTIF if IORING_SEND_ZC_REPORT_USAGE was requested +pub const IORING_NOTIF_USAGE_ZC_COPIED = 1 << 31; -/// accept flags stored in sqe->ioprio +/// accept flags stored in sqe->ioprio pub const IORING_ACCEPT_MULTISHOT = 1 << 0; +/// IORING_OP_MSG_RING command types, stored in sqe->addr +pub const IORING_MSG_RING_COMMAND = enum(u8) { + /// pass sqe->len as 'res' and off as user_data + DATA, + /// send a registered fd to another ring + SEND_FD, +}; + +// io_uring_sqe.msg_ring_flags (rw_flags in the Zig struct) + +/// Don't post a CQE to the target ring. Not applicable for IORING_MSG_DATA, obviously. 
+pub const IORING_MSG_RING_CQE_SKIP = 1 << 0; + +/// Pass through the flags from sqe->file_index (splice_fd_in in the zig struct) to cqe->flags +pub const IORING_MSG_RING_FLAGS_PASS = 1 << 1; + // IO completion data structure (Completion Queue Entry) pub const io_uring_cqe = extern struct { /// io_uring_sqe.data submission passed back @@ -4020,6 +3991,8 @@ pub const io_uring_cqe = extern struct { res: i32, flags: u32, + // Followed by 16 bytes of padding if initialized with IORING_SETUP_CQE32, doubling cqe size + pub fn err(self: io_uring_cqe) E { if (self.res > -4096 and self.res < 0) { return @as(E, @enumFromInt(-self.res)); @@ -4040,11 +4013,66 @@ pub const IORING_CQE_F_SOCK_NONEMPTY = 1 << 2; /// Set for notification CQEs. Can be used to distinct them from sends. pub const IORING_CQE_F_NOTIF = 1 << 3; +pub const IORING_CQE_BUFFER_SHIFT = 16; + /// Magic offsets for the application to mmap the data it needs pub const IORING_OFF_SQ_RING = 0; pub const IORING_OFF_CQ_RING = 0x8000000; pub const IORING_OFF_SQES = 0x10000000; +/// Filled with the offset for mmap(2) +pub const io_sqring_offsets = extern struct { + /// offset of ring head + head: u32, + + /// offset of ring tail + tail: u32, + + /// ring mask value + ring_mask: u32, + + /// entries in ring + ring_entries: u32, + + /// ring flags + flags: u32, + + /// number of sqes not submitted + dropped: u32, + + /// sqe index array + array: u32, + + resv1: u32, + resv2: u64, +}; + +// io_sqring_offsets.flags + +/// needs io_uring_enter wakeup +pub const IORING_SQ_NEED_WAKEUP = 1 << 0; +/// kernel has cqes waiting beyond the cq ring +pub const IORING_SQ_CQ_OVERFLOW = 1 << 1; +/// task should enter the kernel +pub const IORING_SQ_TASKRUN = 1 << 2; + +pub const io_cqring_offsets = extern struct { + head: u32, + tail: u32, + ring_mask: u32, + ring_entries: u32, + overflow: u32, + cqes: u32, + flags: u32, + resv: u32, + user_addr: u64, +}; + +// io_cqring_offsets.flags + +/// disable eventfd notifications +pub const 
IORING_CQ_EVENTFD_DISABLED = 1 << 0; + // io_uring_enter flags pub const IORING_ENTER_GETEVENTS = 1 << 0; pub const IORING_ENTER_SQ_WAKEUP = 1 << 1; @@ -4052,8 +4080,37 @@ pub const IORING_ENTER_SQ_WAIT = 1 << 2; pub const IORING_ENTER_EXT_ARG = 1 << 3; pub const IORING_ENTER_REGISTERED_RING = 1 << 4; +pub const io_uring_params = extern struct { + sq_entries: u32, + cq_entries: u32, + flags: u32, + sq_thread_cpu: u32, + sq_thread_idle: u32, + features: u32, + wq_fd: u32, + resv: [3]u32, + sq_off: io_sqring_offsets, + cq_off: io_cqring_offsets, +}; + +// io_uring_params.features flags + +pub const IORING_FEAT_SINGLE_MMAP = 1 << 0; +pub const IORING_FEAT_NODROP = 1 << 1; +pub const IORING_FEAT_SUBMIT_STABLE = 1 << 2; +pub const IORING_FEAT_RW_CUR_POS = 1 << 3; +pub const IORING_FEAT_CUR_PERSONALITY = 1 << 4; +pub const IORING_FEAT_FAST_POLL = 1 << 5; +pub const IORING_FEAT_POLL_32BITS = 1 << 6; +pub const IORING_FEAT_SQPOLL_NONFIXED = 1 << 7; +pub const IORING_FEAT_EXT_ARG = 1 << 8; +pub const IORING_FEAT_NATIVE_WORKERS = 1 << 9; +pub const IORING_FEAT_RSRC_TAGS = 1 << 10; +pub const IORING_FEAT_CQE_SKIP = 1 << 11; +pub const IORING_FEAT_LINKED_FILE = 1 << 12; + // io_uring_register opcodes and arguments -pub const IORING_REGISTER = enum(u8) { +pub const IORING_REGISTER = enum(u32) { REGISTER_BUFFERS, UNREGISTER_BUFFERS, REGISTER_FILES, @@ -4069,41 +4126,93 @@ pub const IORING_REGISTER = enum(u8) { REGISTER_ENABLE_RINGS, // extended with tagging - IORING_REGISTER_FILES2, - IORING_REGISTER_FILES_UPDATE2, - IORING_REGISTER_BUFFERS2, - IORING_REGISTER_BUFFERS_UPDATE, + REGISTER_FILES2, + REGISTER_FILES_UPDATE2, + REGISTER_BUFFERS2, + REGISTER_BUFFERS_UPDATE, // set/clear io-wq thread affinities - IORING_REGISTER_IOWQ_AFF, - IORING_UNREGISTER_IOWQ_AFF, + REGISTER_IOWQ_AFF, + UNREGISTER_IOWQ_AFF, // set/get max number of io-wq workers - IORING_REGISTER_IOWQ_MAX_WORKERS, + REGISTER_IOWQ_MAX_WORKERS, // register/unregister io_uring fd with the ring - 
IORING_REGISTER_RING_FDS, - IORING_UNREGISTER_RING_FDS, + REGISTER_RING_FDS, + UNREGISTER_RING_FDS, // register ring based provide buffer group - IORING_REGISTER_PBUF_RING, - IORING_UNREGISTER_PBUF_RING, + REGISTER_PBUF_RING, + UNREGISTER_PBUF_RING, // sync cancelation API - IORING_REGISTER_SYNC_CANCEL, + REGISTER_SYNC_CANCEL, // register a range of fixed file slots for automatic slot allocation - IORING_REGISTER_FILE_ALLOC_RANGE, + REGISTER_FILE_ALLOC_RANGE, + + // flag added to the opcode to use a registered ring fd + IORING_REGISTER_USE_REGISTERED_RING = 1 << 31, _, }; +/// io-wq worker categories +pub const IOWQ_CATEGORIES = enum(u8) { + BOUND, + UNBOUND, +}; + +/// deprecated, see struct io_uring_rsrc_update pub const io_uring_files_update = extern struct { offset: u32, resv: u32, fds: u64, }; +/// Register a fully sparse file space, rather than pass in an array of all -1 file descriptors. +pub const IORING_RSRC_REGISTER_SPARSE = 1 << 0; + +pub const io_uring_rsrc_register = extern struct { + nr: u32, + flags: u32, + resv2: u64, + data: u64, + tags: u64, +}; + +pub const io_uring_rsrc_update = extern struct { + offset: u32, + resv: u32, + data: u64, +}; + +pub const io_uring_rsrc_update2 = extern struct { + offset: u32, + resv: u32, + data: u64, + tags: u64, + nr: u32, + resv2: u32, +}; + +pub const io_uring_notification_slot = extern struct { + tag: u64, + resv: [3]u64, +}; + +pub const io_uring_notification_register = extern struct { + nr_slots: u32, + resv: u32, + resv2: u64, + data: u64, + resv3: u64, +}; + +/// Skip updating fd indexes set to this value in the fd table +pub const IORING_REGISTER_FILES_SKIP = -2; + pub const IO_URING_OP_SUPPORTED = 1 << 0; pub const io_uring_probe_op = extern struct { @@ -4131,7 +4240,7 @@ pub const io_uring_probe = extern struct { }; pub const io_uring_restriction = extern struct { - opcode: u16, + opcode: IORING_RESTRICTION, arg: extern union { /// IORING_RESTRICTION_REGISTER_OP register_op: 
IORING_REGISTER, @@ -4147,7 +4256,7 @@ pub const io_uring_restriction = extern struct { }; /// io_uring_restriction->opcode values -pub const IORING_RESTRICTION = enum(u8) { +pub const IORING_RESTRICTION = enum(u16) { /// Allow an io_uring_register(2) opcode REGISTER_OP = 0, @@ -4163,6 +4272,56 @@ pub const IORING_RESTRICTION = enum(u8) { _, }; +pub const io_uring_buf = extern struct { + addr: u64, + len: u32, + bid: u16, + resv: u16, +}; + +// io_uring_buf_ring struct omitted +// it's a io_uring_buf array with the resv of the first item used as a "tail" field. + +/// argument for IORING_(UN)REGISTER_PBUF_RING +pub const io_uring_buf_reg = extern struct { + ring_addr: u64, + ring_entries: u32, + bgid: u16, + pad: u16, + resv: [3]u64, +}; + +pub const io_uring_getevents_arg = extern struct { + sigmask: u64, + sigmask_sz: u32, + pad: u32, + ts: u64, +}; + +/// Argument for IORING_REGISTER_SYNC_CANCEL +pub const io_uring_sync_cancel_reg = extern struct { + addr: u64, + fd: i32, + flags: u32, + timeout: kernel_timespec, + pad: [4]u64, +}; + +/// Argument for IORING_REGISTER_FILE_ALLOC_RANGE +/// The range is specified as [off, off + len) +pub const io_uring_file_index_range = extern struct { + off: u32, + len: u32, + resv: u64, +}; + +pub const io_uring_recvmsg_out = extern struct { + namelen: u32, + controllen: u32, + payloadlen: u32, + flags: u32, +}; + pub const utsname = extern struct { sysname: [64:0]u8, nodename: [64:0]u8, diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index 915036d962..f9dcbd6d8d 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -1210,7 +1210,8 @@ pub fn io_uring_prep_nop(sqe: *linux.io_uring_sqe) void { .buf_index = 0, .personality = 0, .splice_fd_in = 0, - .__pad2 = [2]u64{ 0, 0 }, + .addr3 = 0, + .resv = 0, }; } @@ -1228,7 +1229,8 @@ pub fn io_uring_prep_fsync(sqe: *linux.io_uring_sqe, fd: os.fd_t, flags: u32) vo .buf_index = 0, .personality = 0, .splice_fd_in = 0, - .__pad2 = 
[2]u64{ 0, 0 }, + .addr3 = 0, + .resv = 0, }; } @@ -1253,7 +1255,8 @@ pub fn io_uring_prep_rw( .buf_index = 0, .personality = 0, .splice_fd_in = 0, - .__pad2 = [2]u64{ 0, 0 }, + .addr3 = 0, + .resv = 0, }; } @@ -1397,7 +1400,8 @@ pub fn io_uring_prep_close(sqe: *linux.io_uring_sqe, fd: os.fd_t) void { .buf_index = 0, .personality = 0, .splice_fd_in = 0, - .__pad2 = [2]u64{ 0, 0 }, + .addr3 = 0, + .resv = 0, }; } @@ -1425,7 +1429,8 @@ pub fn io_uring_prep_timeout_remove(sqe: *linux.io_uring_sqe, timeout_user_data: .buf_index = 0, .personality = 0, .splice_fd_in = 0, - .__pad2 = [2]u64{ 0, 0 }, + .addr3 = 0, + .resv = 0, }; } @@ -1485,7 +1490,8 @@ pub fn io_uring_prep_fallocate( .buf_index = 0, .personality = 0, .splice_fd_in = 0, - .__pad2 = [2]u64{ 0, 0 }, + .addr3 = 0, + .resv = 0, }; } @@ -1657,7 +1663,8 @@ test "nop" { .buf_index = 0, .personality = 0, .splice_fd_in = 0, - .__pad2 = [2]u64{ 0, 0 }, + .addr3 = 0, + .resv = 0, }, sqe.*); try testing.expectEqual(@as(u32, 0), ring.sq.sqe_head); @@ -2028,7 +2035,8 @@ test "openat" { .buf_index = 0, .personality = 0, .splice_fd_in = 0, - .__pad2 = [2]u64{ 0, 0 }, + .addr3 = 0, + .resv = 0, }, sqe_openat.*); try testing.expectEqual(@as(u32, 1), try ring.submit()); From 28b848e3f09f2774a06863438ffff59a8e67f780 Mon Sep 17 00:00:00 2001 From: joadnacer Date: Fri, 27 Oct 2023 00:31:55 +0100 Subject: [PATCH 2/2] std.io_uring: Improve splice implementation --- lib/std/os/linux/io_uring.zig | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/std/os/linux/io_uring.zig b/lib/std/os/linux/io_uring.zig index f9dcbd6d8d..08d34e2fff 100644 --- a/lib/std/os/linux/io_uring.zig +++ b/lib/std/os/linux/io_uring.zig @@ -415,10 +415,10 @@ pub const IO_Uring = struct { /// Queues (but does not submit) an SQE to perform a `splice(2)` /// Either `fd_in` or `fd_out` must be a pipe. - /// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to -1. 
- /// If `fd_in` does not refer to a pipe and `off_in` is -1, then `len` are read + /// If `fd_in` refers to a pipe, `off_in` is ignored and must be set to std.math.maxInt(u64). + /// If `fd_in` does not refer to a pipe and `off_in` is maxInt(u64), then `len` bytes are read /// from `fd_in` starting from the file offset, which is incremented by the number of bytes read. - /// If `fd_in` does not refer to a pipe and `off_in` is not -1, then the starting offset of `fd_in` will be `off_in`. + /// If `fd_in` does not refer to a pipe and `off_in` is not maxInt(u64), then the starting offset of `fd_in` will be `off_in`. /// This splice operation can be used to implement sendfile by splicing to an intermediate pipe first, /// then splice to the final destination. In fact, the implementation of sendfile in kernel uses splice internally. /// /// Returns a pointer to the SQE so that you can further modify the SQE for advanced use cases. 
- pub fn splice(self: *IO_Uring, user_data: u64, fd_in: os.fd_t, off_in: i64, fd_out: os.fd_t, off_out: i64, len: usize) !*linux.io_uring_sqe { + pub fn splice(self: *IO_Uring, user_data: u64, fd_in: os.fd_t, off_in: u64, fd_out: os.fd_t, off_out: u64, len: usize) !*linux.io_uring_sqe { const sqe = try self.get_sqe(); io_uring_prep_splice(sqe, fd_in, off_in, fd_out, off_out, len); sqe.user_data = user_data; @@ -1268,9 +1268,9 @@ pub fn io_uring_prep_write(sqe: *linux.io_uring_sqe, fd: os.fd_t, buffer: []cons io_uring_prep_rw(.WRITE, sqe, fd, @intFromPtr(buffer.ptr), buffer.len, offset); } -pub fn io_uring_prep_splice(sqe: *linux.io_uring_sqe, fd_in: os.fd_t, off_in: i64, fd_out: os.fd_t, off_out: i64, len: usize) void { - io_uring_prep_rw(.SPLICE, sqe, fd_out, undefined, len, @bitCast(off_out)); - sqe.addr = @bitCast(off_in); +pub fn io_uring_prep_splice(sqe: *linux.io_uring_sqe, fd_in: os.fd_t, off_in: u64, fd_out: os.fd_t, off_out: u64, len: usize) void { + io_uring_prep_rw(.SPLICE, sqe, fd_out, undefined, len, off_out); + sqe.addr = off_in; sqe.splice_fd_in = fd_in; } @@ -1888,17 +1888,17 @@ test "splice/read" { _ = try file_src.write(&buffer_write); var fds = try os.pipe(); - const pipe_offset: i64 = -1; + const pipe_offset: u64 = std.math.maxInt(u64); const sqe_splice_to_pipe = try ring.splice(0x11111111, fd_src, 0, fds[1], pipe_offset, buffer_write.len); try testing.expectEqual(linux.IORING_OP.SPLICE, sqe_splice_to_pipe.opcode); try testing.expectEqual(@as(u64, 0), sqe_splice_to_pipe.addr); - try testing.expectEqual(@as(u64, @bitCast((pipe_offset))), sqe_splice_to_pipe.off); + try testing.expectEqual(pipe_offset, sqe_splice_to_pipe.off); sqe_splice_to_pipe.flags |= linux.IOSQE_IO_LINK; const sqe_splice_from_pipe = try ring.splice(0x22222222, fds[0], pipe_offset, fd_dst, 10, buffer_write.len); try testing.expectEqual(linux.IORING_OP.SPLICE, sqe_splice_from_pipe.opcode); - try testing.expectEqual(@as(u64, @bitCast(pipe_offset)), sqe_splice_from_pipe.addr); + 
try testing.expectEqual(pipe_offset, sqe_splice_from_pipe.addr); try testing.expectEqual(@as(u64, 10), sqe_splice_from_pipe.off); sqe_splice_from_pipe.flags |= linux.IOSQE_IO_LINK;