From de87bad4c3240ab34ee81023e6df1ba94d6d5e6a Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Sat, 29 Nov 2025 08:43:57 -0800 Subject: [PATCH] std.Io.Threaded: don't solve the cancel race after all Unfortunately, trying again until the cancellation request is acknowledged has been observed to incur a large amount of overhead, and usually strong cancellation guarantees are not needed, so the race condition is not handled here. Users who want to avoid this have this menu of options instead: * Use no libc, in which case Zig std lib can avoid the race (tracking issue: https://codeberg.org/ziglang/zig/issues/30049) * Use musl libc * Use `std.Io.Evented`. But this is not implemented yet. Tracked by - https://codeberg.org/ziglang/zig/issues/30050 - https://codeberg.org/ziglang/zig/issues/30051 glibc + threaded is the only problematic combination. --- lib/std/Io/Threaded.zig | 76 +++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 49 deletions(-) diff --git a/lib/std/Io/Threaded.zig b/lib/std/Io/Threaded.zig index 03670a9eb7..f285a51d6c 100644 --- a/lib/std/Io/Threaded.zig +++ b/lib/std/Io/Threaded.zig @@ -201,7 +201,7 @@ const Closure = struct { const Start = *const fn (*Closure, *Threaded) void; fn requestCancel(closure: *Closure, t: *Threaded) void { - var signal_id = switch (@atomicRmw(CancelStatus, &closure.cancel_status, .Xchg, .requested, .monotonic).unpack()) { + const signal_id = switch (@atomicRmw(CancelStatus, &closure.cancel_status, .Xchg, .requested, .monotonic).unpack()) { .none, .acknowledged, .requested => return, .signal_id => |signal_id| signal_id, }; @@ -214,54 +214,32 @@ const Closure = struct { // The task will enter a blocking syscall before checking for cancellation again. // We can send a signal to interrupt the syscall, but if it arrives before - // the syscall instruction, it will be missed. Therefore, this code tries - // again until the cancellation request is acknowledged. 
- - // 1 << 10 ns is about 1 microsecond, approximately syscall overhead. - // 1 << 20 ns is about 1 millisecond. - // 1 << 30 ns is about 1 second. + // the syscall instruction, it will be missed. // - // On a heavily loaded Linux 6.17.5, I observed a maximum of 20 - // attempts not acknowledged before the timeout (including exponential - // backoff) was sufficient, despite the heavy load. - // - // The time wasted here sleeping is mitigated by the fact that, later - // on, the system will likely wait for the canceled task, causing it - // to indefinitely yield until the canceled task finishes, and the - // task must acknowledge the cancel before it proceeds to that point. - const max_attempts = 22; - - for (0..max_attempts) |attempt_index| { - if (std.Thread.use_pthreads) { - if (std.c.pthread_kill(signal_id, .IO) != 0) return; - } else if (native_os == .linux) { - const pid: posix.pid_t = p: { - const cached_pid = @atomicLoad(Pid, &t.pid, .monotonic); - if (cached_pid != .unknown) break :p @intFromEnum(cached_pid); - const pid = std.os.linux.getpid(); - @atomicStore(Pid, &t.pid, @enumFromInt(pid), .monotonic); - break :p pid; - }; - if (std.os.linux.tgkill(pid, @bitCast(signal_id), .IO) != 0) return; - } else { - return; - } - - var timespec: posix.timespec = .{ - .sec = 0, - .nsec = @as(isize, 1) << @intCast(attempt_index), + // Unfortunately, trying again until the cancellation request is + // acknowledged has been observed to incur a large amount of overhead, + // and usually strong cancellation guarantees are not needed, so the + // race condition is not handled here. Users who want to avoid this + // have this menu of options instead: + // * Use no libc, in which case Zig std lib can avoid the race (tracking + // issue: https://codeberg.org/ziglang/zig/issues/30049) + // * Use musl libc instead of glibc + // * Use `std.Io.Evented`. But this is not implemented yet. 
Tracked by + // - https://codeberg.org/ziglang/zig/issues/30050 + // - https://codeberg.org/ziglang/zig/issues/30051 + if (std.Thread.use_pthreads) { + if (std.c.pthread_kill(signal_id, .IO) != 0) return; + } else if (native_os == .linux) { + const pid: posix.pid_t = p: { + const cached_pid = @atomicLoad(Pid, &t.pid, .monotonic); + if (cached_pid != .unknown) break :p @intFromEnum(cached_pid); + const pid = std.os.linux.getpid(); + @atomicStore(Pid, &t.pid, @enumFromInt(pid), .monotonic); + break :p pid; }; - if (native_os == .linux) { - _ = std.os.linux.clock_nanosleep(posix.CLOCK.MONOTONIC, .{ .ABSTIME = false }, &timespec, &timespec); - } else { - _ = posix.system.nanosleep(&timespec, &timespec); - } - - switch (@atomicRmw(CancelStatus, &closure.cancel_status, .Xchg, .requested, .monotonic).unpack()) { - .requested => continue, // Retry needed in case other thread hasn't yet entered the syscall. - .none, .acknowledged => return, - .signal_id => |new_signal_id| signal_id = new_signal_id, - } + if (std.os.linux.tgkill(pid, @bitCast(signal_id), .IO) != 0) return; + } else { + return; } } }; @@ -303,7 +281,7 @@ pub fn init( .mask = posix.sigemptyset(), .flags = 0, }; - if (have_sig_io) posix.sigaction(.IO, &act, &t.old_sig_io); + if (!is_musl and have_sig_io) posix.sigaction(.IO, &act, &t.old_sig_io); if (have_sig_pipe) posix.sigaction(.PIPE, &act, &t.old_sig_pipe); t.have_signal_handler = true; } @@ -341,7 +319,7 @@ pub fn deinit(t: *Threaded) void { if (ws2_32.WSACleanup() != 0) recoverableOsBugDetected(); } if (posix.Sigaction != void and t.have_signal_handler) { - if (have_sig_io) posix.sigaction(.IO, &t.old_sig_io, null); + if (!is_musl and have_sig_io) posix.sigaction(.IO, &t.old_sig_io, null); if (have_sig_pipe) posix.sigaction(.PIPE, &t.old_sig_pipe, null); } t.* = undefined;