std.Io.Threaded: don't solve the cancel race after all

Unfortunately, trying again until the cancellation request is
acknowledged has been observed to incur a large amount of overhead,
and usually strong cancellation guarantees are not needed, so the
race condition is not handled here. Users who want to avoid this
have this menu of options instead:
* Use no libc, in which case Zig std lib can avoid the race (tracking
  issue: https://codeberg.org/ziglang/zig/issues/30049)
* Use musl libc
* Use `std.Io.Evented`. But this is not implemented yet. Tracked by
  - https://codeberg.org/ziglang/zig/issues/30050
  - https://codeberg.org/ziglang/zig/issues/30051

glibc + threaded is the only problematic combination.
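
The race being left unsolved here, in miniature: the worker checks a cancel flag and then enters a blocking syscall, and a signal that lands between the check and the syscall instruction runs the handler but interrupts nothing. A minimal sketch of that window (hypothetical names, not the actual std.Io.Threaded code):

    const std = @import("std");
    const posix = std.posix;

    /// Flipped by the cancelling thread before it sends the signal.
    var cancel_requested: bool = false;

    fn workerStep(fd: posix.fd_t, buf: []u8) !usize {
        if (@atomicLoad(bool, &cancel_requested, .monotonic)) return error.Canceled;
        // A signal delivered exactly here is consumed by the handler, but
        // read() has not started yet, so it still blocks; the cancellation
        // goes unnoticed until something else interrupts the syscall.
        return posix.read(fd, buf);
    }
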
Andrew Kelley 2025-11-29 08:43:57 -08:00
parent 144206856e
commit de87bad4c3


@@ -201,7 +201,7 @@ const Closure = struct {
     const Start = *const fn (*Closure, *Threaded) void;
 
     fn requestCancel(closure: *Closure, t: *Threaded) void {
-        var signal_id = switch (@atomicRmw(CancelStatus, &closure.cancel_status, .Xchg, .requested, .monotonic).unpack()) {
+        const signal_id = switch (@atomicRmw(CancelStatus, &closure.cancel_status, .Xchg, .requested, .monotonic).unpack()) {
            .none, .acknowledged, .requested => return,
            .signal_id => |signal_id| signal_id,
        };
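
For orientation, `cancel_status` is a single atomic word that either holds a sentinel state or the id of the thread to signal; `requestCancel` swaps in `.requested` and branches on whatever was there before. A rough guess at the shape of `CancelStatus` (illustrative only; the real layout is not shown in this diff):

    const CancelStatus = enum(u32) {
        none = 0,
        requested = 1,
        acknowledged = 2,
        _, // any other value encodes the id of the thread to signal

        const Unpacked = union(enum) { none, requested, acknowledged, signal_id: u32 };

        fn unpack(s: CancelStatus) Unpacked {
            return switch (s) {
                .none => .none,
                .requested => .requested,
                .acknowledged => .acknowledged,
                else => .{ .signal_id = @intFromEnum(s) },
            };
        }
    };
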
@@ -214,54 +214,32 @@ const Closure = struct {
         // The task will enter a blocking syscall before checking for cancellation again.
         // We can send a signal to interrupt the syscall, but if it arrives before
-        // the syscall instruction, it will be missed. Therefore, this code tries
-        // again until the cancellation request is acknowledged.
-        // 1 << 10 ns is about 1 microsecond, approximately syscall overhead.
-        // 1 << 20 ns is about 1 millisecond.
-        // 1 << 30 ns is about 1 second.
-        //
-        // On a heavily loaded Linux 6.17.5, I observed a maximum of 20
-        // attempts not acknowledged before the timeout (including exponential
-        // backoff) was sufficient, despite the heavy load.
-        //
-        // The time wasted here sleeping is mitigated by the fact that, later
-        // on, the system will likely wait for the canceled task, causing it
-        // to indefinitely yield until the canceled task finishes, and the
-        // task must acknowledge the cancel before it proceeds to that point.
-        const max_attempts = 22;
-
-        for (0..max_attempts) |attempt_index| {
-            if (std.Thread.use_pthreads) {
-                if (std.c.pthread_kill(signal_id, .IO) != 0) return;
-            } else if (native_os == .linux) {
-                const pid: posix.pid_t = p: {
-                    const cached_pid = @atomicLoad(Pid, &t.pid, .monotonic);
-                    if (cached_pid != .unknown) break :p @intFromEnum(cached_pid);
-                    const pid = std.os.linux.getpid();
-                    @atomicStore(Pid, &t.pid, @enumFromInt(pid), .monotonic);
-                    break :p pid;
-                };
-                if (std.os.linux.tgkill(pid, @bitCast(signal_id), .IO) != 0) return;
-            } else {
-                return;
-            }
-            var timespec: posix.timespec = .{
-                .sec = 0,
-                .nsec = @as(isize, 1) << @intCast(attempt_index),
-            };
-            if (native_os == .linux) {
-                _ = std.os.linux.clock_nanosleep(posix.CLOCK.MONOTONIC, .{ .ABSTIME = false }, &timespec, &timespec);
-            } else {
-                _ = posix.system.nanosleep(&timespec, &timespec);
-            }
-            switch (@atomicRmw(CancelStatus, &closure.cancel_status, .Xchg, .requested, .monotonic).unpack()) {
-                .requested => continue, // Retry needed in case other thread hasn't yet entered the syscall.
-                .none, .acknowledged => return,
-                .signal_id => |new_signal_id| signal_id = new_signal_id,
-            }
-        }
+        // the syscall instruction, it will be missed.
+        //
+        // Unfortunately, trying again until the cancellation request is
+        // acknowledged has been observed to incur a large amount of overhead,
+        // and usually strong cancellation guarantees are not needed, so the
+        // race condition is not handled here. Users who want to avoid this
+        // have this menu of options instead:
+        // * Use no libc, in which case Zig std lib can avoid the race (tracking
+        //   issue: https://codeberg.org/ziglang/zig/issues/30049)
+        // * Use musl libc instead of glibc
+        // * Use `std.Io.Evented`. But this is not implemented yet. Tracked by
+        //   - https://codeberg.org/ziglang/zig/issues/30050
+        //   - https://codeberg.org/ziglang/zig/issues/30051
+        if (std.Thread.use_pthreads) {
+            if (std.c.pthread_kill(signal_id, .IO) != 0) return;
+        } else if (native_os == .linux) {
+            const pid: posix.pid_t = p: {
+                const cached_pid = @atomicLoad(Pid, &t.pid, .monotonic);
+                if (cached_pid != .unknown) break :p @intFromEnum(cached_pid);
+                const pid = std.os.linux.getpid();
+                @atomicStore(Pid, &t.pid, @enumFromInt(pid), .monotonic);
+                break :p pid;
+            };
+            if (std.os.linux.tgkill(pid, @bitCast(signal_id), .IO) != 0) return;
+        } else {
+            return;
+        }
     }
 };
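
For a sense of the overhead being deleted: the removed loop slept 1 << attempt_index nanoseconds between attempts, so across all 22 attempts a single cancellation could burn (1 << 22) - 1 ns, about 4.2 ms, plus one signal syscall per attempt. A standalone back-of-the-envelope check:

    const std = @import("std");

    pub fn main() void {
        var total_ns: u64 = 0;
        for (0..22) |attempt_index| {
            total_ns += @as(u64, 1) << @intCast(attempt_index);
        }
        // Prints 4194303 ns, i.e. roughly 4.2 ms of worst-case sleeping.
        std.debug.print("worst-case sleep: {d} ns\n", .{total_ns});
    }
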
@@ -303,7 +281,7 @@ pub fn init(
        .mask = posix.sigemptyset(),
        .flags = 0,
    };
-   if (have_sig_io) posix.sigaction(.IO, &act, &t.old_sig_io);
+   if (!is_musl and have_sig_io) posix.sigaction(.IO, &act, &t.old_sig_io);
    if (have_sig_pipe) posix.sigaction(.PIPE, &act, &t.old_sig_pipe);
    t.have_signal_handler = true;
 }
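
The install half of this handler dance follows a common pattern: register a do-nothing handler without SA.RESTART so that delivering the signal makes blocked syscalls fail with EINTR, and stash the previous action so deinit can restore it. A sketch using the same std.posix calls that appear in the diff (the Sigaction field layout is an assumption, and exact signatures vary across Zig versions):

    const std = @import("std");
    const posix = std.posix;

    var old_sig_io: posix.Sigaction = undefined;

    fn handleIo(_: i32) callconv(.c) void {
        // Intentionally empty: the handler only exists so that delivery of
        // the signal interrupts a blocking syscall with EINTR.
    }

    fn installHandler() void {
        const act: posix.Sigaction = .{
            .handler = .{ .handler = handleIo }, // assumed field layout
            .mask = posix.sigemptyset(),
            .flags = 0, // no SA.RESTART: interrupted syscalls report EINTR
        };
        posix.sigaction(.IO, &act, &old_sig_io); // saved for restore in deinit
    }
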
@@ -341,7 +319,7 @@ pub fn deinit(t: *Threaded) void {
        if (ws2_32.WSACleanup() != 0) recoverableOsBugDetected();
    }
    if (posix.Sigaction != void and t.have_signal_handler) {
-       if (have_sig_io) posix.sigaction(.IO, &t.old_sig_io, null);
+       if (!is_musl and have_sig_io) posix.sigaction(.IO, &t.old_sig_io, null);
        if (have_sig_pipe) posix.sigaction(.PIPE, &t.old_sig_pipe, null);
    }
    t.* = undefined;