std.fs.File.Reader: fix sendFile logic

It wasn't accounting for both writer and reader buffering.
This commit is contained in:
Andrew Kelley 2025-07-21 20:00:45 -07:00
parent b35c55e237
commit 96cbdd145d
3 changed files with 128 additions and 299 deletions

View file

@ -10497,9 +10497,9 @@ pub const sysconf = switch (native_os) {
pub const sf_hdtr = switch (native_os) {
.freebsd, .macos, .ios, .tvos, .watchos, .visionos => extern struct {
headers: [*]const iovec_const,
headers: ?[*]const iovec_const,
hdr_cnt: c_int,
trailers: [*]const iovec_const,
trailers: ?[*]const iovec_const,
trl_cnt: c_int,
},
else => void,

View file

@ -1435,7 +1435,7 @@ pub const Reader = struct {
}
return 0;
};
const n = @min(size - pos, std.math.maxInt(i64), @intFromEnum(limit));
const n = @min(size - pos, maxInt(i64), @intFromEnum(limit));
file.seekBy(n) catch |err| {
r.seek_err = err;
return 0;
@ -1726,18 +1726,123 @@ pub const Writer = struct {
file_reader: *Reader,
limit: std.io.Limit,
) std.io.Writer.FileError!usize {
const reader_buffered = file_reader.interface.buffered();
if (reader_buffered.len >= @intFromEnum(limit))
return sendFileBuffered(io_w, file_reader, reader_buffered);
const writer_buffered = io_w.buffered();
const file_limit = @intFromEnum(limit) - reader_buffered.len;
const w: *Writer = @alignCast(@fieldParentPtr("interface", io_w));
const out_fd = w.file.handle;
const in_fd = file_reader.file.handle;
// TODO try using copy_file_range on FreeBSD
// TODO try using sendfile on macOS
// TODO try using sendfile on FreeBSD
if (native_os == .freebsd and w.mode == .streaming) sf: {
// Try using sendfile on FreeBSD.
if (w.sendfile_err != null) break :sf;
const offset = std.math.cast(std.c.off_t, file_reader.pos) orelse break :sf;
var hdtr_data: std.c.sf_hdtr = undefined;
var headers: [2]posix.iovec_const = undefined;
var headers_i: u8 = 0;
if (writer_buffered.len != 0) {
headers[headers_i] = .{ .base = writer_buffered.ptr, .len = writer_buffered.len };
headers_i += 1;
}
if (reader_buffered.len != 0) {
headers[headers_i] = .{ .base = reader_buffered.ptr, .len = reader_buffered.len };
headers_i += 1;
}
const hdtr: ?*std.c.sf_hdtr = if (headers_i == 0) null else b: {
hdtr_data = .{
.headers = &headers,
.hdr_cnt = headers_i,
.trailers = null,
.trl_cnt = 0,
};
break :b &hdtr_data;
};
var sbytes: std.c.off_t = undefined;
const nbytes: usize = @min(file_limit, maxInt(usize));
const flags = 0;
switch (posix.errno(std.c.sendfile(in_fd, out_fd, offset, nbytes, hdtr, &sbytes, flags))) {
.SUCCESS, .INTR => {},
.INVAL, .OPNOTSUPP, .NOTSOCK, .NOSYS => w.sendfile_err = error.UnsupportedOperation,
.BADF => if (builtin.mode == .Debug) @panic("race condition") else {
w.sendfile_err = error.Unexpected;
},
.FAULT => if (builtin.mode == .Debug) @panic("segmentation fault") else {
w.sendfile_err = error.Unexpected;
},
.NOTCONN => w.sendfile_err = error.BrokenPipe,
.AGAIN, .BUSY => if (sbytes == 0) {
w.sendfile_err = error.WouldBlock;
},
.IO => w.sendfile_err = error.InputOutput,
.PIPE => w.sendfile_err = error.BrokenPipe,
.NOBUFS => w.sendfile_err = error.SystemResources,
else => |err| w.sendfile_err = posix.unexpectedErrno(err),
}
const consumed = io_w.consume(@bitCast(sbytes));
file_reader.seekTo(file_reader.pos + consumed) catch return error.ReadFailed;
return consumed;
}
if (native_os.isDarwin() and w.mode == .streaming) sf: {
// Try using sendfile on macOS.
if (w.sendfile_err != null) break :sf;
const offset = std.math.cast(std.c.off_t, file_reader.pos) orelse break :sf;
var hdtr_data: std.c.sf_hdtr = undefined;
var headers: [2]posix.iovec_const = undefined;
var headers_i: u8 = 0;
if (writer_buffered.len != 0) {
headers[headers_i] = .{ .base = writer_buffered.ptr, .len = writer_buffered.len };
headers_i += 1;
}
if (reader_buffered.len != 0) {
headers[headers_i] = .{ .base = reader_buffered.ptr, .len = reader_buffered.len };
headers_i += 1;
}
const hdtr: ?*std.c.sf_hdtr = if (headers_i == 0) null else b: {
hdtr_data = .{
.headers = &headers,
.hdr_cnt = headers_i,
.trailers = null,
.trl_cnt = 0,
};
break :b &hdtr_data;
};
const max_count = maxInt(i32); // Avoid EINVAL.
var sbytes: std.c.off_t = @min(file_limit, max_count);
const flags = 0;
switch (posix.errno(std.c.sendfile(in_fd, out_fd, offset, &sbytes, hdtr, flags))) {
.SUCCESS, .INTR => {},
.OPNOTSUPP, .NOTSOCK, .NOSYS => w.sendfile_err = error.UnsupportedOperation,
.BADF => if (builtin.mode == .Debug) @panic("race condition") else {
w.sendfile_err = error.Unexpected;
},
.FAULT => if (builtin.mode == .Debug) @panic("segmentation fault") else {
w.sendfile_err = error.Unexpected;
},
.INVAL => if (builtin.mode == .Debug) @panic("invalid API usage") else {
w.sendfile_err = error.Unexpected;
},
.NOTCONN => w.sendfile_err = error.BrokenPipe,
.AGAIN => if (sbytes == 0) {
w.sendfile_err = error.WouldBlock;
},
.IO => w.sendfile_err = error.InputOutput,
.PIPE => w.sendfile_err = error.BrokenPipe,
else => |err| w.sendfile_err = posix.unexpectedErrno(err),
}
const consumed = io_w.consume(@bitCast(sbytes));
file_reader.seekTo(file_reader.pos + consumed) catch return error.ReadFailed;
return consumed;
}
if (native_os == .linux and w.mode == .streaming) sf: {
// Try using sendfile on Linux.
if (w.sendfile_err != null) break :sf;
// Linux sendfile does not support headers.
const buffered = limit.slice(file_reader.interface.buffer);
if (io_w.end != 0 or buffered.len != 0) return drain(io_w, &.{buffered}, 1);
if (writer_buffered.len != 0 or reader_buffered.len != 0)
return sendFileBuffered(io_w, file_reader, reader_buffered);
const max_count = 0x7ffff000; // Avoid EINVAL.
var off: std.os.linux.off_t = undefined;
const off_ptr: ?*std.os.linux.off_t, const count: usize = switch (file_reader.mode) {
@ -1784,6 +1889,7 @@ pub const Writer = struct {
w.pos += n;
return n;
}
const copy_file_range = switch (native_os) {
.freebsd => std.os.freebsd.copy_file_range,
.linux => if (std.c.versionCheck(.{ .major = 2, .minor = 27, .patch = 0 })) std.os.linux.wrapped.copy_file_range else {},
@ -1791,8 +1897,8 @@ pub const Writer = struct {
};
if (@TypeOf(copy_file_range) != void) cfr: {
if (w.copy_file_range_err != null) break :cfr;
const buffered = limit.slice(file_reader.interface.buffer);
if (io_w.end != 0 or buffered.len != 0) return drain(io_w, &.{buffered}, 1);
if (writer_buffered.len != 0 or reader_buffered.len != 0)
return sendFileBuffered(io_w, file_reader, reader_buffered);
var off_in: i64 = undefined;
var off_out: i64 = undefined;
const off_in_ptr: ?*i64 = switch (file_reader.mode) {
@ -1832,6 +1938,8 @@ pub const Writer = struct {
if (w.pos != 0) break :fcf;
if (limit != .unlimited) break :fcf;
const size = file_reader.getSize() catch break :fcf;
if (writer_buffered.len != 0 or reader_buffered.len != 0)
return sendFileBuffered(io_w, file_reader, reader_buffered);
const rc = std.c.fcopyfile(in_fd, out_fd, null, .{ .DATA = true });
switch (posix.errno(rc)) {
.SUCCESS => {},
@ -1860,6 +1968,16 @@ pub const Writer = struct {
return error.Unimplemented;
}
/// Handles the case where buffered bytes must go out before any direct
/// file-to-file transfer can be attempted.
///
/// Passes `reader_buffered` to `drain` as the single data element (splat of
/// 1), then moves `file_reader` forward by the byte count `drain` reported.
/// A failure while seeking the reader is surfaced as `error.ReadFailed`.
///
/// Returns the byte count reported by `drain`.
fn sendFileBuffered(
    io_w: *std.io.Writer,
    file_reader: *Reader,
    reader_buffered: []const u8,
) std.io.Writer.FileError!usize {
    const data: [1][]const u8 = .{reader_buffered};
    const bytes_drained = try drain(io_w, &data, 1);
    // Keep the reader's position in step with what was handed to the writer.
    const new_pos = file_reader.pos + bytes_drained;
    file_reader.seekTo(new_pos) catch return error.ReadFailed;
    return bytes_drained;
}
pub fn seekTo(w: *Writer, offset: u64) SeekError!void {
switch (w.mode) {
.positional, .positional_reading => {

View file

@ -6326,295 +6326,6 @@ pub fn send(
};
}
pub const SendFileError = PReadError || WriteError || SendError;
/// Transfer data between file descriptors, with optional headers and trailers.
///
/// Returns the number of bytes written, which can be zero.
///
/// The `sendfile` call copies `in_len` bytes from one file descriptor to another. When possible,
/// this is done within the operating system kernel, which can provide better performance
/// characteristics than transferring data from kernel to user space and back, such as with
/// `read` and `write` calls. When `in_len` is `0`, it means to copy until the end of the input file has been
/// reached. Note, however, that partial writes are still possible in this case.
///
/// `in_fd` must be a file descriptor opened for reading, and `out_fd` must be a file descriptor
/// opened for writing. They may be any kind of file descriptor; however, if `in_fd` is not a regular
/// file system file, it may cause this function to fall back to calling `read` and `write`, in which case
/// atomicity guarantees no longer apply.
///
/// Copying begins reading at `in_offset`. The input file descriptor seek position is ignored and not updated.
/// If the output file descriptor has a seek position, it is updated as bytes are written. When
/// `in_offset` is past the end of the input file, it successfully reads 0 bytes.
///
/// `flags` has different meanings per operating system; refer to the respective man pages.
///
/// These systems support atomically sending everything, including headers and trailers:
/// * macOS
/// * FreeBSD
///
/// These systems support in-kernel data copying, but headers and trailers are not sent atomically:
/// * Linux
///
/// Other systems fall back to calling `read` / `write`.
///
/// Linux has a limit on how many bytes may be transferred in one `sendfile` call, which is `0x7ffff000`
/// on both 64-bit and 32-bit systems. This is due to using a signed C int as the return value, as
/// well as stuffing the errno codes into the last `4096` values. This is noted on the `sendfile` man page.
/// The limit on Darwin is `0x7fffffff`, trying to write more than that returns EINVAL.
/// The corresponding POSIX limit on this is `maxInt(isize)`.
pub fn sendfile(
out_fd: fd_t,
in_fd: fd_t,
in_offset: u64,
in_len: u64,
headers: []const iovec_const,
trailers: []const iovec_const,
flags: u32,
) SendFileError!usize {
// Set once the Linux path has written `headers`, so that the generic
// fallback at the bottom does not write them a second time.
var header_done = false;
var total_written: usize = 0;
// Prevents EOVERFLOW.
const size_t = std.meta.Int(.unsigned, @typeInfo(usize).int.bits - 1);
// Largest byte count requested from a single kernel call on each OS.
const max_count = switch (native_os) {
.linux => 0x7ffff000,
.macos, .ios, .watchos, .tvos, .visionos => maxInt(i32),
else => maxInt(size_t),
};
// Each OS-specific arm is labeled `sf`; `break :sf` leaves the arm and
// continues with the generic read/write fallback after this switch.
switch (native_os) {
.linux => sf: {
// The Linux call below takes no header/trailer arguments, so headers are
// written with `writev` first and trailers with `writev` afterwards.
if (headers.len != 0) {
const amt = try writev(out_fd, headers);
total_written += amt;
// A short header write is reported to the caller as a partial write.
if (amt < count_iovec_bytes(headers)) return total_written;
header_done = true;
}
// Here we match BSD behavior, making a zero count value send as many bytes as possible.
const adjusted_count = if (in_len == 0) max_count else @min(in_len, max_count);
const sendfile_sym = if (lfs64_abi) system.sendfile64 else system.sendfile;
// Every switch arm below either returns or breaks, so this loop body
// runs at most once.
while (true) {
var offset: off_t = @bitCast(in_offset);
const rc = sendfile_sym(out_fd, in_fd, &offset, adjusted_count);
switch (errno(rc)) {
.SUCCESS => {
const amt: usize = @bitCast(rc);
total_written += amt;
if (in_len == 0 and amt == 0) {
// We have detected EOF from `in_fd`.
break;
} else if (amt < in_len) {
// Fewer bytes than requested went out: return without writing trailers.
return total_written;
} else {
break;
}
},
.BADF => unreachable, // Always a race condition.
.FAULT => unreachable, // Segmentation fault.
.OVERFLOW => unreachable, // We avoid passing too large of a `count`.
.NOTCONN => return error.BrokenPipe, // `out_fd` is an unconnected socket
.INVAL => {
// EINVAL could be any of the following situations:
// * Descriptor is not valid or locked
// * an mmap(2)-like operation is not available for in_fd
// * count is negative
// * out_fd has the APPEND flag set
// Because of the "mmap(2)-like operation" possibility, we fall back to doing read/write
// manually.
break :sf;
},
.AGAIN => return error.WouldBlock,
.IO => return error.InputOutput,
.PIPE => return error.BrokenPipe,
.NOMEM => return error.SystemResources,
.NXIO => return error.Unseekable,
.SPIPE => return error.Unseekable,
else => |err| {
// The error produced by `unexpectedErrno` is discarded; the code
// falls back to read/write instead of failing.
unexpectedErrno(err) catch {};
break :sf;
},
}
}
if (trailers.len != 0) {
total_written += try writev(out_fd, trailers);
}
return total_written;
},
.freebsd => sf: {
// Headers and trailers are handed to the kernel through `sf_hdtr`;
// `hdtr` stays null when there are neither.
var hdtr_data: std.c.sf_hdtr = undefined;
var hdtr: ?*std.c.sf_hdtr = null;
if (headers.len != 0 or trailers.len != 0) {
// Here we carefully avoid `@intCast` by returning partial writes when
// too many io vectors are provided.
const hdr_cnt = cast(u31, headers.len) orelse maxInt(u31);
if (headers.len > hdr_cnt) return writev(out_fd, headers);
const trl_cnt = cast(u31, trailers.len) orelse maxInt(u31);
hdtr_data = std.c.sf_hdtr{
.headers = headers.ptr,
.hdr_cnt = hdr_cnt,
.trailers = trailers.ptr,
.trl_cnt = trl_cnt,
};
hdtr = &hdtr_data;
}
// Only the `.INTR` arm with zero bytes sent loops again; all other arms
// return or break out to the fallback.
while (true) {
// NOTE(review): `sbytes` starts `undefined` and is read back below even on
// error, which relies on the call always storing to it — confirm.
var sbytes: off_t = undefined;
const err = errno(system.sendfile(in_fd, out_fd, @bitCast(in_offset), @min(in_len, max_count), hdtr, &sbytes, flags));
// Read the byte count before inspecting the errno so that partial
// progress can be returned from the error arms.
const amt: usize = @bitCast(sbytes);
switch (err) {
.SUCCESS => return amt,
.BADF => unreachable, // Always a race condition.
.FAULT => unreachable, // Segmentation fault.
.NOTCONN => return error.BrokenPipe, // `out_fd` is an unconnected socket
.INVAL, .OPNOTSUPP, .NOTSOCK, .NOSYS => {
// EINVAL could be any of the following situations:
// * The fd argument is not a regular file.
// * The s argument is not a SOCK.STREAM type socket.
// * The offset argument is negative.
// Because of some of these possibilities, we fall back to doing read/write
// manually, the same as ENOSYS.
break :sf;
},
.INTR => if (amt != 0) return amt else continue,
.AGAIN => if (amt != 0) {
return amt;
} else {
return error.WouldBlock;
},
.BUSY => if (amt != 0) {
return amt;
} else {
return error.WouldBlock;
},
.IO => return error.InputOutput,
.NOBUFS => return error.SystemResources,
.PIPE => return error.BrokenPipe,
else => {
// The error produced by `unexpectedErrno` is discarded; partial progress
// is returned if there was any, otherwise fall back to read/write.
unexpectedErrno(err) catch {};
if (amt != 0) {
return amt;
} else {
break :sf;
}
},
}
}
},
.macos, .ios, .tvos, .watchos, .visionos => sf: {
// Same `sf_hdtr` setup as the FreeBSD arm.
var hdtr_data: std.c.sf_hdtr = undefined;
var hdtr: ?*std.c.sf_hdtr = null;
if (headers.len != 0 or trailers.len != 0) {
// Here we carefully avoid `@intCast` by returning partial writes when
// too many io vectors are provided.
const hdr_cnt = cast(u31, headers.len) orelse maxInt(u31);
if (headers.len > hdr_cnt) return writev(out_fd, headers);
const trl_cnt = cast(u31, trailers.len) orelse maxInt(u31);
hdtr_data = std.c.sf_hdtr{
.headers = headers.ptr,
.hdr_cnt = hdr_cnt,
.trailers = trailers.ptr,
.trl_cnt = trl_cnt,
};
hdtr = &hdtr_data;
}
// Only the `.INTR` arm with zero bytes sent loops again.
while (true) {
// `sbytes` is passed by pointer holding the requested length and is read
// back afterwards as the number of bytes sent.
var sbytes: off_t = @min(in_len, max_count);
const err = errno(system.sendfile(in_fd, out_fd, @bitCast(in_offset), &sbytes, hdtr, flags));
const amt: usize = @bitCast(sbytes);
switch (err) {
.SUCCESS => return amt,
.BADF => unreachable, // Always a race condition.
.FAULT => unreachable, // Segmentation fault.
.INVAL => unreachable,
.NOTCONN => return error.BrokenPipe, // `out_fd` is an unconnected socket
.OPNOTSUPP, .NOTSOCK, .NOSYS => break :sf,
.INTR => if (amt != 0) return amt else continue,
.AGAIN => if (amt != 0) {
return amt;
} else {
return error.WouldBlock;
},
.IO => return error.InputOutput,
.PIPE => return error.BrokenPipe,
else => {
// The error produced by `unexpectedErrno` is discarded; partial progress
// is returned if there was any, otherwise fall back to read/write.
unexpectedErrno(err) catch {};
if (amt != 0) {
return amt;
} else {
break :sf;
}
},
}
}
},
else => {}, // fall back to read/write
}
// Generic path: reached on other OSes and whenever an arm above did
// `break :sf`. Headers are written here unless the Linux arm already did.
if (headers.len != 0 and !header_done) {
const amt = try writev(out_fd, headers);
total_written += amt;
if (amt < count_iovec_bytes(headers)) return total_written;
}
// Performs at most one `pread` followed by one `write` through a stack buffer.
rw: {
var buf: [8 * 4096]u8 = undefined;
// Here we match BSD behavior, making a zero count value send as many bytes as possible.
const adjusted_count = if (in_len == 0) buf.len else @min(buf.len, in_len);
const amt_read = try pread(in_fd, buf[0..adjusted_count], in_offset);
if (amt_read == 0) {
if (in_len == 0) {
// We have detected EOF from `in_fd`.
break :rw;
} else {
return total_written;
}
}
const amt_written = try write(out_fd, buf[0..amt_read]);
total_written += amt_written;
// Trailers are only reached when the full requested length was written in
// this call, or via the EOF `break :rw` above when `in_len == 0`.
if (amt_written < in_len or in_len == 0) return total_written;
}
if (trailers.len != 0) {
total_written += try writev(out_fd, trailers);
}
return total_written;
}
/// Returns the sum of the `len` fields of all entries in `iovs`.
/// An empty slice yields 0.
fn count_iovec_bytes(iovs: []const iovec_const) usize {
    var total: usize = 0;
    var index: usize = 0;
    while (index < iovs.len) : (index += 1) {
        total += iovs[index].len;
    }
    return total;
}
pub const CopyFileRangeError = error{
FileTooBig,
InputOutput,