std.process.Child: add waitForSpawn

`std.Build.Step.Run` makes the very reasonable assumption that
`error.InvalidExe` will be reported on `spawn` if it will happen.
However, this property does not currently hold on POSIX targets. This
is, through a slightly convoluted series of events, partially
responsible for the sporadic `BrokenPipe` errors we've been seeing more
and more in CI runs.

Making `spawn` wait for the child to exec in the POSIX path introduces
a block of up to 400us. So, instead of doing that, we add a new API for
this particular case: `waitForSpawn`. This function is a nop on Windows,
but on POSIX it blocks until the child successfully (or otherwise) calls
`execvpe`, and reports the error if necessary. `std.Build.Step.Run`
calls this function, so that it can get `error.InvalidExe` when it wants
it.

I'm not convinced that this API is optimal. However, I think this entire
API needs to be either heavily refactored or straight-up redesigned
(related: #22504), so I'm not too worried about hitting the perfect API:
I'd rather just fix this bug for now, and figure out the long-term goal
a bit later.
This commit is contained in:
mlugg 2025-01-10 10:27:32 +00:00
parent b8e568504e
commit 048e85f27e
No known key found for this signature in database
GPG Key ID: 3F5B7DCCBF4AF02E
2 changed files with 50 additions and 80 deletions

View File

@ -1354,6 +1354,9 @@ fn spawnChildAndCollect(
_ = child.kill() catch {};
}
// We need to report `error.InvalidExe` *now* if applicable.
try child.waitForSpawn();
var timer = try std.time.Timer.start();
const result = if (run.stdio == .zig_test)

View File

@ -73,7 +73,7 @@ cwd: ?[]const u8,
/// Once that is done, `cwd` will be deprecated in favor of this field.
cwd_dir: ?fs.Dir = null,
err_pipe: ?if (native_os == .windows) void else [2]posix.fd_t,
err_pipe: if (native_os == .windows) void else ?posix.fd_t,
expand_arg0: Arg0Expand,
@ -211,7 +211,7 @@ pub fn init(argv: []const []const u8, allocator: mem.Allocator) ChildProcess {
.argv = argv,
.id = undefined,
.thread_handle = undefined,
.err_pipe = null,
.err_pipe = if (native_os == .windows) {} else null,
.term = null,
.env_map = null,
.cwd = null,
@ -293,17 +293,49 @@ pub fn killPosix(self: *ChildProcess) !Term {
error.ProcessNotFound => return error.AlreadyTerminated,
else => return err,
};
self.waitUnwrapped();
self.waitUnwrappedPosix();
return self.term.?;
}
pub const WaitError = SpawnError || std.os.windows.GetProcessMemoryInfoError;
/// On some targets, `spawn` may not report all spawn errors, such as `error.InvalidExe`.
/// This function will block until any spawn errors can be reported, and return them.
pub fn waitForSpawn(self: *ChildProcess) SpawnError!void {
if (native_os == .windows) return; // `spawn` reports everything
if (self.term) |term| {
_ = term catch |spawn_err| return spawn_err;
return;
}
const err_pipe = self.err_pipe orelse return;
self.err_pipe = null;
// Wait for the child to report any errors in or before `execvpe`.
if (readIntFd(err_pipe)) |child_err_int| {
posix.close(err_pipe);
const child_err: SpawnError = @errorCast(@errorFromInt(child_err_int));
self.term = child_err;
return child_err;
} else |_| {
// Write end closed by CLOEXEC at the time of the `execvpe` call, indicating success!
posix.close(err_pipe);
}
}
/// Blocks until child process terminates and then cleans up all resources.
pub fn wait(self: *ChildProcess) WaitError!Term {
const term = if (native_os == .windows) try self.waitWindows() else self.waitPosix();
try self.waitForSpawn(); // report spawn errors
if (self.term) |term| {
self.cleanupStreams();
return term;
}
switch (native_os) {
.windows => try self.waitUnwrappedWindows(),
else => self.waitUnwrappedPosix(),
}
self.id = undefined;
return term;
return self.term.?;
}
pub const RunResult = struct {
@ -405,26 +437,6 @@ pub fn run(args: struct {
};
}
fn waitWindows(self: *ChildProcess) WaitError!Term {
if (self.term) |term| {
self.cleanupStreams();
return term;
}
try self.waitUnwrappedWindows();
return self.term.?;
}
fn waitPosix(self: *ChildProcess) SpawnError!Term {
if (self.term) |term| {
self.cleanupStreams();
return term;
}
self.waitUnwrapped();
return self.term.?;
}
fn waitUnwrappedWindows(self: *ChildProcess) WaitError!void {
const result = windows.WaitForSingleObjectEx(self.id, windows.INFINITE, false);
@ -447,7 +459,7 @@ fn waitUnwrappedWindows(self: *ChildProcess) WaitError!void {
return result;
}
fn waitUnwrapped(self: *ChildProcess) void {
fn waitUnwrappedPosix(self: *ChildProcess) void {
const res: posix.WaitPidResult = res: {
if (self.request_resource_usage_statistics) {
switch (native_os) {
@ -469,7 +481,7 @@ fn waitUnwrapped(self: *ChildProcess) void {
}
fn handleWaitResult(self: *ChildProcess, status: u32) void {
self.term = self.cleanupAfterWait(status);
self.term = statusToTerm(status);
}
fn cleanupStreams(self: *ChildProcess) void {
@ -487,46 +499,6 @@ fn cleanupStreams(self: *ChildProcess) void {
}
}
fn cleanupAfterWait(self: *ChildProcess, status: u32) !Term {
if (self.err_pipe) |err_pipe| {
defer destroyPipe(err_pipe);
if (native_os == .linux) {
var fd = [1]posix.pollfd{posix.pollfd{
.fd = err_pipe[0],
.events = posix.POLL.IN,
.revents = undefined,
}};
// Check if the eventfd buffer stores a non-zero value by polling
// it, that's the error code returned by the child process.
_ = posix.poll(&fd, 0) catch unreachable;
// According to eventfd(2) the descriptor is readable if the counter
// has a value greater than 0
if ((fd[0].revents & posix.POLL.IN) != 0) {
const err_int = try readIntFd(err_pipe[0]);
return @as(SpawnError, @errorCast(@errorFromInt(err_int)));
}
} else {
// Write maxInt(ErrInt) to the write end of the err_pipe. This is after
// waitpid, so this write is guaranteed to be after the child
// pid potentially wrote an error. This way we can do a blocking
// read on the error pipe and either get maxInt(ErrInt) (no error) or
// an error code.
try writeIntFd(err_pipe[1], maxInt(ErrInt));
const err_int = try readIntFd(err_pipe[0]);
// Here we potentially return the fork child's error from the parent
// pid.
if (err_int != maxInt(ErrInt)) {
return @as(SpawnError, @errorCast(@errorFromInt(err_int)));
}
}
}
return statusToTerm(status);
}
fn statusToTerm(status: u32) Term {
return if (posix.W.IFEXITED(status))
Term{ .Exited = posix.W.EXITSTATUS(status) }
@ -636,18 +608,9 @@ fn spawnPosix(self: *ChildProcess) SpawnError!void {
}
};
// This pipe is used to communicate errors between the time of fork
// and execve from the child process to the parent process.
const err_pipe = blk: {
if (native_os == .linux) {
const fd = try posix.eventfd(0, linux.EFD.CLOEXEC);
// There's no distinction between the readable and the writeable
// end with eventfd
break :blk [2]posix.fd_t{ fd, fd };
} else {
break :blk try posix.pipe2(.{ .CLOEXEC = true });
}
};
// This pipe communicates to the parent errors in the child between `fork` and `execvpe`.
// It is closed by the child (via CLOEXEC) without writing if `execvpe` succeeds.
const err_pipe: [2]posix.fd_t = try posix.pipe2(.{ .CLOEXEC = true });
errdefer destroyPipe(err_pipe);
const pid_result = try posix.fork();
@ -687,6 +650,11 @@ fn spawnPosix(self: *ChildProcess) SpawnError!void {
}
// we are the parent
errdefer comptime unreachable; // The child is forked; we must not error from now on
posix.close(err_pipe[1]); // make sure only the child holds the write end open
self.err_pipe = err_pipe[0];
const pid: i32 = @intCast(pid_result);
if (self.stdin_behavior == .Pipe) {
self.stdin = .{ .handle = stdin_pipe[1] };
@ -705,7 +673,6 @@ fn spawnPosix(self: *ChildProcess) SpawnError!void {
}
self.id = pid;
self.err_pipe = err_pipe;
self.term = null;
if (self.stdin_behavior == .Pipe) {