This adapts programs using sd_notify for use with s6 readiness notification. Signed-off-by: Demi Marie Obenour <demiobenour@gmail.com> --- The amount of code in sd-notify-adapter is significantly less than that of s6-supervise itself. Furthermore, it duplicates a lot of the functionality in that tool. Therefore, I think that s6-supervise is actually the correct place for this. In particular, this allows reliable reloading of services: send a SIGHUP and wait for RELOADING=1 and then READY=1. --- tools/default.nix | 1 + tools/meson.build | 1 + tools/sd-notify-adapter/meson.build | 4 + tools/sd-notify-adapter/sd-notify-adapter.c | 490 ++++++++++++++++++++++++++++ 4 files changed, 496 insertions(+) diff --git a/tools/default.nix b/tools/default.nix index 95d76a12c5d2cdc0f47134f36dc8ac326bd8aff1..88556e6fbae6584d03e67da5b74fbac306acb16e 100644 --- a/tools/default.nix +++ b/tools/default.nix @@ -70,6 +70,7 @@ stdenv.mkDerivation (finalAttrs: { ./lsvm ./start-vmm ./subprojects + ./sd-notify-adapter ])); }; sourceRoot = "source/tools"; diff --git a/tools/meson.build b/tools/meson.build index 9cebd03e323531fca7600cacf120161a98de16c5..91cb54e4e7472f8fc225bc75b82442d5600a3a9c 100644 --- a/tools/meson.build +++ b/tools/meson.build @@ -21,6 +21,7 @@ if get_option('host') subdir('lsvm') subdir('start-vmm') + subdir('sd-notify-adapter') endif if get_option('guest') diff --git a/tools/sd-notify-adapter/meson.build b/tools/sd-notify-adapter/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..6032a3a7704d49cae0655b43d0189444d3b15e4d --- /dev/null +++ b/tools/sd-notify-adapter/meson.build @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: ISC +# SPDX-FileCopyrightText: 2025 Demi Marie Obenour <demiobenour@gmail.com> + +executable('sd-notify-adapter', 'sd-notify-adapter.c', install: true) diff --git a/tools/sd-notify-adapter/sd-notify-adapter.c b/tools/sd-notify-adapter/sd-notify-adapter.c new file mode 100644 index 
0000000000000000000000000000000000000000..d5356e2957bad8a15a5dd45e32db6bbac915c2c6 --- /dev/null +++ b/tools/sd-notify-adapter/sd-notify-adapter.c @@ -0,0 +1,490 @@ +// SPDX-License-Identifier: ISC +// SPDX-FileCopyrightText: 2025 Demi Marie Obenour <demiobenour@gmail.com> +// check_posix and check_posix_bool are based on code with following license: +// +// Copyright 2014 Daniel Micay +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 

#define _GNU_SOURCE 1
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <getopt.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/un.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sysexits.h>
#include <err.h>
#include <poll.h>

/* TODO: does this need to have credit given to Daniel Micay? */
/*
 * Checks the result of a call that returns a non-negative value on success
 * and -1 on failure.  On success the value is returned unchanged; on failure
 * the printf-style message is printed together with strerror(errno) and the
 * process exits with EX_OSERR.
 */
__attribute__((format(printf, 2, 3)))
intmax_t check_posix(intmax_t arg, const char *fmt, ...) {
	if (arg >= 0)
		return arg;
	assert(arg == -1);
	va_list a;
	va_start(a, fmt);
	verr(EX_OSERR, fmt, a);
	__builtin_unreachable();
}

/* Wrapper that casts the result back to the type of the checked expression. */
#define check_posix(arg, message, ...) \
	((__typeof__(arg))check_posix(arg, message, ## __VA_ARGS__))

/* And same here */
/*
 * Checks the result of a call that returns 0 on success and -1 on failure.
 * On failure the printf-style message is printed together with
 * strerror(errno) and the process exits with EX_OSERR.
 */
__attribute__((format(printf, 2, 3)))
void check_posix_bool(intmax_t arg, const char *fmt, ...)
{
	if (arg != -1) {
		assert(arg == 0);
		return;
	}
	va_list a;
	va_start(a, fmt);
	verr(EX_OSERR, fmt, a);
	va_end(a); /* not reached */
}

/* And same here */
/*
 * Like check_posix_bool(), but takes a fixed message and terminates with
 * _exit() so that no atexit() handlers run.
 */
void check_posix_bool_no_atexit(intmax_t arg, const char *msg) {
	if (arg != -1) {
		assert(arg == 0);
		return;
	}
	perror(msg);
	_exit(EX_OSERR);
}

/* Filled in by handler() when the supervised child terminates. */
static volatile siginfo_t sig_info;
/* PID of the forked child; also the target of forwarded signals. */
static pid_t child_pid;

/*
 * Interrupts a call to ppoll(), which is AS-safe.
 *
 * SIGCHLD: records the siginfo when it reports termination (exit, kill or
 * core dump) of child_pid.  The other handled signals are forwarded to
 * child_pid with kill().  Any other signal number aborts.
 */
static void handler(int signum, siginfo_t *info, void *data)
{
	/* kill() may clobber errno; do not leak that into interrupted code. */
	const int saved_errno = errno;
	(void)data;
	switch (signum) {
	case SIGCHLD:
		switch (info->si_code) {
		case CLD_EXITED:
		case CLD_KILLED:
		case CLD_DUMPED:
			if (info->si_pid == child_pid)
				sig_info = *info;
			break;
		}
		break;
	case SIGTERM:
	case SIGINT:
	case SIGWINCH:
	case SIGHUP:
	case SIGUSR1:
	case SIGUSR2:
	case SIGQUIT:
		kill(child_pid, signum);
		break;
	default:
		abort();
	}
	errno = saved_errno;
}

/*
 * Parses a strictly formatted decimal integer: an optional '-', then digits
 * with no leading zero (a lone "0" is the only way to write zero).  No
 * whitespace, '+' sign or trailing characters are accepted.
 *
 * too_low is 1 below the lower bound: the parsed (signed) value must be
 * strictly greater than too_low.  Because the magnitude is limited to
 * INT_MAX, the result is never INT_MIN, so INT_MIN can serve as the error
 * return and the negation cannot overflow.
 *
 * Returns the parsed value, or INT_MIN if the input is malformed or out of
 * range.
 */
static int parse_int(const char *arg, int too_low) {
	char *end = (char *)arg;
	if (*arg == '0') {
		if (arg[1] == '\0' && too_low < 0)
			return 0;
		return INT_MIN;
	}
	const bool negative = *arg == '-';
	if (negative) {
		arg++;
	}
	if (*arg < '1' || *arg > '9') {
		return INT_MIN;
	}
	errno = 0;
	const long magnitude = strtol(arg, &end, 10);
	if (errno || *end != '\0' || magnitude > INT_MAX) {
		return INT_MIN;
	}
	/* Apply the sign first so the bound is checked on the real value. */
	const long value = negative ? -magnitude : magnitude;
	if (value <= too_low) {
		return INT_MIN;
	}
	return (int)value;
}

/* Long options accepted by main(); must stay in sync with its optstring. */
const struct option longopts[] = {
	{ "oom-score-adj", required_argument, NULL, 'o' },
	{ "help", no_argument, NULL, 'h' },
	{ NULL, 0, NULL, 0 },
};

/*
 * Prints the command-line synopsis via errx() (which prefixes the program
 * name and appends a newline) and exits with status arg.
 */
static _Noreturn void usage(int arg) {
	errx(arg, "Usage: [[-o|--oom-score-adj] adjustment] -- notification-fd notify-socket program argv0 args...");
}

/* check_fd_usable() masks the access mode with the literal 3. */
#if O_RDONLY != 0 || O_WRONLY != 1 || O_RDWR != 2
# error unsupported O_* constants
#endif

/*
 * Verifies that fd is open and has the requested access mode (O_RDWR is
 * accepted for either direction).  Exits with EX_USAGE if the descriptor is
 * closed or has the wrong mode, EX_OSERR on any other fcntl() failure.
 */
static void check_fd_usable(int fd, bool writable) {
	int raw_flags = fcntl(fd, F_GETFL);
	if (raw_flags == -1) {
		err(errno == EBADF ? EX_USAGE : EX_OSERR, "fcntl(%d, F_GETFL)", fd);
	}
	int flags = raw_flags & 3;
	if (flags != O_RDWR && flags != (writable ? O_WRONLY : O_RDONLY)) {
		errx(EX_USAGE, "File descriptor %d is not %s (flags 0x%x)", fd, writable ? "writable" : "readable", raw_flags);
	}
}

/*
 * Verifies that fd refers to a FIFO whose mode grants no read or write
 * permission to group or other; exits with EX_USAGE otherwise.
 */
static void check_pipe(int fd) {
	struct stat buf;
	check_posix_bool(fstat(fd, &buf), "fstat");
	if (!S_ISFIFO(buf.st_mode)) {
		errx(EX_USAGE, "notification fd %d is not a pipe", fd);
	}
	if (buf.st_mode & 066) {
		errx(EX_USAGE, "notification fd %d is accessible by group or other", fd);
	}
}

/*
 * Walks the SOL_SOCKET control messages attached to msg.
 *
 * Every file descriptor received via SCM_RIGHTS is closed immediately.  The
 * PID from an SCM_CREDENTIALS message is returned; if no (complete)
 * credentials message is present the result is -1.
 */
static pid_t process_cmsg(struct msghdr *const msg) {
	pid_t sender_pid = -1;
	for (struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
		if (cmsg->cmsg_level != SOL_SOCKET) {
			continue;
		}
		/* CMSG_LEN(0) is the header size including alignment padding. */
		if (cmsg->cmsg_len < CMSG_LEN(0)) {
			continue;
		}
		const size_t data_len = cmsg->cmsg_len - CMSG_LEN(0);
		if (cmsg->cmsg_type == SCM_RIGHTS) {
			int received_fd;
			for (size_t i = 0; data_len - i >= sizeof(received_fd); i += sizeof(received_fd)) {
				memcpy(&received_fd, CMSG_DATA(cmsg) + i, sizeof(received_fd));
				(void)close(received_fd);
			}
		} else if (cmsg->cmsg_type == SCM_CREDENTIALS) {
			struct ucred creds;
			if (data_len < sizeof(creds)) {
				continue; /* truncated credentials: treat as absent */
			}
			memcpy(&creds, CMSG_DATA(cmsg), sizeof(creds));
			sender_pid = creds.pid;
		}
	}
	return sender_pid;
}

/* PID of this process; set to -1 to disarm kill_process_group(). */
static pid_t own_pid;

/* atexit() hook: sends SIGKILL to the whole process group unless disarmed. */
void kill_process_group(void) {
	if (own_pid != -1)
		kill(0, SIGKILL);
}
+ +int main(int argc, char **argv) { + own_pid = getpid(); + int oom_score_adj = INT_MIN; + if (argc < 1) { + errx(EX_USAGE, "argv[0] is NULL"); + } + if (own_pid < 2) { + errx(EX_USAGE, "cannot run as PID 1"); + } + for (int i = 0; i < 3; ++i) { + check_fd_usable(i, i != 0); + } + + for (;;) { + int longindex = -1; + const char *lastopt = argv[optind]; + int res = getopt_long(argc, argv, "+o:h", longopts, &longindex); + if (res == -1) { + if (argc - optind < 4) { + usage(EX_USAGE); + } + if (strcmp(lastopt, "--") != 0) { + errx(EX_USAGE, "no -- before non-option arguments"); + } + break; + } + if (res == '?') { + usage(EX_USAGE); + } + /* getopt_long accepts abbreviated options. Disable this misfeature. */ + if (lastopt[0] == '-' && lastopt[1] == '-') { + const char *optname = lastopt + 2; + assert(longindex >= 0 && longindex < (int)(sizeof(longopts)/sizeof(longopts[0]))); + const char *expected = longopts[longindex].name; + if (strncmp(expected, optname, strlen(expected)) != 0) { + char *equal = strchr(optname, '='); + errx(EX_USAGE, + "Option --%.*s must be written as --%s", + equal ? 
(int)(equal - optname) : INT_MAX, + optname, expected); + } + } + switch (res) { + case 'o': + oom_score_adj = parse_int(optarg, INT_MIN); + if (oom_score_adj < -1000 || oom_score_adj > 1000) { + errx(EX_USAGE, "Invalid OOM score adjustment %s", optarg); + } + break; + case 'h': + usage(0); + default: + assert(0); /* not reached */ + } + } + + union { + struct sockaddr_un un; + struct sockaddr addr; + } a = {}; + int notification_fd = parse_int(argv[optind], 2); + const char *const socket_path = argv[optind + 1]; + const char *const progname = argv[optind + 2]; + char **const args_to_exec = argv + optind + 3; + + if (notification_fd < 3) { + errx(EX_USAGE, "Invalid notification descriptor %s\n", argv[optind]); + } + check_fd_usable(notification_fd, true); + check_pipe(notification_fd); + check_posix_bool(chdir("/"), "chdir(/)"); + + mode_t old_mask = umask(0077); + size_t len = strlen(socket_path); + if (len >= sizeof(a.un.sun_path)) { + /* TODO: use stravis() */ + errx(EX_USAGE, "Path %s is too long", socket_path); + } + if (socket_path[0] != '/') { + /* TODO: use stravis() */ + errx(EX_USAGE, "Path %s is not absolute", socket_path); + } + + if (check_posix(getpgrp(), "getpgrp()") != own_pid) { + check_posix_bool(setsid(), "setsid"); + } + + memcpy(a.un.sun_path, socket_path, len + 1); + a.un.sun_family = AF_UNIX; + int fd = check_posix(socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0), "socket"); + { + int flag = 1; + check_posix_bool(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &flag, (socklen_t)sizeof(flag)), + "setsockopt(SO_REUSEADDR)"); + check_posix_bool(setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &flag, (socklen_t)sizeof(flag)), + "setsockopt(SO_PASSCRED)"); + } + for (;;) { + int status; + do { + status = bind(fd, &a.addr, (socklen_t)(len + 1 + offsetof(struct sockaddr_un, sun_path))); + } while (status == -1 && errno == EINTR); + if (!(status == -1 && errno == EADDRINUSE)) { + check_posix_bool(status, "bind(%s)", socket_path); + break; + } + 
check_posix_bool(unlink(socket_path), "unlink(%s)", socket_path); + } + umask(old_mask); + + /* TODO: support commands */ + sigset_t sigset; + sigemptyset(&sigset); + const int sigs[] = { SIGCHLD, SIGQUIT, SIGTERM, SIGINT, SIGWINCH, SIGHUP, SIGUSR1, SIGUSR2 }; + for (size_t i = 0; i < sizeof(sigs)/sizeof(sigs[0]); ++i) { + check_posix_bool(sigaddset(&sigset, sigs[i]), "sigaddset(%d)", sigs[i]); + } + check_posix_bool(sigprocmask(SIG_BLOCK, &sigset, NULL), "sigprocmask"); + struct sigaction act = { }; + /* systemd ignores SIGPIPE, so emulate this */ + act.sa_handler = SIG_IGN; + check_posix_bool(sigaction(SIGPIPE, &act, NULL), "sigaction(SIGPIPE)"); + /* add handlers */ + act.sa_sigaction = handler; + act.sa_mask = sigset; /* these are already blocked but that is harmless */ + act.sa_flags = SA_SIGINFO; + for (size_t i = 0; i < sizeof(sigs)/sizeof(sigs[0]); ++i) { + check_posix_bool(sigaction(sigs[i], &act, NULL), "sigaction(%d)", sigs[i]); + } + static_assert(EOF == -1, "bad EOF definition"); + check_posix_bool(fflush(NULL), "fflush"); + if (oom_score_adj != INT_MIN) { + char *p; + int fd = check_posix(open("/proc/self/oom_score_adj", + O_WRONLY | O_CLOEXEC | O_NOCTTY | O_NOFOLLOW), + "open(\"/proc/self/oom_score_adj\")"); + int to_write = check_posix(asprintf(&p, "%d\n", oom_score_adj), "asprintf"); + ssize_t written = check_posix(write(fd, p, (size_t)to_write), "write(\"/proc/self/oom_score_adj\")"); + assert(written == to_write); + free(p); + } + pid_t pid = fork(); + if (pid < 0) { + assert(pid == -1); + err(EX_OSERR, "fork()"); + } + child_pid = pid; + + if (atexit(kill_process_group)) { + errx(EX_OSERR, "atexit()"); + } + if (pid == 0) { + check_posix_bool_no_atexit(syscall(SYS_close_range, 3L, ~0UL, 0L), "close_range()"); + check_posix_bool_no_atexit(setenv("NOTIFY_SOCKET", a.un.sun_path, 1), "setenv"); + check_posix_bool_no_atexit(execve(progname, args_to_exec, environ), "execve"); + abort(); /* not reached */ + } + char buf[sizeof("RELOADING=1\n") - 1]; + 
struct iovec v[1] = { + { + .iov_base = buf, + .iov_len = sizeof(buf), + }, + }; + union { + struct cmsghdr hdr; + char buf[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int) * 253)]; + } cmsg_buffer; + + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = v, + .msg_iovlen = sizeof(v)/sizeof(v[0]), + .msg_control = cmsg_buffer.buf, + .msg_controllen = sizeof(cmsg_buffer.buf), + .msg_flags = 0, + }; + sigemptyset(&sigset); + + struct pollfd p[] = { + { + .fd = fd, + .events = POLLIN | POLLPRI | POLLRDHUP, + .revents = 0, + }, + }; + bool ready = false; + bool reloading = false; + + for (;;) { + /* Main event loop */ + if (sig_info.si_pid) { + int status; + int r = waitpid(sig_info.si_pid, &status, 0); + if (r == -1) + err(EX_OSERR, "waitpid(%jd)", (intmax_t)sig_info.si_pid); + if (WIFSIGNALED(status)) { + for (;;) { + (void)signal(WTERMSIG(status), SIG_DFL); + (void)kill(getpid(), WTERMSIG(status)); + } + } else if (WIFEXITED(status)) { + own_pid = -1; + return WEXITSTATUS(status); + } else { + abort(); /* cannot happen */ + } + } + int r = ppoll(p, sizeof(p)/sizeof(p[0]), NULL, &sigset); + if (r < -1 || r > (int)(sizeof(p)/sizeof(p[0]))) { + abort(); + } + if (r == -1) { + if (errno == ENOMEM) { + fprintf(stderr, "Kernel out of memory in ppoll()\n"); + continue; /* todo: use epoll(7) instead */ + } + if (errno == EINTR) { + fprintf(stderr, "ppoll() interrupted by signal\n"); + continue; + } + err(EX_OSERR, "poll"); + } + fprintf(stderr, "Returned from poll()\n"); + if (p[0].revents) { + ssize_t data = recvmsg(fd, &msg, MSG_CMSG_CLOEXEC | MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK); + if (data == -1) { + if (errno == EINTR) { + fprintf(stderr, "recvmsg() interrupted by signal"); + continue; /* signal caught */ + } + if (errno == EAGAIN || errno == EWOULDBLOCK) { + fprintf(stderr, "ppoll() spurious wakeup\n"); + continue; /* spurious wakeup */ + } + } + size_t size = (size_t)check_posix(data, "recvmsg"); + pid_t sender_pid = process_cmsg(&msg); 
+ if (msg.msg_flags & MSG_TRUNC) { + char *b = (v[0].iov_base == buf) ? malloc(size) : realloc(v[0].iov_base, size); + if (b != NULL) { + v[0].iov_base = b; + v[0].iov_len = size; + } + } + size = (size_t)check_posix(recvmsg(fd, &msg, MSG_CMSG_CLOEXEC | MSG_DONTWAIT | MSG_TRUNC), "recvmsg"); + sender_pid = process_cmsg(&msg); + if (sender_pid != child_pid) { + fprintf(stderr, "%jd cannot notify\n", (intmax_t)sender_pid); + continue; /* cannot notify */ + } + const char *cursor = v[0].iov_base; + const char *end = cursor + size; + for (;;) { + char *next = memchr(cursor, '\n', (size_t)(end - cursor)); + size_t message_size = (size_t)((next == NULL ? end : next) - cursor); + if (message_size == sizeof("READY=1") - 1 && memcmp(cursor, "READY=1", sizeof("READY=1") - 1) == 0) { + if (!ready) { + if (check_posix(write(notification_fd, "Ready\n", sizeof("Ready")), "write") != sizeof("Ready")) { + errx(EX_OSERR, "cannot notify parent of readiness"); + } + } + ready = true; + if (reloading) { + fprintf(stderr, "Configuration reload complete\n"); + } else { + fprintf(stderr, "Program ready\n"); + } + reloading = false; + } else if (message_size == sizeof("RELOADING=1") - 1 && memcmp(cursor, "RELOADING=1", sizeof("RELOADING=1") - 1) == 0) { + reloading = true; + } + if (next == NULL) { + break; + } + cursor = next + 1; + } + } + } +} -- 2.51.0