Re: [PATCH 1/3] tools/mount-flatpak: init

14 Nov 2025

On 11/14/25 06:12, Alyssa Ross wrote:
...
Demi Marie Obenour <demiobenour@gmail.com> writes:
...
On 11/13/25 07:04, Alyssa Ross wrote:
...

diff --git a/tools/mount-flatpak/mount-flatpak.c b/tools/mount-flatpak/mount-flatpak.c
new file mode 100644
index 0000000..8e09d1d
--- /dev/null
+++ b/tools/mount-flatpak/mount-flatpak.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: EUPL-1.2+
+// SPDX-FileCopyrightText: 2025 Alyssa Ross <hi@alyssa.is>
+
+#include "config.h"
+#include "metadata.h"
+
+#include <err.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/stat.h>
+#include <sys/syscall.h>
+
+#include <linux/mount.h>
+#include <linux/openat2.h>
+
+static void bind_mount(int source_fd, const char *source,
+                       int target_fd, const char *target)
+{
+	int source_tree = syscall(SYS_open_tree, source_fd, source,
+	                          AT_EMPTY_PATH | OPEN_TREE_CLOEXEC |
+	                          OPEN_TREE_CLONE | AT_RECURSIVE);
+	if (source_tree == -1)
+		err(EXIT_FAILURE, "open_tree %s", source);
+	if (syscall(SYS_move_mount, source_tree, "", target_fd, target,
+	            MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH) == -1)
+		err(EXIT_FAILURE, "move_mount");
Missing checks that target does not contain "/" and is not "." or "..".
Right, yes, move_mount doesn't have RESOLVE_BENEATH semantics.  Ideally
I suppose we can leave target empty and only use an fd, but I don't
think that works in all circumstances.
Which ones does it fail in?  Also, should this set the mount read-only?
...
...
...
+}
+
+static int mkdir_openat(int dirfd, const char *path, mode_t mode)
+{
+	int fd;
+	if (mkdirat(dirfd, path, mode) == -1)
+		err(EXIT_FAILURE, "mkdirat %s", path);
+	if ((fd = openat(dirfd, path, O_PATH | O_DIRECTORY | O_CLOEXEC)) == -1)
+		err(EXIT_FAILURE, "openat %s", path);
+	return fd;
+}
+
+// By failing on EEXIST when creating each directory,
+// we can be sure we don't end up following .. components.
+static int mkdirs_beneath(int root, const char *target)
+{
+	int last, fd = root;
Missing check for target being absolute.
if (*target == '/')
      errx(EXIT_FAILURE, "Path is absolute");
if (*target == '\0')
      errx(EXIT_FAILURE, "Path is empty");
Ugh, yes.  I have RESOLVE_BENEATH on the brain.  Sad that it's not
supported for more operations.  Empty path shouldn't be a problem though
I think?  It'd just mean 0 iterations of the loop run.
POSIX requires that an empty pathname is not successfully resolved,
and I'm pretty sure that for all of the callers an empty pathname
should be treated as an error.
...
...
...
+	size_t len = strlen(target);
+	char *end, *path = malloc(len + 1), *dir = path;
+	if (!dir)
+		err(EXIT_FAILURE, "malloc %zu", len + 1);
+	memcpy(dir, target, len + 1);
+
+	do {
+		// Find next non-empty directory component
+		end = strchrnul(dir, '/');
+		while (*(end + 1) == '/')
+			end++;
Off-by-1 overread: should check *end and not *(end + 1).
Also, this skips past the NUL terminator.
This fixes the bug and adds the missing "." and ".." checks:
end = strchrnul(dir, '/');
if (end - dir == 1 && dir[0] == '.')
      	errx(EXIT_FAILURE, "path component is '.'");
if (end - dir == 2 && dir[0] == '.' && dir[1] == '.')
      	errx(EXIT_FAILURE, "path component is '..'");
if (*end != '\0') {
      	while (end[1] == '/')
      		end++;
      } else {
      	end--;
      }
However, the `dir = end + 1` at the end of the loop can be removed
while making the code simpler.  I would write:
end = strchrnul(dir, '/');
if (end - dir == 1 && dir[0] == '.')
      	errx(EXIT_FAILURE, "path component is '.'");
if (end - dir == 2 && dir[0] == '.' && dir[1] == '.')
      	errx(EXIT_FAILURE, "path component is '..'");
if (*end != '\0') {
      	// Replace path separator with string terminator.
      	*end = '\0';
// Advance until end does not point to a '/'.
      	while (*++end == '/') {}
      }
This means that *end now points to the start of the next component or the
NUL terminator, so the following changes are needed.
- Removing `*end = 0`.
- Replacing `dir = end + 1` with `dir = end`.
Feel free to write the while loop in a different way if it is too
code-golfed :)
Not sure what I was thinking there, but I suppose that's what code
review is for.
...
...
+		// Replace path separator with string terminator.
+		*end = 0;
Missing check for a path component being "." or "..".
As I wrote in the comment, these should just fail with EEXIST, no?
They indeed should and do.  I was thinking of openat(), which does
not have this guarantee.
I do recommend having mkdir_openat() assert that the directory does not
contain '/', which would bypass the protections it is meant to provide.
...
...
...
+		// Update fd to a new child, and close the previous one
+		// unless it was the one provided by the caller.
+		last = fd;
+		fd = mkdir_openat(fd, dir, 0755);
+		if (last != root)
+			close(last);
+
+		dir = end + 1;
+	} while (end != &path[len]);
This is slightly fragile against bugs.  I generally prefer:
} while (end < path + len);
so that if end goes past the terminator the loop will exit.
Alternatively,
} while (*end != '\0');
could be used, but that also assumes the code never advances the
pointer past the NUL terminator.
If we've gone past the terminator something has already gone wrong, but
I don't mind changing != to <.> >>> +	free(path);
...
...
+	return fd;
+}
+
+static int openat_beneath(int dirfd, const char *path, int flags)
+{
+	struct open_how how = {
+		.flags = flags,
+		.resolve = RESOLVE_BENEATH | RESOLVE_NO_MAGICLINKS,
Should this also include RESOLVE_NO_XDEV and/or RESOLVE_NO_SYMLINKS?
I don't think that following symlinks is the intended behavior here,
and crossing filesystems also makes no sense.
I'm expecting the mountpoint to be nosymfollow, but it probably wouldn't
hurt.
I think it is better to not rely on nosymfollow where it isn't necessary.
...
...
...
+	};
+	int r = syscall(SYS_openat2, dirfd, path, &how, sizeof how);
+	if (r == -1)
+		err(EXIT_FAILURE, "openat2 %s", path);
+	return r;
+}
+
+static int resolve_link(int dirfd, const char *path, char **target)
+{
+	int r;
+	struct stat sb;
+	if (fstatat(dirfd, path, &sb, AT_SYMLINK_NOFOLLOW) == -1)
+		err(EXIT_FAILURE, "fstatat %s", path);
+	if (!(*target = malloc(sb.st_size + 1)))
+		err(EXIT_FAILURE, "malloc %zu", sb.st_size + 1);
+	if ((r = readlinkat(dirfd, path, *target, sb.st_size + 1)) <= -1)
+		err(EXIT_FAILURE, "readlinkat %s", path);
+	if (r == sb.st_size + 1)
+		errx(EXIT_FAILURE, "symlink target lengthened");
What about this?
if (r > sb.st_size)
      errx(EXIT_FAILURE, "symlink target lengthened");
That might get rid of the clang static analyzer warning too.
IIRC I tried it and it does not, but I can change it anyway.
...
...
+	(*target)[r] = 0; // NOLINT(clang-analyzer-security.ArrayBound)
+	return openat_beneath(dirfd, *target, O_PATH | O_CLOEXEC | O_DIRECTORY);
+}
Linux will follow symbolic links in openat2() with RESOLVE_BENEATH,
unless RESOLVE_NO_SYMLINKS is set.  It still checks that the resolved
path is underneath the provided dirfd.
As-is, you also have a time-of-check to time-of-use race, though losing
it will just cause the program to fail with an error.
...
+static void write_to(int dirfd, const char *path, const char *text)
+{
+	FILE *file;
+	size_t len = strlen(text);
+	int f = openat(dirfd, path,
+	               O_WRONLY | O_CLOEXEC | O_NOFOLLOW | O_CREAT, 0644);
+
+	if (f == -1)
+		err(EXIT_FAILURE, "openat %s", path);
+	if (!(file = fdopen(f, "w")))
+		err(EXIT_FAILURE, "fdopen %s", path);
+
+	if (fwrite(text, 1, len, file) != len)
+		err(EXIT_FAILURE, "fwrite");
+	if (fclose(file) == EOF)
+		err(EXIT_FAILURE, "fclose");
+}
I don't think it is necessary to use fwrite() instead of write() here.
Correct me if I'm wrote, but I believe fwrite() doesn't need to be run
in a loop in case of short writes.
I was thinking of EINTR, but this code has no signal handlers so
EINTR can't happen.
...
...
...
+static void set_up_app_dir(int source_commit_dir, int installation_dir,
+                           const char *id, const char *arch,
+                           const char *branch, const char *commit)
+{
+	int app_dir, id_dir, arch_dir, branch_dir;
+
+	app_dir = mkdir_openat(installation_dir, "app", 0755);
+
+	id_dir = mkdir_openat(app_dir, id, 0755);
+	close(app_dir);
+
+	arch_dir = mkdir_openat(id_dir, arch, 0755);
+	close(id_dir);
+
+	branch_dir = mkdir_openat(arch_dir, branch, 0755);
+	close(arch_dir);
+
+	if (mkdirat(branch_dir, commit, 0755) == -1)
+		err(EXIT_FAILURE, "mkdirat %s", commit);
Should there be a check that `commit` does not have control characters?
If anything is interpreting control characters in these paths, that's
their bug.  We can avoid printing them though.
I'm mostly thinking of ls and friends.  I believe `commit` is actually
a hex-encoded or base64-encoded hash, so it's safe to be strict here.

What *is* critical is ensuring that id, arch, branch, and commit
do not have any '/' characters.  This is currently not checked.
except for `arch`.

Stronger validation would be a good idea, and I recommend using the
same code that updates-dir-check uses.  The reason this is okay is
that most of these are not arbitrary strings:

- `arch` is an architecture, of which there are only a few.
- `commit` is a SHA256 hash, and I believe it is always hex-encoded.
- `id` is a D-Bus name, which is a subset of what updates-dir-check
  allows.

The only exception is `branch`, but that can be checked to at least
not have ASCII control characters or '/'.
...
...
...
+	bind_mount(source_commit_dir, "", branch_dir, commit);
+	close(branch_dir);
+}
+
+static int mount_app(int source_installation_dir, int target_installation_dir,
+                     int params_dir, const char *id)
+{
+	char *arch_and_branch, *arch_end, *commit;
+	int app_dir, id_dir, branch_dir, commit_dir;
+
+	app_dir = openat_beneath(source_installation_dir, "app",
+	                         O_PATH | O_CLOEXEC | O_DIRECTORY);
+	id_dir = openat_beneath(app_dir, id, O_PATH | O_CLOEXEC | O_DIRECTORY);
+	close(app_dir);
+
+	branch_dir = resolve_link(id_dir, "current", &arch_and_branch);
+	close(id_dir);
+
+	commit_dir = resolve_link(branch_dir, "active", &commit);
+	close(branch_dir);
+
+	if (!(arch_end = strchr(arch_and_branch, '/')))
+		errx(EXIT_FAILURE, "unexpected current format");
+	*arch_end = 0;
+
+	set_up_app_dir(commit_dir, target_installation_dir, id,
+	               arch_and_branch, arch_end + 1, commit);
+	write_to(params_dir, "commit", commit);
+	write_to(params_dir, "arch", arch_and_branch);
+	write_to(params_dir, "branch", arch_end + 1);
+
+	free(arch_and_branch);
+	free(commit);
+	return commit_dir;
+}
+
+static void set_up_runtime_dir(int source_commit_dir, int installation_dir,
+                               const char *ref, const char *commit)
+{
+	int runtime_dir, branch_dir;
+
+	runtime_dir = mkdir_openat(installation_dir, "runtime", 0755);
+
+	branch_dir = mkdirs_beneath(runtime_dir, ref);
+	close(runtime_dir);
+
+	if (mkdirat(branch_dir, commit, 0755) == -1)
+		err(EXIT_FAILURE, "mkdirat %s", commit);
+	bind_mount(source_commit_dir, "", branch_dir, commit);
+	close(branch_dir);
+}
+
+static void mount_runtime(int source_installation_dir,
+                          int target_installation_dir,
+                          int params_dir, int app_commit_dir)
+{
+	char runtime[256], *commit;
+	int runtime_dir, branch_dir, commit_dir;
+	int metadata = openat_beneath(app_commit_dir, "metadata",
+	                              O_RDONLY | O_CLOEXEC);
Use O_PATH in case this is something weird like a named pipe.
After checking that it is a regular file, reopen via /proc/self/fd.
Hate that there's not a better way, but good idea.
The safest approach is:

- Use fsopen() to create a completely private /proc.
- Use subset=pid to avoid junk that isn't needed.
- Use /proc/thread-self/fd/SOMETHING.

This is used by <https://github.com/cyphar/libprocrs>, which has to
work in untrusted containers.  This code won't be doing that, though.
...
...
...
+	extract_runtime(metadata, runtime);
+
+	runtime_dir = openat_beneath(source_installation_dir, "runtime",
+	                             O_PATH | O_CLOEXEC | O_DIRECTORY);
+
+	branch_dir = openat_beneath(runtime_dir, runtime,
+	                            O_PATH | O_CLOEXEC | O_DIRECTORY);
+	close(runtime_dir);
+
+	commit_dir = resolve_link(branch_dir, "active", &commit);
+	close(branch_dir);
+
+	set_up_runtime_dir(commit_dir, target_installation_dir,
+	                   runtime, commit);
+	write_to(params_dir, "runtime-commit", commit);
+
+	free(commit);
+	close(commit_dir);
+}
+
+static void set_up_repo(int target_installation_dir)
+{
+	int config;
+
+	if (mkdirat(target_installation_dir, "repo", 0755) == -1)
+		err(EXIT_FAILURE, "mkdir repo");
+	if (mkdirat(target_installation_dir, "repo/objects", 0755) == -1)
+		err(EXIT_FAILURE, "mkdir repo/objects");
+	if (mkdirat(target_installation_dir, "repo/tmp", 0775) == -1)
+		err(EXIT_FAILURE, "mkdir repo/tmp");
+	if (mkdirat(target_installation_dir, "repo/tmp/cache", 0775) == -1)
+		err(EXIT_FAILURE, "mkdir repo/tmp/cache");
Should this use mkdir_openat()?  Alternative, you can use openat()
on the just-created subdirectory and use that FD for subsequent operations,
including using mkdir_openat() for tmp/cache.
I don't think there's any race to worry about the way it is, so using
mkdir_openat() would just result in lots of unnecessary syscalls and
extra code to close all the extra file descriptors.
Okay, now I see what is going on:

- The "app" subdirectory is writable by whoever provided the flatpak.
  It is read-only to the guest running the flatpak.  The latter is
  enforced by the recursive mount_setattr in main().

- The other subdirectories are read-only to the guest running the
  flatpak.  This is enforced by virtiofsd running in a mount namespace
  where the root of its shared directory is a read-only bind mount.
  This is documented as "The VM should not be able to write directly
  into a tmpfs", but it's actually a critical security invariant that
  at least my update code is entirely dependent on.
...
...
...
+	if ((config = openat(target_installation_dir, "repo/config",
+	                     O_WRONLY | O_CLOEXEC | O_NOFOLLOW | O_CREAT,
+	                     0644)) == -1)
+		err(EXIT_FAILURE, "openat repo/config");
Should this use O_EXCL?
Can't hurt.
...
...
+	bind_mount(AT_FDCWD, CONFIG_PATH, config, "");
+
+	close(config);
+}
+
+int main(int, char **argv)
+{
+	char *installation_path, *id;
+	int params_dir, source_installation_dir, target_installation_dir,
+	    app_commit_dir;
+	struct mount_attr attr = {
+		.attr_clr = MOUNT_ATTR_NOSYMFOLLOW,
+		.attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NODEV,
+	};
+
+	if (!(installation_path = *++argv))
+		errx(EXIT_FAILURE, "missing installation path");
+	if (!(id = *++argv))
+		errx(EXIT_FAILURE, "missing app ID");
+
+	if ((source_installation_dir = open(installation_path,
+	                                    O_PATH | O_CLOEXEC | O_DIRECTORY)) == -1)
+		err(EXIT_FAILURE, "open %s", installation_path);
+
+	params_dir = mkdir_openat(AT_FDCWD, "params", 0755);
+	write_to(params_dir, "id", id);
+
+	if (mkdir("flatpak", 0755) == -1)
+		err(EXIT_FAILURE, "mkdir flatpak");
+	if ((target_installation_dir = syscall(SYS_open_tree, AT_FDCWD,
+	                                       "flatpak",
+	                                       AT_EMPTY_PATH | OPEN_TREE_CLONE |
+	                                       OPEN_TREE_CLOEXEC |
+	                                       AT_RECURSIVE)) == -1)
+		err(EXIT_FAILURE, "open_tree flatpak");
I don't know if the "flatpak" directory is visible in the filesystem
at this point.  However, it is at least not visible to untrusted code.
Therefore, I recommend performing all the calls to mkdir() and write_to()
before anything is bind-mounted into this tree.
It is visible in the filesystem.  If the stuff we're bind mounting is
able to mess with the files and directories we're creating though, isn't
it game over regardless?  It's very much not supposed to be able to do
that since it's mounted in a subdirectory.
I forgot that this operates on a part of the filesystem the guest
doesn't have write access to.
This code is very subtle and could use a lot of comments.
In particular, it should be very clear at each point which file
descriptors are to secure directories (not writable by guests)
and which ones are to insecure directories (writable by guest).
One could use the C type system to help with this by using different
wrapper structs for each of them.
-- 
Sincerely,
Demi Marie Obenour (she/her/hers)