Last active
August 10, 2023 22:37
-
-
Save snarkmaster/c2d4765b19e69f1626cafc076e60f8ac to your computer and use it in GitHub Desktop.
Linux demo: Creating irrevocably read-only bind mounts (`MNT_LOCK_READONLY`) via mount propagation into another user NS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
This shows one of two possible methods (as of Linux v6.4.9) for setting up a
MNT_LOCK_READONLY mount, which is a mount that cannot be remounted
read-write even by a fully privileged super-user. The same principle applies
to locking NODEV, NOSUID, and NOEXEC [1].

Hopefully, at some point the Linux kernel will support setting locked bits
via `mount_setattr` [2], but as of v6.4.9, this appears to be the simplest way.

This `attach_recursive_mnt`-based method works best if you can have a
persistent daemon, which exchanges file descriptors with the actual mounting
process over AF_UNIX sockets via SCM_RIGHTS. Then, you do NOT need a `clone`
per mount.

If you're going to spawn a helper process per mount anyway, then the
`copy_mnt_ns` method is better, see:
https://gist.github.com/snarkmaster/5ca6b668499bf2c9010fd68227d64887

This works as follows:
- As seen in [1], whether a mount is locked is a bit on `struct mount`. Once
  locked, these bits cannot be cleared.
- The call chain `move_mount` -> `do_move_mount` -> `attach_recursive_mnt`
  will trigger `lock_mnt_tree` if the mount's user namespace differs from
  the calling process's user namespace, see [3].

Mount locking is a standard part of the user NS security model, per `man
mount_namespaces`:

> A mount namespace has an owner user namespace. A mount namespace whose
> owner user namespace is different from the owner user namespace of its
> parent mount namespace is considered a less privileged mount namespace.
>
> The mount(2) flags MS_RDONLY, MS_NOSUID, MS_NOEXEC, and the "atime" flags
> (MS_NOATIME, MS_NODIRATIME, MS_RELATIME) settings become locked when
> propagated from a more privileged to a less privileged mount namespace,
> and may not be changed in the less privileged mount namespace.

The only innovation here is that we generate a locked mount (via mount
propagation), and ship it back to the originating namespace, showing that
mounts can be locked even within the outermost user NS.

Demo:
  mkdir src dest
  touch src/foo
  g++ -Wall -o locked_mount_via_propagation locked_mount_via_propagation.cpp
  ./locked_mount_via_propagation src dest
  mount -o rw,remount dest
  mount: /tmp/tmp.TDKtNTRrOZ/dest: permission denied.

This is NOT production code. Some problems deliberately left for brevity:
- doing work in post-clone code -- risky / incompatible with threaded code
  and sanitizers,
- not cleaning up stray mounts on error paths,
- `fflush` / `perror` in the `clone` code paths,
- failing to use AT_SYMLINK_NOFOLLOW / AT_RECURSIVE as appropriate,
- not closing FDs or setting CLOEXEC by default.

Credit: [email protected] for the clone3 / CLONE_FILES trick + waitid(P_PIDFD)

References:
[1] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2069
[2]
https://lore.kernel.org/linux-fsdevel/[email protected]
[3] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2261
*/
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <sched.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <functional>
#include <vector>
// START: Syscall stubs to aid compilation with older userspace | |
namespace { | |
static pid_t sys_clone3(struct clone_args *args, size_t size) { | |
fflush(stdout); | |
fflush(stderr); | |
return syscall(__NR_clone3, args, size); | |
} | |
int sys_open_tree(int dfd, const char *filename, unsigned int flags) { | |
return syscall(__NR_open_tree, dfd, filename, flags); | |
} | |
static inline int sys_mount_setattr( | |
int dfd, const char *path, unsigned int flags, struct mount_attr *attr) { | |
return syscall(__NR_mount_setattr, dfd, path, flags, attr, sizeof(*attr)); | |
} | |
int sys_move_mount( | |
int from_dfd, | |
const char *from_path, | |
int to_dfd, | |
const char *to_path, | |
unsigned int flags) { | |
return syscall(__NR_move_mount, from_dfd, from_path, to_dfd, to_path, flags); | |
} | |
int sys_waitid( | |
int which, | |
pid_t pid, | |
siginfo_t *info, | |
int options, | |
struct rusage *ru) { | |
return syscall(__NR_waitid, which, pid, info, options, ru); | |
} | |
} // namespace | |
// END: Syscall stubs | |
// Evaluates `x` exactly once and yields its value. If that value is -1, the
// stringified expression is reported via `perror` and the process terminates
// with `_exit(-errno)`.
//
// `errno` is copied before `perror` runs, because `perror` is allowed to
// modify it and the exit status must reflect the failure of `x` itself.
#define ERR_EXIT(x)                 \
  ([&]() {                          \
    auto ret = (x);                 \
    if (ret == -1) {                \
      const int savedErrno = errno; \
      perror(#x);                   \
      _exit(-savedErrno);           \
    }                               \
    return ret;                     \
  }())
std::vector<int> cloneChildAndAwaitFds( | |
size_t numFds, | |
std::function<void(int*, size_t)> childFn, | |
__u64 cloneFlags) { | |
std::vector<int> fdsToReplace; | |
for (size_t i = 0; i < numFds; ++i) { | |
fdsToReplace.emplace_back(memfd_create("fd_replaced_by_child", 0)); | |
} | |
int childPidfd = 0; | |
struct clone_args cloneArgs = { | |
.flags = CLONE_PIDFD | CLONE_FILES | cloneFlags, | |
.pidfd = (__u64)&childPidfd, | |
.exit_signal = SIGCHLD, | |
}; | |
if (0 == ERR_EXIT(sys_clone3(&cloneArgs, sizeof(cloneArgs)))) { | |
int outFds[numFds]; | |
childFn(outFds, numFds); | |
for (size_t i = 0; i < numFds; ++i) { | |
ERR_EXIT(dup2(outFds[i], fdsToReplace[i])); | |
} | |
_exit(0); | |
} | |
siginfo_t info = {}; | |
ERR_EXIT(sys_waitid(P_PIDFD, childPidfd, &info, WEXITED, NULL)); | |
assert(WIFEXITED(info.si_status)); | |
assert(WEXITSTATUS(info.si_status) == 0); | |
return fdsToReplace; | |
} | |
int main(int argc, char** argv) { | |
if (argc < 3) { | |
fprintf(stderr, "Usage: %s src dest\n", argv[0]); | |
return 1; | |
} | |
const char* src = argv[1]; | |
const char* dest = argv[2]; | |
char tempDir[] = "/tmp/mnt_prop_tunnel.XXXXXX"; | |
if (mkdtemp(tempDir) == nullptr) { | |
perror("mkdtemp"); | |
_exit(1); | |
} | |
auto origTreeFd = ERR_EXIT(sys_open_tree(AT_FDCWD, tempDir, OPEN_TREE_CLONE)); | |
{ | |
struct mount_attr attr { .propagation = MS_SLAVE }; | |
ERR_EXIT(sys_mount_setattr(origTreeFd, "", AT_EMPTY_PATH, &attr)); | |
ERR_EXIT(sys_move_mount( | |
origTreeFd, "", AT_FDCWD, tempDir, MOVE_MOUNT_F_EMPTY_PATH)); | |
} | |
auto helperOutFds = cloneChildAndAwaitFds( | |
2, | |
[&](int* outFds, size_t numFds) { | |
if (numFds != 2) { _exit(1); } | |
outFds[0] = ERR_EXIT(sys_open_tree(AT_FDCWD, tempDir, 0)); | |
outFds[1] = open("/proc/self/ns/mnt", O_RDONLY); | |
chdir(tempDir); | |
mkdir("mountpoint", 0777); | |
}, | |
CLONE_NEWUSER | CLONE_NEWNS); | |
{ | |
auto srcFd = ERR_EXIT(sys_open_tree(AT_FDCWD, src, OPEN_TREE_CLONE)); | |
struct mount_attr attr = {.attr_set = MOUNT_ATTR_RDONLY}; | |
ERR_EXIT(sys_mount_setattr(srcFd, "", AT_EMPTY_PATH, &attr)); | |
ERR_EXIT(sys_move_mount( | |
srcFd, "", origTreeFd, "mountpoint", MOVE_MOUNT_F_EMPTY_PATH)); | |
} | |
auto lockedDestFdVec = cloneChildAndAwaitFds( | |
1, | |
[&](int* outFds, size_t numFds) { | |
if (numFds != 1) { _exit(1); } | |
ERR_EXIT(setns(helperOutFds[1], CLONE_NEWNS)); | |
outFds[0] = ERR_EXIT(sys_open_tree( | |
helperOutFds[0], "mountpoint", OPEN_TREE_CLONE)); | |
}, | |
0); | |
ERR_EXIT(sys_move_mount( | |
lockedDestFdVec[0], "", AT_FDCWD, dest, MOVE_MOUNT_F_EMPTY_PATH)); | |
ERR_EXIT(umount2(tempDir, MNT_DETACH)); | |
struct mount_attr clearRdOnly = {.attr_clr = MOUNT_ATTR_RDONLY}; | |
assert(sys_mount_setattr(AT_FDCWD, dest, 0, &clearRdOnly) == -1); | |
assert(errno == EPERM); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment