Last active
August 10, 2023 23:14
-
-
Save snarkmaster/5ca6b668499bf2c9010fd68227d64887 to your computer and use it in GitHub Desktop.
Linux demo: Creating irrevocably read-only bind mounts (`MNT_LOCK_READONLY`) via user namespace moves
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
This shows one of two possible methods (as of Linux v6.4.9) for setting up a
MNT_LOCK_READONLY mount, which is a mount that cannot be remounted | |
read-write even by a fully privileged super-user. The same principle applies | |
to locking NODEV, NOSUID, and NOEXEC [1]. | |
Hopefully, at some point the Linux kernel will support setting locked bits | |
via `mount_setattr` -- see [2]. | |
This works as follows: | |
- As seen in [1], whether a mount is locked is a bit on `struct mount`. Once | |
locked, these bits cannot be cleared [3]. | |
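- When we create the Helper child, it unshares into a new user + mount
namespace. Every pre-existing mount copied into that less privileged mount
namespace (including our read-only bind mount) has its flags locked, and
the clone the Helper takes of it with `open_tree` stays locked.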
Mount locking is a standard part of the user NS security model, per `man | |
mount_namespaces`: | |
> A mount namespace has an owner user namespace. A mount namespace whose | |
> owner user namespace is different from the owner user namespace of its | |
> parent mount namespace is considered a less privileged mount namespace. | |
> | |
> The mount(2) flags MS_RDONLY, MS_NOSUID, MS_NOEXEC, and the "atime" flags | |
> (MS_NOATIME, MS_NODIRATIME, MS_RELATIME) settings become locked when | |
> propagated from a more privileged to a less privileged mount namespace, | |
> and may not be changed in the less privileged mount namespace. | |
The only innovation here is that after we generate a locked mount (by | |
creating a new user & mount NS), we ship it back to the originating | |
namespace, showing that mounts remain locked even once they return to the | |
outermost user NS. | |
This is NOT production code. Some problems deliberately left for brevity: | |
- doing work (including `perror` without flushing!) in post-fork -- risky, | |
incompatible with threads & sanitizers | |
- read / write without EINTR / retries, | |
- `SCM_RIGHTS` would be cleaner than `pidfd_getfd`, | |
- failing to use AT_SYMLINK_NOFOLLOW / AT_RECURSIVE as appropriate, | |
- not cleaning up stray mounts, especially on error paths, | |
- not closing FDs. | |
Demo: | |
gcc -Wall -o locked_mount_via_newns locked_mount_via_newns.cpp | |
sudo env PREV_PWD=$(pwd) unshare -m # `unshare` not required, can be `bash` | |
cd $(mktemp -d) | |
mkdir src dest temp | |
"$PREV_PWD"/locked_mount_via_newns src dest temp | |
$ touch src/foo | |
$ ls {src,dest,temp}/* | |
dest/foo src/foo temp/foo | |
$ touch dest/bar temp/bar | |
touch: cannot touch 'dest/bar': Read-only file system | |
touch: cannot touch 'temp/bar': Read-only file system | |
$ mount -o rw,remount temp | |
$ mount -o rw,remount dest | |
mount: /tmp/tmp.GnXKyj5evn/dest: permission denied. | |
References: | |
[1] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2069 | |
[2] | |
https://lore.kernel.org/linux-fsdevel/[email protected] | |
[3] https://elixir.bootlin.com/linux/v6.4.9/source/fs/namespace.c#L2561 | |
*/
#include <assert.h> | |
#include <errno.h> | |
#include <fcntl.h> | |
#include <linux/types.h> | |
#include <sched.h> | |
#include <signal.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <sys/mount.h> | |
#include <sys/syscall.h> | |
#include <sys/types.h> | |
#include <sys/wait.h> | |
#include <unistd.h> | |
namespace {

// Raw-syscall wrapper for `pidfd_open(2)`: asks the kernel for a file
// descriptor referring to process `pid`. `flags` is passed through
// unchanged. Returns the syscall's result narrowed to `int`; following the
// `syscall(2)` convention that is -1 with `errno` set on failure.
int pidfd_open(pid_t pid, unsigned int flags) noexcept {
  // `syscall` returns `long`; make the narrowing to `int` explicit.
  return static_cast<int>(::syscall(SYS_pidfd_open, pid, flags));
}

// Raw-syscall wrapper for `pidfd_getfd(2)`: asks the kernel to duplicate
// descriptor `targetfd` of the process referred to by `pidfd` into the
// calling process. `flags` is passed through unchanged. Returns the
// syscall's result narrowed to `int`; -1 with `errno` set on failure.
int pidfd_getfd(int pidfd, int targetfd, unsigned int flags) noexcept {
  return static_cast<int>(
      ::syscall(SYS_pidfd_getfd, pidfd, targetfd, flags));
}

} // namespace
// Evaluates expression `x` exactly once and yields its value. If that value
// equals -1, prints `perror` output labelled with the expression's source
// text and terminates the process immediately via `_exit` with the negated
// `errno` as status (the parent only observes its low 8 bits).
//
// `errno` is captured BEFORE calling `perror`, because `perror` itself may
// modify `errno` if writing to stderr fails.
//
// The expansion is a by-reference lambda, so the macro can only be used
// inside a function body. The lambda's locals carry an `errExit` prefix so
// they are unlikely to shadow a name used inside `x`.
#define ERR_EXIT(x)                          \
  ([&]() {                                   \
    auto errExitRet = (x);                   \
    if (errExitRet == -1) {                  \
      const int errExitSavedErrno = errno;   \
      perror(#x);                            \
      _exit(-errExitSavedErrno);             \
    }                                        \
    return errExitRet;                       \
  }())
int main(int argc, char** argv) { | |
if (argc < 4) { | |
fprintf(stderr, "Usage: %s src dest temp\n", argv[0]); | |
return 1; | |
} | |
const char* src = argv[1]; | |
const char* dest = argv[2]; | |
const char* temp = argv[3]; | |
// Set up an read-only bind-mount of `src` and attach it to `temp` | |
auto treeFd = ERR_EXIT(open_tree(AT_FDCWD, src, OPEN_TREE_CLONE)); | |
struct mount_attr attr { .attr_set = MOUNT_ATTR_RDONLY }; | |
ERR_EXIT(mount_setattr( | |
treeFd, "", AT_EMPTY_PATH, &attr, sizeof(struct mount_attr))); | |
ERR_EXIT(move_mount( | |
treeFd, "", AT_FDCWD, temp, MOVE_MOUNT_F_EMPTY_PATH)); | |
// `fork` child writes `kSigil` to the pipe once its FD 0 is a locked tree | |
const char kSigil[] = "DoNeSiGiL"; | |
int pipeFds[2]; | |
ERR_EXIT(pipe2(pipeFds, O_CLOEXEC)); | |
// The child is the "Helper" described in the file docblock. | |
pid_t child = ERR_EXIT(fork()); | |
if (child == 0) { | |
close(pipeFds[0]); // Below, we use `EPIPE`-on-write as a signal to exit. | |
// CRUCIAL: As we clone the mount namespace, all pre-existing mounts | |
// (including `temp` above)` become locked. | |
ERR_EXIT(unshare(CLONE_NEWUSER | CLONE_NEWNS)); | |
auto lockedTreeFd = ERR_EXIT(open_tree(AT_FDCWD, temp, OPEN_TREE_CLONE)); | |
// Put the unattached mount at FD 0, and tell parent to `pidfd_getfd` it. | |
ERR_EXIT(dup2(lockedTreeFd, 0)); | |
if (write(pipeFds[1], kSigil, sizeof(kSigil)) != sizeof(kSigil)) { | |
perror("write kSigil"); | |
_exit(1); | |
} | |
// Parent will close the pipe after grabbing FD 0. Waiting for EPIPE | |
// guarantees the child exits even if the parent crashes. | |
while (true) { | |
sleep(1); | |
if (-1 == write(pipeFds[1], kSigil, 1) && errno == EPIPE) { | |
break; | |
} | |
} | |
} | |
// Wait for child to prepare mount FD | |
char readSigil[sizeof(kSigil)]; | |
if (read(pipeFds[0], readSigil, sizeof(readSigil)) != sizeof(kSigil) || | |
0 != strncmp(kSigil, readSigil, sizeof(kSigil))) { | |
perror("read kSigil"); | |
return 1; | |
} | |
// Retrieve child FD 0, which is the new, now-locked, tree | |
auto pidFd = ERR_EXIT(pidfd_open(child, 0)); | |
auto lockedFd = ERR_EXIT(pidfd_getfd(pidFd, 0, 0)); | |
// Reap child | |
close(pipeFds[0]); | |
kill(child, SIGKILL); | |
waitpid(child, nullptr, 0); | |
ERR_EXIT(move_mount(lockedFd, "", AT_FDCWD, dest, MOVE_MOUNT_F_EMPTY_PATH)); | |
struct mount_attr clearRdOnly = { .attr_clr = MOUNT_ATTR_RDONLY }; | |
assert(mount_setattr(AT_FDCWD, dest, 0, &clearRdOnly, sizeof(clearRdOnly)) == -1); | |
assert(errno == EPERM); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Revision 1 works, but has a wrong explanation of how it works. Ignore!