-
-
Save marcan/a2eafd605d3d6ac76eb10a7c64f736c3 to your computer and use it in GitHub Desktop.
#!/boot/bzImage
# Linux kernel userspace initialization code, translated to bash
# (Minus floppy disk handling, because seriously, it's 2017.)
# Not 100% accurate, but gives you a good idea of how kernel init works
# GPLv2, Copyright 2017 Hector Martin <[email protected]>
# Based on Linux 4.10-rc2.
# Note: pretend chroot is a builtin and affects the current process
# Note: kernel actually uses major/minor device numbers instead of device name
# strings in a few places, but I simplified it by using strings
# everywhere even though that is not completely accurate.
#######################################
# Print a message and then halt: this function never returns.
# Arguments: message words; they are joined by the first character of IFS
#            (a space by default) into one line
# Outputs:   the message on stdout
#######################################
panic() {
  # printf instead of echo so a message such as "-n" is printed literally.
  printf '%s\n' "$*"
  # Idle forever in one-second sleeps.
  while :; do
    sleep 1
  done
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts.c#L363
#######################################
# Mount one device on /root using one filesystem type, then cd into /root.
# Globals:   rootflags (read)    - appended to the mount command line
#            major, minor (read) - only printed; not assigned in this function
# Arguments: $1 - device to mount
#            $2 - filesystem type
# Outputs:   a "VFS: Mounted root ..." line on stdout after a successful mount
# Returns:   mount's exit status if the mount fails
#######################################
do_mount_root() {
  # rootflags is left unquoted on purpose so it can expand to zero or more words.
  # shellcheck disable=SC2086
  mount -t "$2" "$1" /root $rootflags || return
  # The result of cd is deliberately not checked, as in the original block.
  cd /root
  echo "VFS: Mounted root ($2 filesystem) on device $major:$minor"
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts.c#L381
#######################################
# Try each candidate filesystem type in turn until one mounts the device.
# Globals:   rootfstype (read/written)  - comma-separated list of types; when
#                                         empty it is set to
#                                         built_in_filesystem_types
#            built_in_filesystem_types (read)
#            root_device_name (read)    - only used in the error message
# Arguments: $1 - device to mount
# Outputs:   diagnostics on stdout before a panic
# Returns:   0 as soon as do_mount_root succeeds; every other path ends in
#            panic
#######################################
mount_block_root() {
  local fs ret
  if [ -z "$rootfstype" ]; then
    rootfstype=$built_in_filesystem_types
  fi
  # Unquoted on purpose: the comma-separated list is split into words.
  for fs in ${rootfstype//,/ }; do
    do_mount_root "$1" "$fs"
    ret=$?
    case $ret in
      0)
        # Mounted: stop here instead of falling into the error arm below.
        return 0
        ;;
      13 | 22) # EACCES or EINVAL
        # Wrong filesystem type for this device; try the next one.
        ;;
      *)
        echo "VFS: Cannot open root device \"$root_device_name\" or $1: error $ret"
        echo "Please append a correct \"root=\" boot option; here are the available partitions:"
        printk_all_partitions
        panic "VFS: Unable to mount root fs on $1"
        ;;
    esac
  done
  # Every candidate type was rejected.
  echo "List of all partitions:"
  printk_all_partitions
  echo "No filesystem could mount root, tried: ${rootfstype//,/ }"
  panic "VFS: Unable to mount root fs on $1"
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts.c#L512
#######################################
# Mount the root filesystem selected by $root.
# Globals:   root (read/written) - root device; rewritten to /dev/fd0 when the
#                                  NFS attempt fails
# Outputs:   status messages on stdout
# Returns:   0 right after a successful mount_nfs_root; otherwise the status
#            of mount_block_root
#######################################
mount_root() {
  if [ "$root" = "/dev/nfs" ]; then
    mount_nfs_root && return
    echo "VFS: Unable to mount root fs via NFS, trying floppy."
    root=/dev/fd0
  fi
  if [ "$root" = "/dev/fd0" ]; then
    # floppy switching nonsense
    : # no-op: a `then` body holding only a comment is a syntax error
  fi
  # This is really a mknod, as the kernel is working with the device number
  cp -a "$root" /dev/root || echo "Failed to create /dev/root: $?"
  mount_block_root /dev/root
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts_rd.c#L185
#######################################
# Copy an image file into /dev/ram, gunzipping it when possible.
# Arguments: $1 - path of the image file
# Returns:   0 if gzip succeeds; otherwise the status of the plain cat
#######################################
rd_load_image() {
  # Supports more compression algorithms in practice
  # $1 is quoted so a path with spaces neither splits nor makes the
  # redirection ambiguous. If gzip cannot decode the input, the raw bytes
  # are copied instead.
  gzip -d <"$1" >/dev/ram || cat "$1" >/dev/ram
  # Bunch of nonsense special casing for floppies skipped
  # Everyone but S/390 gets a cute spinner here...
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts_initrd.c#L119
#######################################
# Load /initrd.image into /dev/ram and, unless root is /dev/ram0, run the
# legacy /linuxrc sequence from it and then mount the real root.
# Globals:   root (read)
# Outputs:   progress messages on stdout
# Returns:   0 when the legacy sequence ran;
#            1 when the image could not be loaded or root is /dev/ram0
#######################################
initrd_load() {
  local ret
  mknod /dev/ram b 1 0
  if rd_load_image /initrd.image && [ "$root" != "/dev/ram0" ]; then
    ## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts_initrd.c#L51
    # This is the deprecated "change_root" mechanism; see Documentation/initrd.txt for details.
    # In this mode, the initrd should contain /linuxrc and it is *not* responsible for mounting the rootfs.
    rm /initrd.image
    mknod /dev/root.old b 1 0
    # mount initrd on rootfs' /root
    mount_block_root /dev/root.old
    mkdir /old
    cd /old
    # try loading default modules from initrd
    load_default_modules
    (
      # Subshell: the redirections, cd and chroot below stay confined to the
      # /linuxrc run. The console is opened read-write (<>) because stdout and
      # stderr are duplicated from fd 0 and must be writable.
      exec 0<>/dev/console 1>&0 2>&0
      cd /root
      mount --move . /
      chroot .
      setsid /linuxrc
    )
    # move initrd to rootfs' /old
    mount --move .. .
    # switch root and cwd back to / of rootfs
    chroot ..
    cd /
    mount_root
    printf '%s' "Trying to move old root to /initrd ... "
    mount --move /old /root/initrd
    ret=$?
    if [ "$ret" -eq 0 ]; then
      echo "okay"
    else
      if [ "$ret" -eq 2 ]; then # ENOENT
        echo "/initrd does not exist. Ignored."
      else
        echo "failed"
      fi
      # The old root could not be kept around: detach it and release its memory.
      echo "Unmounting old root"
      umount -l /old
      printf '%s' "Trying to free ramdisk memory ... "
      if blockdev --flushbufs /dev/root.old; then
        echo "okay"
      else
        echo "failed"
      fi
    fi
    return 0
  else
    # Otherwise, if root=/dev/ram0, this is the "new" "pivot_root" initrd mechanism.
    # The initrd is just mounted like any other root FS and $init is called in it.
    # See Documentation/initrd.txt for what the initrd has to do in this case.
    # Note that this is obsolete too in the more recent initramfs case.
    rm /initrd.image
    return 1
  fi
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts.c#L549
#######################################
# Mount the real root filesystem and make it the root of the current process.
# Globals:   rootdelay (read)           - if non-empty, passed to sleep first
#            root (read)                - root device name
#            root_wait (read)           - if non-empty, wait for $root to exist
#            root_device_name (written) - $root, with a leading /dev/ removed
#                                         except for mtd*/ubi* names
# Outputs:   progress messages on stdout
# Returns:   the status of the final chroot
#######################################
prepare_namespace() {
  if [ -n "$rootdelay" ]; then
    echo "Waiting $rootdelay sec before mounting root device..."
    sleep "$rootdelay"
  fi
  wait_for_device_probe # wait for devices
  md_run_setup # md-raid autoconfig: https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts_md.c#L303
  if [ -n "$root" ]; then
    root_device_name="$root"
    case "$root" in
      mtd* | ubi*)
        # These names are mounted directly, skipping the initrd and
        # wait-for-device steps; the tail of the function is repeated here.
        mount_block_root "$root"
        mount -t devtmpfs devtmpfs dev # only if CONFIG_DEVTMPFS_MOUNT
        mount --move . /
        chroot .
        return
        ;;
    esac
    root_device_name="${root##/dev/}"
  fi
  if ! initrd_load; then
    if [ -n "$root_wait" ]; then
      echo "Waiting for root device $root..."
      # Poll once a second until probing has finished and $root exists.
      while ! driver_probe_done || [ ! -e "$root" ]; do
        sleep 1
      done
    fi
    mount_root
  fi
  mount -t devtmpfs devtmpfs dev # only if CONFIG_DEVTMPFS_MOUNT
  mount --move . /
  chroot .
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/initramfs.c#L608
#######################################
# Fill the initial rootfs: create the default entries, unpack the built-in
# initramfs, then try to unpack /dev/initrd as an initramfs and fall back to
# saving it as /initrd.image.
# Outputs:   progress messages on stdout
#######################################
populate_rootfs() {
  ## https://github.com/torvalds/linux/blob/v4.10-rc2/scripts/gen_initramfs_list.sh#L50
  ## OR (if initramfs disabled): https://github.com/torvalds/linux/blob/v4.10-rc2/init/noinitramfs.c#L28
  # default initramfs
  cd /
  mkdir /dev
  mknod /dev/console c 5 1
  mkdir /root
  # additional kernel built-in initramfs contents (not a real device)
  cpio -i </dev/internal_initramfs
  # note: /dev/initrd isn't a real device but represents the initrd memory
  # /initrd.image is a real file on rootfs
  if [ -e /dev/initrd ]; then
    echo "Trying to unpack rootfs image as initramfs..."
    # actual kernel code for cpio can deal with compression & concatenation
    if ! cpio -i </dev/initrd; then
      # Not a cpio archive: keep the raw image as a file for initrd_load.
      echo "rootfs image is not an initramfs; looks like an initrd"
      cp /dev/initrd /initrd.image
    fi
    free_initrd # gets rid of /dev/initrd: https://github.com/torvalds/linux/blob/v4.10-rc2/init/initramfs.c#L527
    # Try loading default modules from initramfs. This gives
    # us a chance to load before device_initcalls.
    load_default_modules
  fi
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/main.c#L952
#######################################
# Top-level init sequence: populate rootfs, open the console, mount the real
# root when the initramfs has no $rdinit, then exec the first init that works.
# Globals:   rdinit (read/written) - early userspace init; defaults to /init
#                                    and is cleared when that path is missing
#            init (read)           - explicitly requested init program
# Outputs:   warnings and failure messages on stdout
# Returns:   does not return normally: it either execs an init or panics
# Side effect: enables the execfail shell option, which stays enabled
#######################################
kernel_init() {
  # A failed exec terminates a non-interactive bash unless execfail is set;
  # without it none of the failure messages or fallbacks below could run.
  shopt -s execfail
  ## https://github.com/torvalds/linux/blob/v4.10-rc2/init/main.c#L999
  early_kernel_init
  # Note: at this point, as part of basic VFS init, a rootfs (special tmpfs) is mounted at /
  ## this is an initcall, called here: https://github.com/torvalds/linux/blob/v4.10-rc2/init/main.c#L873
  ## declared here: https://github.com/torvalds/linux/blob/v4.10-rc2/init/initramfs.c#L658
  populate_rootfs
  more_kernel_init
  # Open the /dev/console on the rootfs, this should never fail.
  # Opened read-write (<>) because stdout and stderr are duplicated from fd 0
  # and must be writable.
  exec 0<>/dev/console 1>&0 2>&0 || echo "Warning: unable to open an initial console."
  # check if there is an early userspace init. If yes, let it do all the work
  if [ -z "$rdinit" ]; then
    rdinit=/init
  fi
  if [ ! -e "$rdinit" ]; then
    rdinit=
    # Mount root, the whole shebang.
    # Only done if there is *no* $rdinit (/init) in the initramfs!
    prepare_namespace
  fi
  # Ok, we have completed the initial bootup, and
  # we're essentially up and running. Get rid of the
  # initmem segments and start the user-mode stuff..
  #
  # rootfs is available now, try loading the public keys
  # and default modules
  integrity_load_keys
  load_default_modules
  late_kernel_init
  if [ -n "$rdinit" ]; then
    # If present in the initramfs, $rdinit (/init) is responsible
    # for *everything*, and this is the modern way of doing things.
    # To find out what $rdinit has to do in that case, read
    # Documentation/filesystems/ramfs-rootfs-initramfs.txt
    exec "$rdinit"
    echo "Failed to execute $rdinit (error $?)"
  fi
  if [ -n "$init" ]; then
    # This could be the real /sbin/init, or an initrd /sbin/init.
    exec "$init"
    # An explicitly requested init that cannot run is fatal (the kernel
    # panics here); it must not silently fall back to the defaults.
    panic "Requested init $init failed (error $?)."
  fi
  # Default search order; each exec only returns if it failed.
  exec /sbin/init || exec /etc/init || exec /bin/init || exec /bin/sh
  panic "No working init found. Try passing init= option to kernel. See Linux Documentation/admin-guide/init.rst for guidance."
}
# Entry point: run the init sequence defined above.
kernel_init
@copumpkin good point, I added some links back to the functions :)
This is valuable teaching. Thanks for that!
btw the chroot functionality you're assuming is actually a thing (called `pivotroot`) and used by early inits to mount the real root after running the initrd.
@nonchip not quite. `pivot_root` is a separate system call that affects the current mount namespace and all processes sharing it, while `chroot` only affects the current process. `pivot_root` is usually used in conjunction with `chroot` to ensure that the current working directory and root are correctly set. When I write `chroot` above I really do mean the good old `chroot()` system call. The problem is that it needs to affect the current process (the hypothetical shell, i.e. it needs to be built-in) while the traditional UNIX `chroot` command spawns a subprocess/subshell.
See https://github.com/torvalds/linux/blob/v4.10-rc2/fs/namespace.c#L3035 for more details on what exactly `pivot_root` does. It's very different from `chroot` (and it also only works on initrd/regular mounts, not on rootfs).
Nice! It might be interesting to link back from comments here to relevant parts of the actual source, so people can follow the correspondence more directly. Might also be a lot of work though 😄