-
-
Save marcan/a2eafd605d3d6ac76eb10a7c64f736c3 to your computer and use it in GitHub Desktop.
#!/boot/bzImage
# Linux kernel userspace initialization code, translated to bash
# (Minus floppy disk handling, because seriously, it's 2017.)
# Not 100% accurate, but gives you a good idea of how kernel init works
# GPLv2, Copyright 2017 Hector Martin <[email protected]>
# Based on Linux 4.10-rc2.
# Note: pretend chroot is a builtin and affects the current process
# Note: kernel actually uses major/minor device numbers instead of device name
# strings in a few places, but I simplified it by using strings
# everywhere even though that is not completely accurate.
#######################################
# Print a fatal message and then hang forever.
# Arguments: the message words; they are joined with spaces into one line.
# Outputs:   the message on stdout.
# Returns:   never; loops on `sleep 1` after printing.
#######################################
panic() {
  # printf instead of echo so a message such as "-n" is printed verbatim.
  printf '%s\n' "$*"
  while true; do
    sleep 1
  done
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts.c#L363 | |
#######################################
# Mount a device on /root with one specific filesystem type.
# Globals:   rootflags (read), major, minor (read, for the log line)
# Arguments: $1 - device to mount
#            $2 - filesystem type to try
# Outputs:   a "VFS: Mounted root" line on stdout when the mount succeeds.
# Returns:   the status of `mount` if it fails, otherwise 0.
#######################################
do_mount_root() {
  local dev=$1
  local fstype=$2
  # $rootflags stays unquoted on purpose: when it is empty it must contribute
  # no argument at all to mount.
  # shellcheck disable=SC2086
  mount -t "$fstype" "$dev" /root $rootflags || return $?
  # The result of cd is deliberately not checked (the original did not either).
  cd /root
  echo "VFS: Mounted root ($fstype filesystem) on device $major:$minor"
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts.c#L381 | |
#######################################
# Try to mount a block device on /root, trying each candidate filesystem type.
# Globals:   rootfstype (read; set from built_in_filesystem_types when empty),
#            built_in_filesystem_types (read), root_device_name (read)
# Arguments: $1 - device to mount
# Outputs:   diagnostics on stdout when mounting fails.
# Returns:   0 as soon as one filesystem type mounts; otherwise calls panic.
#######################################
mount_block_root() {
  local name=$1
  local fs ret
  if [[ -z "$rootfstype" ]]; then
    rootfstype=$built_in_filesystem_types
  fi
  # Unquoted on purpose: the comma-separated list is split into words.
  for fs in ${rootfstype//,/ }; do
    do_mount_root "$name" "$fs"
    ret=$?
    case "$ret" in
      0)
        # Mounted successfully: stop here instead of falling into the
        # error arm below.
        return 0
        ;;
      13|22) # EACCES or EINVAL
        # Wrong filesystem type for this device; try the next one.
        ;;
      *)
        echo "VFS: Cannot open root device \"$root_device_name\" or $name: error $ret"
        echo "Please append a correct \"root=\" boot option; here are the available partitions:"
        printk_all_partitions
        panic "VFS: Unable to mount root fs on $name"
        ;;
    esac
  done
  echo "List of all partitions:"
  printk_all_partitions
  echo "No filesystem could mount root, tried: ${rootfstype//,/ }"
  panic "VFS: Unable to mount root fs on $name"
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts.c#L512 | |
#######################################
# Mount the root filesystem named by $root.
# Globals:   root (read; rewritten to /dev/fd0 when the NFS mount fails)
# Arguments: none
# Outputs:   diagnostics on stdout.
# Returns:   0 if the NFS root was mounted, otherwise whatever
#            mount_block_root returns.
#######################################
mount_root() {
  if [[ "$root" == "/dev/nfs" ]]; then
    mount_nfs_root && return
    echo "VFS: Unable to mount root fs via NFS, trying floppy."
    root=/dev/fd0
  fi
  if [[ "$root" == "/dev/fd0" ]]; then
    # floppy switching nonsense
    : # explicit no-op: a branch holding only a comment is a syntax error
  fi
  # This is really a mknod, as the kernel is working with the device number
  cp -a "$root" /dev/root || echo "Failed to create /dev/root: $?"
  mount_block_root /dev/root
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts_rd.c#L185 | |
#######################################
# Copy a ramdisk image into /dev/ram, decompressing it when it is gzip data.
# Arguments: $1 - path of the image file
# Returns:   0 if either the gzip path or the plain copy succeeds, otherwise
#            the status of the failed `cat`.
#######################################
rd_load_image() {
  local image=$1
  # Supports more compression algorithms in practice
  # If gzip rejects the input, fall back to copying the image verbatim.
  gzip -d <"$image" >/dev/ram || cat "$image" >/dev/ram
  # Bunch of nonsense special casing for floppies skipped
  # Everyone but S/390 gets a cute spinner here...
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts_initrd.c#L119 | |
#######################################
# Load /initrd.image into a ramdisk and, for the legacy mechanism, run it.
# Globals:   root (read)
# Arguments: none
# Outputs:   progress messages on stdout.
# Returns:   0 when the legacy "change_root" path ran (the real root has
#            already been mounted via mount_root); 1 when the caller still
#            has to mount the root filesystem itself.
#######################################
initrd_load() {
  local ret
  mknod /dev/ram b 1 0
  if rd_load_image /initrd.image && [[ "$root" != "/dev/ram0" ]]; then
    ## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts_initrd.c#L51
    # This is the deprecated "change_root" mechanism; see Documentation/initrd.txt for details.
    # In this mode, the initrd should contain /linuxrc and it is *not* responsible for mounting the rootfs.
    rm /initrd.image
    mknod /dev/root.old b 1 0
    # mount initrd on rootfs' /root
    mount_block_root /dev/root.old
    mkdir /old
    cd /old
    # try loading default modules from initrd
    load_default_modules
    # Subshell: /linuxrc runs with the initrd as its root without changing
    # this process' own root or working directory.
    (
      exec </dev/console >&0 2>&0
      cd /root
      mount --move . /
      chroot .
      setsid /linuxrc
    )
    # move initrd to rootfs' /old
    mount --move .. .
    # switch root and cwd back to / of rootfs
    chroot ..
    cd /
    mount_root
    printf '%s' "Trying to move old root to /initrd ... "
    mount --move /old /root/initrd
    ret=$?
    if [[ "$ret" == 0 ]]; then
      echo "okay"
    else
      if [[ "$ret" == 2 ]]; then # ENOENT
        echo "/initrd does not exist. Ignored."
      else
        echo "failed"
      fi
      echo "Unmounting old root"
      umount -l /old
      printf '%s' "Trying to free ramdisk memory ... "
      if blockdev --flushbufs /dev/root.old; then
        echo "okay"
      else
        echo "failed"
      fi
    fi
    return 0
  else
    # Otherwise, if root=/dev/ram0, this is the "new" "pivot_root" initrd mechanism.
    # The initrd is just mounted like any other root FS and $init is called in it.
    # See Documentation/initrd.txt for what the initrd has to do in this case.
    # Note that this is obsolete too in the more recent initramfs case.
    rm /initrd.image
    return 1
  fi
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts.c#L549 | |
#######################################
# Mount the real root filesystem and make it this process' root.
# Globals:   rootdelay, root, root_wait (read); root_device_name (written)
# Arguments: none
# Outputs:   progress messages on stdout.
# Returns:   the status of the final `chroot .`.
#######################################
prepare_namespace() {
  if [[ -n "$rootdelay" ]]; then
    echo "Waiting $rootdelay sec before mounting root device..."
    sleep "$rootdelay"
  fi
  wait_for_device_probe # wait for devices
  md_run_setup # md-raid autoconfig: https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts_md.c#L303
  if [[ -n "$root" ]]; then
    root_device_name="$root"
    case "$root" in
      mtd*|ubi*)
        # These names are handed to mount_block_root as-is; the initrd and
        # /dev/root steps below are skipped.
        mount_block_root "$root"
        mount -t devtmpfs devtmpfs dev # only if CONFIG_DEVTMPFS_MOUNT
        mount --move . /
        chroot .
        return
        ;;
    esac
    root_device_name="${root##/dev/}"
  fi
  if ! initrd_load; then
    if [[ -n "$root_wait" ]]; then
      echo "Waiting for root device $root..."
      # Quoted so that an empty $root is tested as a (missing) path.
      while ! driver_probe_done || [[ ! -e "$root" ]]; do
        sleep 1
      done
    fi
    mount_root
  fi
  mount -t devtmpfs devtmpfs dev # only if CONFIG_DEVTMPFS_MOUNT
  mount --move . /
  chroot .
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/initramfs.c#L608 | |
#######################################
# Fill the initial rootfs: default nodes, the built-in initramfs, and the
# externally supplied initrd/initramfs image if one is present.
# Arguments: none
# Outputs:   progress messages on stdout.
# Returns:   the status of the last command that ran.
#######################################
populate_rootfs() {
  ## https://github.com/torvalds/linux/blob/v4.10-rc2/scripts/gen_initramfs_list.sh#L50
  ## OR (if initramfs disabled): https://github.com/torvalds/linux/blob/v4.10-rc2/init/noinitramfs.c#L28
  # default initramfs
  cd /
  mkdir /dev
  mknod /dev/console c 5 1
  mkdir /root
  # additional kernel built-in initramfs contents (not a real device)
  cpio -i < /dev/internal_initramfs
  # note: /dev/initrd isn't a real device but represents the initrd memory
  # /initrd.image is a real file on rootfs
  if [[ -e /dev/initrd ]]; then
    echo "Trying to unpack rootfs image as initramfs..."
    # actual kernel code for cpio can deal with compression & concatenation
    if ! cpio -i < /dev/initrd; then
      # Not a cpio archive: keep the raw image as a file on rootfs.
      echo "rootfs image is not an initramfs; looks like an initrd"
      cp /dev/initrd /initrd.image
    fi
    free_initrd # gets rid of /dev/initrd: https://github.com/torvalds/linux/blob/v4.10-rc2/init/initramfs.c#L527
    # Try loading default modules from initramfs. This gives
    # us a chance to load before device_initcalls.
    load_default_modules
  fi
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/main.c#L952 | |
#######################################
# Bring up userspace: populate rootfs, mount the real root when the initramfs
# has no init of its own, then exec the first init program that works.
# Globals:   rdinit (read; defaulted to /init, cleared when it does not exist),
#            init (read)
# Arguments: none
# Outputs:   diagnostics on stdout.
# Returns:   never on success (the process is replaced by init); otherwise
#            ends in panic.
#######################################
kernel_init() {
  # Without execfail a non-interactive bash exits as soon as an exec target
  # cannot be run, so none of the error messages or fallbacks below would
  # ever be reached.
  shopt -s execfail
  ## https://github.com/torvalds/linux/blob/v4.10-rc2/init/main.c#L999
  early_kernel_init
  # Note: at this point, as part of basic VFS init, a rootfs (special tmpfs) is mounted at /
  ## this is an initcall, called here: https://github.com/torvalds/linux/blob/v4.10-rc2/init/main.c#L873
  ## declared here: https://github.com/torvalds/linux/blob/v4.10-rc2/init/initramfs.c#L658
  populate_rootfs
  more_kernel_init
  # Open the /dev/console on the rootfs, this should never fail
  exec </dev/console >&0 2>&0 || echo "Warning: unable to open an initial console."
  # check if there is an early userspace init. If yes, let it do all the work
  if [[ -z "$rdinit" ]]; then
    rdinit=/init
  fi
  if [[ ! -e "$rdinit" ]]; then
    rdinit=
    # Mount root, the whole shebang.
    # Only done if there is *no* $rdinit (/init) in the initramfs!
    prepare_namespace
  fi
  # Ok, we have completed the initial bootup, and
  # we're essentially up and running. Get rid of the
  # initmem segments and start the user-mode stuff..
  #
  # rootfs is available now, try loading the public keys
  # and default modules
  integrity_load_keys
  load_default_modules
  late_kernel_init
  if [[ -n "$rdinit" ]]; then
    # If present in the initramfs, $rdinit (/init) is responsible
    # for *everything*, and this is the modern way of doing things.
    # To find out what $rdinit has to do in that case, read
    # Documentation/filesystems/ramfs-rootfs-initramfs.txt
    exec "$rdinit"
    echo "Failed to execute $rdinit (error $?)"
  fi
  if [[ -n "$init" ]]; then
    # This could be the real /sbin/init, or an initrd /sbin/init.
    exec "$init"
    # An explicitly requested init that cannot be run is fatal: do not fall
    # through to the default candidates.
    # NOTE(review): matches the kernel's panic() for this message in
    # init/main.c as far as I recall; confirm against v4.10-rc2.
    panic "Requested init $init failed (error $?)"
  fi
  exec /sbin/init || exec /etc/init || exec /bin/init || exec /bin/sh
  panic "No working init found. Try passing init= option to kernel. See Linux Documentation/admin-guide/init.rst for guidance."
}
# Entry point: run the init sequence defined above.
kernel_init
btw the chroot functionality you're assuming is actually a thing (called `pivotroot`) and used by early inits to mount the real root after running the initrd.
@nonchip not quite. `pivot_root` is a separate system call that affects the current mount namespace and all processes sharing it, while `chroot` only affects the current process. `pivot_root` is usually used in conjunction with `chroot` to ensure that the current working directory and root are correctly set. When I write `chroot` above I really do mean the good old `chroot()` system call. The problem is that it needs to affect the current process (the hypothetical shell, i.e. it needs to be built-in) while the traditional UNIX `chroot` command spawns a subprocess/subshell.
See https://github.com/torvalds/linux/blob/v4.10-rc2/fs/namespace.c#L3035 for more details on what exactly `pivot_root` does. It's very different from `chroot` (and it also only works on initrd/regular mounts, not on rootfs).
This is valuable teaching. Thanks for that!