Skip to content

Instantly share code, notes, and snippets.

@ilammy
Created April 5, 2015 18:31
Show Gist options
  • Save ilammy/f39b7366f9dd2f15479d to your computer and use it in GitHub Desktop.
Save ilammy/f39b7366f9dd2f15479d to your computer and use it in GitHub Desktop.
Linux kernel system call table hooking
#include <asm/uaccess.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/version.h>
#include "locate_sct.h"
#include "ttgl.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("ilammy <[email protected]>");
MODULE_DESCRIPTION("Locates and patches the system call table to hook execve() "
"and install additional customizable restrictions on the "
"processes that can be launched from userland.");
/*
** Build-time sanity checks for architecture and kernel version. They are
** currently compiled out by the enclosing `#if 0`.
**
** NOTE(review): ARCH_X86/ARCH_X64 and LINUX_KERNEL_VERSION do not look like
** macros provided by the kernel headers (<linux/version.h> normally offers
** LINUX_VERSION_CODE) -- confirm the names before re-enabling these checks.
*/
#if 0
#if !(defined(ARCH_X86) || defined(ARCH_X64))
# error "Only x86(_64) kernel architectures are supported"
#endif
#if LINUX_KERNEL_VERSION != KERNEL_VERSION(3,2,0)
# error "Only Linux kernel 3.2.0 is supported"
#endif
#endif
/*
** Debugging helper: print `count` bytes starting at `bytes` to the kernel log
** as upper-case hex, 16 bytes per line.
**
** Every row is formatted into a local buffer and emitted with a single
** printk() that carries its own log level. Printing byte by byte with
** level-less continuation printk() calls is not reliable: newer kernels turn
** each such fragment into a separate log record.
*/
static
void hex_dump(unsigned char *bytes, size_t count)
{
    static const char hex_digits[] = "0123456789ABCDEF";
    char row[16 * 3 + 1]; /* "XX " per byte plus the terminating NUL */
    size_t offset;

    printk(KERN_INFO "Dumping %zu bytes at %p:\n", count, bytes);

    for (offset = 0; offset < count; offset += 16)
    {
        size_t in_row = count - offset;
        size_t used = 0;
        size_t k;

        if (in_row > 16)
        {
            in_row = 16;
        }
        for (k = 0; k < in_row; k++)
        {
            unsigned char byte = bytes[offset + k];

            row[used++] = hex_digits[byte >> 4];
            row[used++] = hex_digits[byte & 0x0F];
            row[used++] = ' ';
        }
        row[used] = '\0';

        printk(KERN_INFO " %s\n", row);
    }
}
/*
** Replacement handler installed into the system call table in place of
** sys_execve(). Logs the name of the program being executed and then forwards
** the call, with unchanged arguments, to the real sys_execve().
**
** @param filename  user-space pointer to the path of the program
** @param argv      user-space argument vector, passed through untouched
** @param envp      user-space environment vector, passed through untouched
**
** @return whatever sys_execve() returns.
*/
long hijacked_sys_execve(const char __user *filename,
                         const char __user *const __user *argv,
                         const char __user *const __user *envp)
{
    char buffer[256];
    long copied;

    /*
    ** strncpy_from_user() does not NUL-terminate the destination when the
    ** source is longer than the limit, and leaves it unusable on a fault.
    ** Reserve one byte for the terminator and only print what was copied,
    ** so that stale kernel stack contents never reach the log.
    */
    copied = strncpy_from_user(buffer, filename, sizeof(buffer) - 1);
    if (copied < 0)
    {
        printk(KERN_INFO "hijacked_sys_execve: cannot read the filename\n");
    }
    else
    {
        buffer[copied] = '\0';
        printk(KERN_INFO "hijacked_sys_execve: i see what you did there: %s\n",
               buffer);
    }

    /* The real handler reports a bad `filename` pointer by itself. */
    return sys_execve(filename, argv, envp);
}
/*
** Callback for afw_do_with_write_permissions(): installs the hook.
**
** Walks the writeable mapping of the system call table described by
** regions[0] and replaces every slot that points to sys_execve() with
** hijacked_sys_execve().
**
** @param regions       regions[0] must describe the system call table
** @param region_count  number of elements in `regions`
** @param arg           unused
**
** @return 0 on success, -EINVAL if no region was supplied.
*/
static
int print_sct_1(struct gl_region regions[], size_t region_count, void* arg)
{
    const unsigned long original = (unsigned long) sys_execve;
    const unsigned long hook = (unsigned long) hijacked_sys_execve;
    unsigned long *table;
    size_t slots;
    size_t i;

    /* afw_do_with_write_permissions() may call us as fn(NULL, 0, args). */
    if (!regions || region_count == 0)
    {
        return -EINVAL;
    }

    table = regions[0].writeable;
    /* Scan exactly what the caller mapped instead of assuming 256 slots. */
    slots = regions[0].length / sizeof(table[0]);

    for (i = 0; i < slots; i++)
    {
        if (table[i] == original)
        {
            table[i] = hook;
        }
    }
    return 0;
}
/*
** Callback for afw_do_with_write_permissions(): removes the hook.
**
** Walks the writeable mapping of the system call table described by
** regions[0] and points every slot that currently refers to
** hijacked_sys_execve() back to sys_execve().
**
** @param regions       regions[0] must describe the system call table
** @param region_count  number of elements in `regions`
** @param arg           unused
**
** @return 0 on success, -EINVAL if no region was supplied.
*/
static
int print_sct_2(struct gl_region regions[], size_t region_count, void* arg)
{
    const unsigned long original = (unsigned long) sys_execve;
    const unsigned long hook = (unsigned long) hijacked_sys_execve;
    unsigned long *table;
    size_t slots;
    size_t i;

    /* afw_do_with_write_permissions() may call us as fn(NULL, 0, args). */
    if (!regions || region_count == 0)
    {
        return -EINVAL;
    }

    table = regions[0].writeable;
    /* Scan exactly what the caller mapped instead of assuming 256 slots. */
    slots = regions[0].length / sizeof(table[0]);

    for (i = 0; i < slots; i++)
    {
        if (table[i] == hook)
        {
            table[i] = original;
        }
    }
    return 0;
}
/*
** Module entry point: locates the system call table and replaces the
** execve() slot with hijacked_sys_execve().
**
** @return 0 on success, -ENOENT if the system call table could not be
**         located, or the error reported by afw_do_with_write_permissions().
*/
static
int __init afw_init(void)
{
    struct gl_region sys_call_table;
    unsigned long *table;
    int err;

    printk(KERN_INFO "init_module()\n");

    table = afw_locate_sys_call_table();
    printk(KERN_INFO "located sys_call_table: %p\n", table);
#ifdef CONFIG_IA32_EMULATION
    /* This locator is only declared when IA-32 emulation is configured. */
    printk(KERN_INFO "located ia32_sys_call_table: %p\n",
           afw_locate_ia32_sys_call_table());
#endif

    /* The locator returns NULL when the code pattern is not found. */
    if (!table)
    {
        printk(KERN_ERR "afw: failed to locate sys_call_table\n");
        return -ENOENT;
    }

    sys_call_table = (struct gl_region) {
        .source = table,
        .length = 256 * sizeof(unsigned long)
    };

    err = afw_do_with_write_permissions(print_sct_1, &sys_call_table, 1, NULL);
    if (err)
    {
        printk(KERN_ERR "afw: failed to hook execve(): %d\n", err);
        return err;
    }
    return 0;
}
/*
** Module exit point: locates the system call table again and restores the
** original sys_execve() handler.
**
** NOTE(review): nothing here waits for tasks that may still be executing
** inside hijacked_sys_execve() when the module text goes away -- confirm
** whether unloading under load is safe.
*/
static
void __exit afw_exit(void)
{
    struct gl_region sys_call_table;
    unsigned long *table;
    int err;

    printk(KERN_INFO "exit_module()\n");

    /* The locator returns NULL when the code pattern is not found. */
    table = afw_locate_sys_call_table();
    if (!table)
    {
        printk(KERN_ERR "afw: failed to locate sys_call_table, "
                        "execve() hook is NOT removed\n");
        return;
    }

    sys_call_table = (struct gl_region) {
        .source = table,
        .length = 256 * sizeof(unsigned long)
    };

    err = afw_do_with_write_permissions(print_sct_2, &sys_call_table, 1, NULL);
    if (err)
    {
        printk(KERN_ERR "afw: failed to restore execve(): %d\n", err);
    }
}
/* Register afw_init() and afw_exit() as the module's init and exit handlers. */
module_init(afw_init);
module_exit(afw_exit);
#include "locate_sct.h"
#include <asm/desc.h>
#include <asm/desc_defs.h>
#include <asm/irq_vectors.h>
#include <asm/msr.h>
#include <asm/msr-index.h>
/*
** We need to find the address of the sys_call_table. Since 2.6 kernels this
** symbol is no longer exported, so there is no easy way to find it. One way
** is to use "cat /boot/System.map-`uname -r` | grep sys_call_table". However,
** it may be absent or invalid, and just reading files is not fun. So we're
** going the brute-force way. Introspection! The system call handler should
** be somewhere in memory and it obviously uses sys_call_table, so if we
** analyze its code, we can get the address we want.
**
** Actually, on 64-bit x86 systems there are two tables: sys_call_table with
** 64-bit handlers and ia32_sys_call_table with 32-bit handlers for 32-bit
** emulation mode. We need to locate them both.
**
** Also, Linux has two mechanisms for handling system calls on x86 systems:
** the legacy one that uses "int $0x80" to get into ring 0, and more modern
** one which uses special "sysenter"/"syscall" instructions for this. Also
** there is vsyscall mechanism for fast user-level system calls, but it is
** not covered here.
**
** Legacy mechanism uses interrupt traps to obtain control. It is initialized
** by the function trap_init() which can be found in arch/x86/kernel/traps.c.
** The interrupt handler for 0x80 always contains a 32-bit handler.
**
** Modern handler uses model-specific registers (MSRs) to register itself as
** a handler for "sysenter"/"syscall" instructions. It is initialized by the
** function syscall_init() from arch/x86/kernel/cpu/common.c. Native handler
** system_call is loaded to the LSTAR MSR, and IA-32 emulated ones are loaded
** to IA32_SYSENTER_EIP and CSTAR MSRs by the function syscall32_cpu_init()
** from arch/x86/vdso/vdso32-setup.c.
**
** So, to sum it up, to get the 32-bit syscall handler we consult the 0x80
** interrupt handler in the interrupt descriptor table. To get the 64-bit
** syscall handler we consult the LSTAR MSR.
*/
static inline
u8* get_32bit_system_call_handler(void)
{
struct desc_ptr interrupt_descriptor_table;
gate_desc* interrupt_gates;
store_idt(&interrupt_descriptor_table);
interrupt_gates = (gate_desc*) interrupt_descriptor_table.address;
return (u8*) gate_offset(interrupt_gates[IA32_SYSCALL_VECTOR]);
}
static inline
u8* get_64bit_system_call_handler(void)
{
u64 system_call_entry;
rdmsrl(MSR_LSTAR, system_call_entry);
return (u8*) system_call_entry;
}
/*
** Previous functions return pointers to system call handlers. Native system
** call handler is named system_call, its 32-bit implementation is located in
** arch/x86/kernel/entry_32.S and 64-bit one is in arch/x86/kernel/entry_64.S.
** IA-32 emulation handlers are implemented in arch/x86/ia32/ia32entry.S, they
** are named ia32_syscall, ia32_sysenter_target, and ia32_cstar_target.
**
** We are interested in this:
**
** call *sys_call_table(,%rax,8)
** movq %rax,RAX-ARGOFFSET(%rsp)
**
** or this:
**
** call *sys_call_table(,%eax,4)
** movl %eax,PT_EAX(%esp)
**
** These snippets do the actual system call and store its return value.
** sys_call_table is encoded as a 32-bit displacement; on 64-bit kernels it
** has to be sign-extended to 64 bits to obtain the actual address.
** RAX, ARGOFFSET, and PT_EAX are all macros that govern calling conventions,
** they expand into small numbers that fit into one byte.
**
** According to the source, these instructions should be found within the
** first 256 bytes of the handlers. We should look for a call instruction
** followed by a mov instruction. Opcodes are the following:
**
** call disp32(,%eax,4) ==> FF 14 85 -- -- -- --
** movl %eax,disp8(%esp) ==> 89 44 24 --
**
** call disp32(,%rax,8) ==> FF 14 C5 -- -- -- --
** movq %rax,disp8(%rsp) ==> 48 89 44 24 --
*/
/*
** Scan the beginning of a system call handler for the
** "call *disp32(,%reg,N)" + "mov %reg,disp8(%sp)" instruction pair and
** extract the system call table address from the call's displacement.
**
** @param code  start of the handler's machine code; up to 256 start
**              positions are tried, so 266 bytes may be read
**
** @return address of the referenced system call table, or NULL if `code`
**         is NULL or the instruction pair was not found.
*/
static
unsigned long* find_sys_call_table_ref(u8* code)
{
    size_t i;

    if (!code)
    {
        return NULL;
    }

    for (i = 0; i < 256; i++)
    {
        const u8* insn = &code[i];
        u32 disp;

#ifdef CONFIG_X86_64
        /* FF 14 C5 <disp32>   48 89 44 24 <disp8> */
        if (insn[0] != 0xFF || insn[1] != 0x14 || insn[2] != 0xC5 ||
            insn[7] != 0x48 || insn[8] != 0x89 || insn[9] != 0x44 ||
            insn[10] != 0x24)
        {
            continue;
        }
#else
        /* FF 14 85 <disp32>   89 44 24 <disp8> */
        if (insn[0] != 0xFF || insn[1] != 0x14 || insn[2] != 0x85 ||
            insn[7] != 0x89 || insn[8] != 0x44 || insn[9] != 0x24)
        {
            continue;
        }
#endif
        /*
        ** The displacement is stored little-endian and is not aligned
        ** inside the instruction stream, so assemble it byte by byte
        ** rather than dereferencing a casted u32 pointer.
        */
        disp = (u32) insn[3]
             | ((u32) insn[4] << 8)
             | ((u32) insn[5] << 16)
             | ((u32) insn[6] << 24);

        /*
        ** The CPU sign-extends disp32 to the width of an address. Do the
        ** same: this is a no-op on 32-bit kernels and yields the
        ** 0xFFFFFFFFxxxxxxxx form for kernel addresses on 64-bit ones.
        */
        return (unsigned long*) (unsigned long) (long) (int) disp;
    }
    return NULL;
}
/*
** And now, when everything's in place...
*/
unsigned long* afw_locate_sys_call_table(void)
{
#ifdef CONFIG_X86_64
return find_sys_call_table_ref(get_64bit_system_call_handler());
#else
return find_sys_call_table_ref(get_32bit_system_call_handler());
#endif
}
#ifdef CONFIG_IA32_EMULATION
/*
** Locate the IA-32 emulation system call table by looking for the table
** reference in the code of the 32-bit system call handler.
*/
unsigned long* afw_locate_ia32_sys_call_table(void)
{
    u8* handler = get_32bit_system_call_handler();

    return find_sys_call_table_ref(handler);
}
#endif
#ifndef AFW_LOCATE_SCT
#define AFW_LOCATE_SCT
/**
* Locate the native system call table (sys_call_table).
*
* @return
* Returns a pointer to the native system call table,
* or `NULL` in case of failure.
*/
unsigned long* afw_locate_sys_call_table(void);
#ifdef CONFIG_IA32_EMULATION
/**
* Locate the system call table used for IA-32 emulation (ia32_sys_call_table).
*
* @return
* Returns a pointer to the IA-32 emulation system call table,
* or `NULL` in case of failure.
*/
unsigned long* afw_locate_ia32_sys_call_table(void);
#endif
#endif // AFW_LOCATE_SCT
# Kbuild part: build the afw.ko module out of the three objects listed below.
obj-m += afw.o
afw-objs := afw_main.o locate_sct.o ttgl.o
ccflags-y := -std=gnu99 -O2

# Build tree of the currently running kernel; may be overridden by the caller.
KDIR ?= /lib/modules/$(shell uname -r)/build

.PHONY: all clean

# Use $(MAKE) so that flags and the jobserver are passed to the kernel build.
all:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

clean:
	$(MAKE) -C $(KDIR) M=$(PWD) clean
#include "ttgl.h"
#include <asm/page.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/stop_machine.h>
/*
** Having troubles with read-only memory? Fuck this shit, we're in kernel mode!
** To obtain write permissions for a region of kernel memory we can just remap
** it somewhere else with necessary permissions. God bless MMU and paging.
*/
/* Round the address `x` down to the start of the page that contains it. */
#define base_of_page(x) ((void*)((unsigned long)(x) & PAGE_MASK))
/*
** Fill `pages` with the `struct page` of every page of the range that starts
** at the page containing `region` and spans `page_num` pages.
**
** Addresses for which __module_address() finds an owning module are
** translated with vmalloc_to_page(); all other addresses are translated with
** virt_to_page(), and a warning is raised if such a page is not marked as
** reserved.
**
** NOTE(review): __module_address() is called here without any visible
** locking or preemption control -- confirm that this is acceptable.
**
** @return 0 on success, -EFAULT if some page could not be resolved.
*/
static
int enumerate_pages(void* region, struct page *pages[], size_t page_num)
{
    char *cursor = base_of_page(region);
    size_t idx = 0;

    while (idx < page_num) {
        struct page *pg;

        if (__module_address((unsigned long) cursor)) {
            pg = vmalloc_to_page(cursor);
        } else {
            pg = virt_to_page(cursor);
            WARN_ON(!PageReserved(pg));
        }

        pages[idx] = pg;
        if (!pg)
            return -EFAULT;

        cursor += PAGE_SIZE;
        idx++;
    }
    return 0;
}
/*
** Create a second, writeable mapping of the pages that back
** [region, region + len) and return the address inside that mapping which
** corresponds to `region`.
**
** The caller releases the mapping by passing the page-aligned base of the
** returned address to vunmap().
**
** @param region  start of the memory to remap, must not be NULL
** @param len     length of the memory in bytes, must not be zero
**
** @return pointer to the writeable alias of `region`, or NULL on invalid
**         arguments, allocation failure, or page lookup failure.
*/
static
void* remap_with_write_permissions(void* region, size_t len)
{
    void* writeable_region = NULL;
    struct page **pages;
    size_t page_num;

    /*
    ** A failed table lookup reaches us as a NULL region: refuse it here
    ** instead of translating and mapping a bogus address.
    */
    if (!region || len == 0)
        return NULL;

    page_num = DIV_ROUND_UP(offset_in_page(region) + len, PAGE_SIZE);

    pages = kmalloc(page_num * sizeof(*pages), GFP_KERNEL);
    if (!pages)
        return NULL;

    if (enumerate_pages(region, pages, page_num) == 0)
        writeable_region = vmap(pages, page_num, VM_MAP, PAGE_KERNEL);

    /* The page array is needed only while the mapping is being set up. */
    kfree(pages);

    if (!writeable_region)
        return NULL;

    return (char*) writeable_region + offset_in_page(region);
}
/*
** One needs to write the unwritable only for monkey-patching the kernel code,
** so it is wise to forbid anybody else to mess with global memory when we're
** doing our evil stuff. That's why stop_machine() is used.
**
** Its interface is not identical to our callback, so we need a thunk to pass
** all the arguments we want.
*/
/* Arguments bundled for stop_machine_thunk(), which receives a single void*. */
struct stop_machine_work {
int (*fn)(struct gl_region[], size_t, void*); /* callback to invoke */
struct gl_region *regions; /* first argument for `fn` */
size_t region_count; /* second argument for `fn` */
void* args; /* third argument for `fn` */
};
static
int stop_machine_thunk(void* arg)
{
struct stop_machine_work *work = arg;
return work->fn(work->regions, work->region_count, work->args);
}
/*
** Drop the writeable mappings of the first `count` regions and clear their
** `writeable` pointers so that no stale address is left behind.
*/
static
void unmap_regions(struct gl_region regions[], size_t count)
{
    size_t i;

    for (i = 0; i < count; i++) {
        vunmap(base_of_page(regions[i].writeable));
        regions[i].writeable = NULL;
    }
}

/*
** Run `fn` with temporary writeable mappings of the given regions.
**
** Every region is remapped with write permissions, `fn` is executed through
** stop_machine(), and all temporary mappings are removed again before
** returning. The `writeable` fields are reset to NULL afterwards, as they
** are meaningful only while `fn` runs.
**
** @param fn            callback to execute, must not be NULL
** @param regions       regions to remap; `writeable` is filled in for `fn`
** @param region_count  number of elements in `regions`
** @param args          opaque pointer passed through to `fn`
**
** @return the value produced by stop_machine() for `fn`,
**         -EINVAL if `fn` is NULL,
**         -ENOMEM if a region could not be remapped.
**         With no regions the result is that of `fn(NULL, 0, args)`.
*/
int afw_do_with_write_permissions(int (*fn)(struct gl_region[], size_t, void*),
                                  struct gl_region regions[],
                                  size_t region_count,
                                  void* args)
{
    size_t i;
    int result;
    struct stop_machine_work work;

    if (!fn)
        return -EINVAL;

    if (!regions || region_count == 0)
        return fn(NULL, 0, args);

    for (i = 0; i < region_count; i++) {
        regions[i].writeable =
            remap_with_write_permissions(regions[i].source,
                                         regions[i].length);
        if (!regions[i].writeable) {
            /* Roll back the mappings that were already created. */
            unmap_regions(regions, i);
            return -ENOMEM;
        }
    }

    work = (struct stop_machine_work) {
        .fn = fn, .regions = regions,
        .region_count = region_count,
        .args = args
    };

    /*
    ** The last argument is a pointer to a CPU mask, not a number:
    ** NULL lets stop_machine() pick the CPU that runs the callback.
    */
    result = stop_machine(stop_machine_thunk, &work, NULL);

    unmap_regions(regions, region_count);

    return result;
}
#ifndef AFW_TTGL_H
#define AFW_TTGL_H
#include <stddef.h>
/**
* Region descriptor for `afw_do_with_write_permissions()`.
*
* `source` and `length` should be filled by the caller. `writeable` is filled
* by `afw_do_with_write_permissions()`, it will be valid inside the `fn`
* callback (and only there).
*/
struct gl_region {
void* source; //!< read-only region of memory (set by the caller)
void* writeable; //!< writeable mapping of the `source` region (set by `afw_do_with_write_permissions()`)
size_t length; //!< length of the region in bytes (set by the caller)
};
/**
* Execute a function with write permissions for specified memory regions.
*
* If `regions == NULL` or `region_count == 0` then the call is equivalent to
* `fn(NULL, 0, args)`.
*
* `fn` is executed in atomic context, so please no locks inside it.
*
* @param fn
* The function to execute. Receives `regions` with updated `writeable`
* fields, `region_count`, and `args`. Must be non-NULL.
*
* @param regions
* Array of `gl_region` structures which describe the read-only regions
* that should be made writable for `fn`.
*
* @param region_count
* Size of `regions` (in elements).
*
* @param args
* Additional arguments to `fn`.
*
* @return
* On success returns the value `fn` returned.
* `-EINVAL` if `fn == NULL`.
* `-ENOMEM` if failed to obtain write permissions.
*/
int afw_do_with_write_permissions(int (*fn)(struct gl_region[], size_t, void*),
struct gl_region regions[],
size_t region_count,
void* args);
#endif // AFW_TTGL_H
@syzh
Copy link

syzh commented May 30, 2018

i use function find_sys_call_table_ref not get linux kernel 4.4.131 sys call table address. why?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment