Skip to content

Instantly share code, notes, and snippets.

@teknoraver
Last active November 13, 2024 15:55
Show Gist options
  • Save teknoraver/ed341c5506027c7cdda9e759fdd30c21 to your computer and use it in GitHub Desktop.
Save teknoraver/ed341c5506027c7cdda9e759fdd30c21 to your computer and use it in GitHub Desktop.
*.o
sysctl_monitor
vmlinux.h
*.skel.h
# Build the sysctl_monitor userspace loader together with its BPF object.
# The loader links against libbpf.
LDLIBS += -lbpf
CFLAGS += -O2 -pipe -g -Wall
all:: sysctl_monitor
# Kernel type definitions, dumped from the running kernel's BTF.
vmlinux.h:
bpftool btf dump file /sys/kernel/btf/vmlinux format c > $@
# Skeleton header generated from the compiled BPF object.
sysctl_monitor_bpf.skel.h: sysctl_monitor.bpf.o
bpftool gen skeleton $< > $@
# The BPF program itself, compiled with clang for the bpf target.
sysctl_monitor.bpf.o: sysctl_monitor.bpf.c vmlinux.h
clang $(CFLAGS) -target bpf -c $< -o $@
# No recipe here: only the prerequisites of the userspace binary are listed.
sysctl_monitor: sysctl_monitor.c sysctl_monitor_bpf.skel.h
# Remove every generated file, including vmlinux.h.
clean::
$(RM) *.o sysctl_monitor *.skel.h vmlinux.h
#pragma once
/* Fallback definition for translation units where TASK_COMM_LEN has not been
 * defined before this header is included; it sizes the 'comm' member of
 * struct sysctl_write_event below. */
#ifndef TASK_COMM_LEN
#define TASK_COMM_LEN 16
#endif
/* Record describing one sysctl write. The BPF program builds it on its stack
 * and pushes it through the ring buffer; the userspace loader reads it back
 * with the same layout.
 *
 * It would be nice to size the string members to bigger values, but the stack
 * in BPF programs is limited to 512 bytes, and allocating bigger structures
 * leads to this compile time error:
 * error: Looks like the BPF stack limit is exceeded.
 * Please move large on stack variables into BPF per-cpu array map.
 * For non-kernel uses, the stack can be increased using -mllvm -bpf-stack-size. */
struct sysctl_write_event {
/* Used to track changes in the struct layout. */
int version;
/* 0 on success, otherwise the negative value returned by the BPF helper
 * that failed; reported to userspace so it can handle the failure. */
int errorcode;
/* The PID of the process which is writing the sysctl (upper 32 bits of
 * bpf_get_current_pid_tgid()). */
pid_t pid;
/* The name of the binary, as returned by bpf_get_current_comm(). */
char comm[TASK_COMM_LEN];
/* The path of the sysctl, relative to /proc/sys/.
 * The longest path observed is 64 bytes:
 * net/ipv4/conf/123456789012345/igmpv3_unsolicited_report_interval */
char path[80];
/* The value of the sysctl just before the write.
 * The longest value observed is net.core.netdev_rss_key, which
 * contains 155 bytes. */
char current[160];
/* The new value being written into the sysctl.
 * Same sizing as 'current'. */
char newvalue[160];
};
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include "sysctl-write-event.h"
/* Ring buffer map carrying struct sysctl_write_event records from the BPF
 * program (bpf_ringbuf_output) to the userspace loader (ring_buffer__new).
 * For a ring buffer map, max_entries is the buffer size in bytes: 256 KiB. */
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 256 * 1024);
} written_sysctls SEC(".maps");
/* Bounded string equality.
 *
 * Walks s1 and s2 in lockstep for at most 'l' bytes. Returns false at the
 * first differing byte; returns true when a NUL terminator is reached in both
 * strings at the same position, or when 'l' bytes were compared without
 * finding a difference (this includes l == 0). */
static bool my_streq(const char *s1, const char *s2, size_t l)
{
	size_t pos = 0;

	while (pos < l) {
		const char c = s1[pos];

		if (c != s2[pos])
			return false;
		if (c == '\0')
			return true;
		pos++;
	}

	return true;
}
/* Buffer descriptor handed from chop() to the cut_last() callback as the
 * bpf_loop() context. */
struct str {
/* Start of the buffer. */
char *s;
/* Size of the buffer in bytes. */
size_t l;
};
/* bpf_loop() callback used by chop(): inspects one byte of the buffer per
 * invocation, starting at the last byte and moving towards the beginning.
 *
 * i:   zero-based iteration index supplied by bpf_loop().
 * str: buffer descriptor (start pointer and size).
 *
 * Returns 0 to let bpf_loop() continue with the next byte (the current byte
 * was NUL, or was whitespace and has just been overwritten with NUL), or 1 to
 * stop the loop (index out of range, or any other character was found). */
static long cut_last(u64 i, struct str *str)
{
char *s;
// Bounds check on the index, needed to get the program past the BPF verifier
if (i >= str->l)
return 1;
// Turn the iteration index into an offset counted from the end of the buffer
i = str->l - i - 1;
s = str->s + i;
// Padding after the terminator: keep walking backwards
if (*s == 0)
return 0;
// Trailing whitespace: erase it and keep walking backwards
if (*s == '\n' || *s == '\r' || *s == ' ' || *s == '\t') {
*s = 0;
return 0;
}
// First non-whitespace character: nothing more to trim
return 1;
}
// Cut off trailing whitespace and newlines
static void chop(char *s, size_t l)
{
struct str str = { s, l };
bpf_loop(l, cut_last, &str, 0);
}
/* cgroup/sysctl hook: report sysctl writes to userspace.
 *
 * Reads are ignored. For a write, a struct sysctl_write_event is filled with
 * the writer's PID and command name, the sysctl path, and the current and new
 * values (trailing whitespace removed). The event is pushed to the
 * written_sysctls ring buffer when the new value differs from the current
 * one, or when any helper failed (errorcode then holds the negative helper
 * return value and the strings gathered so far).
 *
 * Always returns 1, so the sysctl access itself is never denied. */
SEC("cgroup/sysctl")
int sysctl_monitor(struct bpf_sysctl *ctx)
{
	int r;

	// Ignore reads
	if (!ctx->write)
		return 1;

	/* Declare the struct without contextually initializing it.
	 * This avoids zero-filling the struct, which would be a waste of
	 * resources and code size. Since we're sending an event even on
	 * failure, truncate the strings to zero size, in case we don't
	 * populate them. */
	struct sysctl_write_event we;

	/* Every scalar member is set explicitly: the struct is not
	 * zero-filled, so anything left unset would be copied to userspace
	 * as whatever happened to be on the stack. */
	we.version = 1;
	we.errorcode = 0;
	we.path[0] = 0;
	we.comm[0] = 0;
	we.current[0] = 0;
	we.newvalue[0] = 0;

	/* Set the simple values first */
	we.pid = bpf_get_current_pid_tgid() >> 32;

	/* Name of the sysctl being written. No prefix filter is applied:
	 * writes to every sysctl are reported. */
	r = bpf_sysctl_get_name(ctx, we.path, sizeof(we.path), 0);
	if (r < 0) {
		we.errorcode = r;
		goto send_event;
	}

	r = bpf_get_current_comm(we.comm, sizeof(we.comm));
	if (r < 0) {
		we.errorcode = r;
		goto send_event;
	}

	r = bpf_sysctl_get_current_value(ctx, we.current, sizeof(we.current));
	if (r < 0) {
		we.errorcode = r;
		goto send_event;
	}

	r = bpf_sysctl_get_new_value(ctx, we.newvalue, sizeof(we.newvalue));
	if (r < 0) {
		we.errorcode = r;
		goto send_event;
	}

	// Both the kernel and userspace applications add a newline at the end,
	// remove it from both strings
	chop(we.current, sizeof(we.current));
	chop(we.newvalue, sizeof(we.newvalue));

send_event:
	// If new value is the same, ignore it; errors are always reported
	if (r < 0 || !my_streq(we.current, we.newvalue, sizeof(we.current)))
		bpf_ringbuf_output(&written_sysctls, &we, sizeof(we), 0);

	return 1;
}
/* License string, placed in the "license" ELF section of the BPF object. */
char _license[] SEC("license") = "GPL";
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "sysctl-write-event.h"
#include "sysctl_monitor_bpf.skel.h"
/* Path opened to obtain the cgroup fd the BPF program is attached to and
 * detached from. */
#define CGROUP_MOUNT_DFLT "/sys/fs/cgroup"
/* Ring buffer consumer: created in attach_bpf(), polled in main(). */
struct ring_buffer *rb;
/* Signal handler for SIGINT/SIGTERM/SIGQUIT: detach the sysctl BPF program
 * from the cgroup at CGROUP_MOUNT_DFLT.
 *
 * The handler does not terminate the process itself; main() leaves its poll
 * loop once ring_buffer__poll() reports the interruption.
 *
 * sig: number of the delivered signal (unused). */
static void int_exit(int sig)
{
	/* open(), the bpf() syscall and close() may all overwrite errno.
	 * main() inspects errno right after an interrupted poll, so the
	 * interrupted code must get it back untouched. */
	const int saved_errno = errno;
	int cgfd;

	(void)sig;

	cgfd = open(CGROUP_MOUNT_DFLT, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (cgfd >= 0) {
		bpf_prog_detach(cgfd, BPF_CGROUP_SYSCTL);
		close(cgfd);
	}

	errno = saved_errno;
}
static int log_sysctl_writes(void *ctx, void *data, size_t data_sz)
{
struct sysctl_write_event *we = data;
if (we->errorcode)
printf("Sysctl monitor BPF returned error: %d\n", we->errorcode);
else
printf("%s[%d] updated '%s' from '%s' to '%s'\n", we->comm, we->pid, we->path, we->current, we->newvalue);
return 0;
}
/* Load the BPF object, create the ring buffer consumer (stored in the global
 * 'rb') and attach the sysctl program to the cgroup at CGROUP_MOUNT_DFLT.
 *
 * The cgroup fd is only needed while attaching, so it is closed on every
 * path, success or failure. The skeleton is not destroyed here: 'rb' was
 * created from one of its map fds.
 *
 * Returns 0 on success, 1 on failure (a message is printed for each failure). */
static int attach_bpf(void)
{
	struct sysctl_monitor_bpf *skel;
	int progfd, cgfd;
	int ret = 1;

	cgfd = open(CGROUP_MOUNT_DFLT, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (cgfd < 0) {
		printf("failed to open cgroup mount point\n");
		return 1;
	}

	skel = sysctl_monitor_bpf__open_and_load();
	if (!skel) {
		printf("failed to open and load BPF object\n");
		goto out;
	}

	if (sysctl_monitor_bpf__attach(skel)) {
		printf("failed to attach BPF program\n");
		goto out;
	}

	rb = ring_buffer__new(bpf_map__fd(skel->maps.written_sysctls), log_sysctl_writes, NULL, NULL);
	if (!rb) {
		printf("failed to create ring buffer\n");
		goto out;
	}

	progfd = bpf_program__fd(skel->progs.sysctl_monitor);
	if (bpf_prog_attach(progfd, cgfd, BPF_CGROUP_SYSCTL, BPF_F_ALLOW_OVERRIDE) < 0)
		printf("failed to attach BPF program to cgroup\n");
	else
		ret = 0;

	/* NOTE(review): this fd comes from the skeleton, which is never
	 * destroyed in this file, so nothing closes it a second time. */
	close(progfd);

out:
	close(cgfd);
	return ret;
}
/* Entry point: install the signal handlers, attach the BPF program, then
 * print events until polling fails or is interrupted by a signal. The program
 * is detached from the cgroup before returning.
 *
 * Returns 1 when attach_bpf() fails, 0 otherwise. */
int main(int argc, char **argv)
{
	static const int handled_signals[] = { SIGINT, SIGTERM, SIGQUIT };
	unsigned int idx;
	int polled, cgroup_fd;

	for (idx = 0; idx < sizeof(handled_signals) / sizeof(handled_signals[0]); idx++)
		signal(handled_signals[idx], int_exit);

	if (attach_bpf() != 0)
		return 1;

	// In business
	for (;;) {
		polled = ring_buffer__poll(rb, 1000);
		if (polled >= 0)
			continue;

		/* An interrupted poll is the normal way out; anything else
		 * is reported before leaving the loop. */
		if (errno != EINTR)
			printf("Error polling ring buffer\n");
		break;
	}

	cgroup_fd = open(CGROUP_MOUNT_DFLT, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (cgroup_fd >= 0) {
		bpf_prog_detach(cgroup_fd, BPF_CGROUP_SYSCTL);
		close(cgroup_fd);
	}

	return 0;
}
#!/bin/bash
# Generate sysctl writes for the monitor to observe: once per second, set
# fs.mount-max to a random value in the range 100000..100099.
while sleep 1
do
	new_max=$((100000 + RANDOM % 100))
	sysctl -q "fs.mount-max=${new_max}"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment