Skip to content

Instantly share code, notes, and snippets.

@voutilad
Last active June 8, 2020 10:50
Show Gist options
  • Save voutilad/6e12610390697fd30a649cfdea2d4bf8 to your computer and use it in GitHub Desktop.
Save voutilad/6e12610390697fd30a649cfdea2d4bf8 to your computer and use it in GitHub Desktop.
vmd(8) thread safety patch - solves event base corruption so your non-OpenBSD guests live longer, healthier lives
Index: i8253.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/i8253.c,v
retrieving revision 1.31
diff -u -p -u -p -r1.31 i8253.c
--- i8253.c 30 Nov 2019 00:51:29 -0000 1.31
+++ i8253.c 7 Jun 2020 13:53:52 -0000
@@ -30,6 +30,7 @@
#include "i8253.h"
#include "proc.h"
+#include "vmd.h"
#include "vmm.h"
#include "atomicio.h"
@@ -43,6 +44,40 @@ extern char *__progname;
*/
struct i8253_channel i8253_channel[3];
+static struct vm_dev_pipe dev_pipe;
+
+/*
+ * i8253_pipe_handler
+ *
+ * Reads a message off the pipe, expecting one that corresponds to a
+ * reset request for a specific channel.
+ *
+ */
+static void
+i8253_pipe_dispatch(int fd, short event, void *arg)
+{
+ size_t n;
+ uint8_t msg;
+
+ n = read(fd, &msg, sizeof(msg));
+ if (n != sizeof(msg))
+ fatal("failed to read from device pipe");
+
+ switch (msg) {
+ case I8253_RESET_CHAN_0:
+ i8253_reset(0);
+ break;
+ case I8253_RESET_CHAN_1:
+ i8253_reset(1);
+ break;
+ case I8253_RESET_CHAN_2:
+ i8253_reset(2);
+ break;
+ default:
+ fatal("unknown pipe message %u", msg);
+ }
+}
+
/*
* i8253_init
*
@@ -77,6 +112,9 @@ i8253_init(uint32_t vm_id)
evtimer_set(&i8253_channel[0].timer, i8253_fire, &i8253_channel[0]);
evtimer_set(&i8253_channel[1].timer, i8253_fire, &i8253_channel[1]);
evtimer_set(&i8253_channel[2].timer, i8253_fire, &i8253_channel[2]);
+
+ vm_pipe_init(&dev_pipe, i8253_pipe_dispatch);
+ event_add(&dev_pipe.read_ev, NULL);
}
/*
@@ -271,7 +309,7 @@ vcpu_exit_i8253(struct vm_run_params *vr
sel, i8253_channel[sel].mode,
i8253_channel[sel].start);
- i8253_reset(sel);
+ vm_pipe_send(&dev_pipe, sel);
}
} else {
if (i8253_channel[sel].rbs) {
Index: mc146818.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/mc146818.c,v
retrieving revision 1.21
diff -u -p -u -p -r1.21 mc146818.c
--- mc146818.c 30 Nov 2019 00:51:29 -0000 1.21
+++ mc146818.c 7 Jun 2020 13:53:52 -0000
@@ -63,6 +63,34 @@ struct mc146818 {
struct mc146818 rtc;
+static struct vm_dev_pipe dev_pipe;
+
+static void rtc_reschedule_per(void);
+
+/*
+ * mc146818_pipe_handler
+ *
+ * Reads a message off the pipe, expecting a request to reschedule periodic
+ * interrupt rate.
+ */
+static void
+mc146818_pipe_dispatch(int fd, short event, void *arg)
+{
+ size_t n;
+ uint8_t msg;
+
+ n = read(fd, &msg, sizeof(msg));
+ if (n != sizeof(msg))
+ fatal("failed to read from device pipe");
+
+ if (msg == MC146818_RESCHEDULE_PER) {
+ log_debug("%s: rescheduling periodic timer", __func__);
+ rtc_reschedule_per();
+ } else {
+ fatal("unknown pipe message %u", msg);
+ }
+}
+
/*
* rtc_updateregs
*
@@ -175,6 +203,9 @@ mc146818_init(uint32_t vm_id, uint64_t m
evtimer_add(&rtc.sec, &rtc.sec_tv);
evtimer_set(&rtc.per, rtc_fireper, (void *)(intptr_t)rtc.vm_id);
+
+ vm_pipe_init(&dev_pipe, mc146818_pipe_dispatch);
+ event_add(&dev_pipe.read_ev, NULL);
}
/*
@@ -217,7 +248,7 @@ rtc_update_rega(uint32_t data)
rtc.regs[MC_REGA] = data;
if ((rtc.regs[MC_REGA] ^ data) & 0x0f)
- rtc_reschedule_per();
+ vm_pipe_send(&dev_pipe, MC146818_RESCHEDULE_PER);
}
/*
@@ -240,7 +271,7 @@ rtc_update_regb(uint32_t data)
rtc.regs[MC_REGB] = data;
if (data & MC_REGB_PIE)
- rtc_reschedule_per();
+ vm_pipe_send(&dev_pipe, MC146818_RESCHEDULE_PER);
}
/*
Index: ns8250.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/ns8250.c,v
retrieving revision 1.25
diff -u -p -u -p -r1.25 ns8250.c
--- ns8250.c 11 Dec 2019 06:45:16 -0000 1.25
+++ ns8250.c 7 Jun 2020 13:53:52 -0000
@@ -37,8 +37,40 @@
extern char *__progname;
struct ns8250_dev com1_dev;
+/* Flags to distinguish calling threads to com_rcv */
+#define NS8250_DEV_THREAD 0
+#define NS8250_CPU_THREAD 1
+
+static struct vm_dev_pipe dev_pipe;
+
static void com_rcv_event(int, short, void *);
-static void com_rcv(struct ns8250_dev *, uint32_t, uint32_t);
+static void com_rcv(struct ns8250_dev *, uint32_t, uint32_t, int8_t);
+
+/*
+ * ns8250_pipe_dispatch
+ *
+ * Reads a message off the pipe, expecting a reguest to reset events after a
+ * zero-byte read from the com device.
+ */
+static void
+ns8250_pipe_dispatch(int fd, short event, void *arg)
+{
+ size_t n;
+ uint8_t msg;
+
+ n = read(fd, &msg, sizeof(msg));
+ if (n != sizeof(msg))
+ fatal("failed to read from device pipe");
+
+ if (msg == NS8250_ZERO_READ) {
+ log_debug("%s: resetting events after zero byte read", __func__);
+ event_del(&com1_dev.event);
+ event_add(&com1_dev.wake, NULL);
+ } else {
+ fatal("unknown pipe message %u", msg);
+ }
+}
+
/*
* ratelimit
@@ -55,10 +87,15 @@ static void
ratelimit(int fd, short type, void *arg)
{
/* Set TXRDY and clear "no pending interrupt" */
+ mutex_lock(&com1_dev.mutex);
com1_dev.regs.iir |= IIR_TXRDY;
com1_dev.regs.iir &= ~IIR_NOPEND;
+ mutex_unlock(&com1_dev.mutex);
+
vcpu_assert_pic_irq(com1_dev.vmid, 0, com1_dev.irq);
vcpu_deassert_pic_irq(com1_dev.vmid, 0, com1_dev.irq);
+
+ evtimer_add(&com1_dev.rate, &com1_dev.rate_tv);
}
void
@@ -72,6 +109,7 @@ ns8250_init(int fd, uint32_t vmid)
errno = ret;
fatal("could not initialize com1 mutex");
}
+
com1_dev.fd = fd;
com1_dev.irq = 4;
com1_dev.portid = NS8250_COM1;
@@ -112,6 +150,10 @@ ns8250_init(int fd, uint32_t vmid)
timerclear(&com1_dev.rate_tv);
com1_dev.rate_tv.tv_usec = 10000;
evtimer_set(&com1_dev.rate, ratelimit, NULL);
+ evtimer_add(&com1_dev.rate, &com1_dev.rate_tv);
+
+ vm_pipe_init(&dev_pipe, ns8250_pipe_dispatch);
+ event_add(&dev_pipe.read_ev, NULL);
}
static void
@@ -137,7 +179,7 @@ com_rcv_event(int fd, short kind, void *
if (com1_dev.regs.lsr & LSR_RXRDY)
com1_dev.rcv_pending = 1;
else {
- com_rcv(&com1_dev, (uintptr_t)arg, 0);
+ com_rcv(&com1_dev, (uintptr_t)arg, 0, NS8250_DEV_THREAD);
/* If pending interrupt, inject */
if ((com1_dev.regs.iir & IIR_NOPEND) == 0) {
@@ -178,10 +220,10 @@ com_rcv_handle_break(struct ns8250_dev *
* com_rcv
*
* Move received byte into com data register.
- * Must be called with the mutex of the com device acquired
+ * Must be called with the mutex of the com device acquired.
*/
static void
-com_rcv(struct ns8250_dev *com, uint32_t vm_id, uint32_t vcpu_id)
+com_rcv(struct ns8250_dev *com, uint32_t vm_id, uint32_t vcpu_id, int8_t thread)
{
char buf[2];
ssize_t sz;
@@ -201,8 +243,13 @@ com_rcv(struct ns8250_dev *com, uint32_t
if (errno != EAGAIN)
log_warn("unexpected read error on com device");
} else if (sz == 0) {
- event_del(&com->event);
- event_add(&com->wake, NULL);
+ if (thread == NS8250_DEV_THREAD) {
+ event_del(&com->event);
+ event_add(&com->wake, NULL);
+ } else {
+ /* Called by vcpu thread, use pipe for event changes */
+ vm_pipe_send(&dev_pipe, NS8250_ZERO_READ);
+ }
return;
} else if (sz != 1 && sz != 2)
log_warnx("unexpected read return value %zd on com device", sz);
@@ -255,13 +302,12 @@ vcpu_process_com_data(struct vm_exit *ve
com1_dev.byte_out++;
if (com1_dev.regs.ier & IER_ETXRDY) {
- /* Limit output rate if needed */
- if (com1_dev.pause_ct > 0 &&
- com1_dev.byte_out % com1_dev.pause_ct == 0) {
- evtimer_add(&com1_dev.rate,
- &com1_dev.rate_tv);
- } else {
- /* Set TXRDY and clear "no pending interrupt" */
+ /* Set TXRDY and clear "no pending interrupt"
+ * only if we're not at the pause point. Otherwise
+ * the rate limiter timer event will handle it.
+ */
+ if (!(com1_dev.pause_ct > 0 &&
+ com1_dev.byte_out % com1_dev.pause_ct == 0)) {
com1_dev.regs.iir |= IIR_TXRDY;
com1_dev.regs.iir &= ~IIR_NOPEND;
}
@@ -300,7 +346,7 @@ vcpu_process_com_data(struct vm_exit *ve
com1_dev.regs.iir = 0x1;
if (com1_dev.rcv_pending)
- com_rcv(&com1_dev, vm_id, vcpu_id);
+ com_rcv(&com1_dev, vm_id, vcpu_id, NS8250_CPU_THREAD);
}
/* If pending interrupt, make sure it gets injected */
@@ -672,6 +718,7 @@ ns8250_stop()
{
if(event_del(&com1_dev.event))
log_warn("could not delete ns8250 event handler");
+ event_del(&com1_dev.wake);
evtimer_del(&com1_dev.rate);
}
Index: vm.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/vm.c,v
retrieving revision 1.57
diff -u -p -u -p -r1.57 vm.c
--- vm.c 30 Apr 2020 03:50:53 -0000 1.57
+++ vm.c 7 Jun 2020 13:53:53 -0000
@@ -2243,3 +2243,50 @@ translate_gva(struct vm_exit* exit, uint
return (0);
}
+
+/*
+ * vm_pipe_init
+ *
+ * Initialize a vm_dev_pipe, setting up its file descriptors and its
+ * event structure with the given callback.
+ *
+ * Parameters:
+ * p: pointer to vm_dev_pipe struct to initizlize
+ * cb: callback to use for READ events on the read end of the pipe
+ */
+void
+vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
+{
+ int ret;
+ int fds[2];
+
+ memset(p, 0, sizeof(struct vm_dev_pipe));
+
+ ret = pipe(fds);
+ if (ret) {
+ fatal("failed to create vm_dev_pipe pipe");
+ }
+ p->read = fds[0];
+ p->write = fds[1];
+
+ event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
+}
+
+/*
+ * vm_pipe_send
+ *
+ * Send a message to an emulated device vie the provided vm_dev_pipe.
+ * Messages should be of a value from enum pipe_msg_type.
+ *
+ * Parameters:
+ * p: pointer to initialized vm_dev_pipe
+ * msg: message (from pipe_msg_type enum) to send in the channel
+ */
+void
+vm_pipe_send(struct vm_dev_pipe *p, uint8_t msg)
+{
+ size_t n;
+ n = write(p->write, &msg, sizeof(msg));
+ if (n != sizeof(msg))
+ fatal("failed to write to device pipe");
+}
Index: vmd.h
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/vmd.h,v
retrieving revision 1.98
diff -u -p -u -p -r1.98 vmd.h
--- vmd.h 12 Dec 2019 03:53:38 -0000 1.98
+++ vmd.h 7 Jun 2020 13:53:53 -0000
@@ -354,6 +354,20 @@ struct vmd {
int vmd_ptmfd;
};
+struct vm_dev_pipe {
+ int read;
+ int write;
+ struct event read_ev;
+};
+
+enum pipe_msg_type {
+ I8253_RESET_CHAN_0 = 0,
+ I8253_RESET_CHAN_1 = 1,
+ I8253_RESET_CHAN_2 = 2,
+ NS8250_ZERO_READ,
+ MC146818_RESCHEDULE_PER
+};
+
static inline struct sockaddr_in *
ss2sin(struct sockaddr_storage *ss)
{
@@ -442,6 +456,8 @@ int vmm_pipe(struct vmd_vm *, int, void
/* vm.c */
int start_vm(struct vmd_vm *, int);
__dead void vm_shutdown(unsigned int);
+void vm_pipe_init(struct vm_dev_pipe *, void (*)(int, short, void *));
+void vm_pipe_send(struct vm_dev_pipe *, uint8_t);
/* control.c */
int config_init(struct vmd *);
@voutilad
Copy link
Author

voutilad commented Jun 5, 2020

TODO:

  • fix some doc mistakes
  • some whitespace issues
  • figure out if my vcpu_assert_pic_irq/vcpu_deassert_pic_irq VM paused state check is sound or subject to race conditions (will the vm state change between entry of unpause vm function)

@voutilad
Copy link
Author

voutilad commented Jun 6, 2020

Fixed some whitespace and doc stuff.

Pausing under load where the guest CPU is 100% in userland and waiting on a timer just doesn't work yet, even in existing vmd(8). We need a way to properly interrupt the guest. In some hypervisors, there's guest support for officially "pausing", but there's supposed to be vmx/svm support for a guest pause register I believe. (Saw something in intel's docs on vmx...and I think KVM uses it?)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment