Skip to content

Instantly share code, notes, and snippets.

@macdice
Created March 21, 2016 08:07
Show Gist options
  • Save macdice/6ec9173a1130c8a9a738 to your computer and use it in GitHub Desktop.
Save macdice/6ec9173a1130c8a9a738 to your computer and use it in GitHub Desktop.
diff --git a/configure b/configure
index 24655dc..e239641 100755
--- a/configure
+++ b/configure
@@ -10193,7 +10193,7 @@ fi
## Header files
##
-for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/epoll.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
+for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/epoll.h sys/event.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
do :
as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
@@ -12425,7 +12425,7 @@ fi
LIBS_including_readline="$LIBS"
LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
-for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
+for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
do :
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.in b/configure.in
index c564a76..ef9d450 100644
--- a/configure.in
+++ b/configure.in
@@ -1183,7 +1183,7 @@ AC_SUBST(UUID_LIBS)
##
dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES
-AC_CHECK_HEADERS([atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/epoll.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
+AC_CHECK_HEADERS([atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/epoll.h sys/event.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
# On BSD, test for net/if.h will fail unless sys/socket.h
# is included first.
@@ -1432,7 +1432,7 @@ PGAC_FUNC_WCSTOMBS_L
LIBS_including_readline="$LIBS"
LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
-AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range towlower utime utimes wcstombs wcstombs_l])
+AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range towlower utime utimes wcstombs wcstombs_l])
AC_REPLACE_FUNCS(fseeko)
case $host_os in
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index fd9a7bc..03acffe 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -44,6 +44,9 @@
#ifdef HAVE_SYS_EPOLL_H
#include <sys/epoll.h>
#endif
+#ifdef HAVE_SYS_EVENT_H
+#include <sys/event.h>
+#endif
#ifdef HAVE_POLL_H
#include <poll.h>
#endif
@@ -68,10 +71,12 @@
* useful to manually specify the used primitive. If desired, just add a
* define somewhere before this block.
*/
-#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || defined(WAIT_USE_SELECT) || defined(WAIT_USE_WIN32)
+#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_POLL) || defined(WAIT_USE_SELECT) || defined(WAIT_USE_WIN32)
/* don't overwrite manual choice */
#elif defined(HAVE_SYS_EPOLL_H)
#define WAIT_USE_EPOLL
+#elif defined(HAVE_KQUEUE)
+#define WAIT_USE_KQUEUE
#elif defined(HAVE_POLL)
#define WAIT_USE_POLL
#elif HAVE_SYS_SELECT_H
@@ -107,6 +112,10 @@ struct WaitEventSet
int epoll_fd;
/* epoll_wait returns events in a user provided arrays, allocate once */
struct epoll_event *epoll_ret_events;
+#elif defined(WAIT_USE_KQUEUE)
+ int kqueue_fd;
+ /* kevent returns events in a user-provided array, allocate once */
+ struct kevent *kqueue_ret_events;
#elif defined(WAIT_USE_POLL)
/* poll expects events to be waited on every poll() call, prepare once */
struct pollfd *pollfds;
@@ -135,6 +144,8 @@ static void drainSelfPipe(void);
#if defined(WAIT_USE_EPOLL)
static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
+#elif defined(WAIT_USE_KQUEUE)
+static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int action);
#elif defined(WAIT_USE_POLL)
static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
#elif defined(WAIT_USE_WIN32)
@@ -491,6 +502,8 @@ CreateWaitEventSet(MemoryContext context, int nevents)
#if defined(WAIT_USE_EPOLL)
sz += sizeof(struct epoll_event) * nevents;
+#elif defined(WAIT_USE_KQUEUE)
+ sz += sizeof(struct kevent) * nevents;
#elif defined(WAIT_USE_POLL)
sz += sizeof(struct pollfd) * nevents;
#elif defined(WAIT_USE_WIN32)
@@ -509,6 +522,9 @@ CreateWaitEventSet(MemoryContext context, int nevents)
#if defined(WAIT_USE_EPOLL)
set->epoll_ret_events = (struct epoll_event *) data;
data += sizeof(struct epoll_event) * nevents;
+#elif defined(WAIT_USE_KQUEUE)
+ set->kqueue_ret_events = (struct kevent *) data;
+ data += sizeof(struct kevent) * nevents;
#elif defined(WAIT_USE_POLL)
set->pollfds = (struct pollfd *) data;
data += sizeof(struct pollfd) * nevents;
@@ -524,6 +540,10 @@ CreateWaitEventSet(MemoryContext context, int nevents)
set->epoll_fd = epoll_create(nevents);
if (set->epoll_fd < 0)
elog(ERROR, "epoll_create failed: %m");
+#elif defined(WAIT_USE_KQUEUE)
+ set->kqueue_fd = kqueue();
+ if (set->kqueue_fd < 0)
+ elog(ERROR, "kqueue failed: %m");
#elif defined(WAIT_USE_WIN32)
/*
@@ -549,6 +569,8 @@ FreeWaitEventSet(WaitEventSet *set)
{
#if defined(WAIT_USE_EPOLL)
close(set->epoll_fd);
+#elif defined(WAIT_USE_KQUEUE)
+ close(set->kqueue_fd);
#elif defined(WAIT_USE_WIN32)
WaitEvent *cur_event;
@@ -638,6 +660,8 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch)
/* perform wait primitive specific initialization, if needed */
#if defined(WAIT_USE_EPOLL)
WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
+#elif defined(WAIT_USE_KQUEUE)
+ WaitEventAdjustKqueue(set, event, EV_ADD);
#elif defined(WAIT_USE_POLL)
WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_SELECT)
@@ -696,6 +720,8 @@ ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
#if defined(WAIT_USE_EPOLL)
WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
+#elif defined(WAIT_USE_KQUEUE)
+ WaitEventAdjustKqueue(set, event, EV_ADD);
#elif defined(WAIT_USE_POLL)
WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_SELECT)
@@ -897,6 +923,56 @@ WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
}
#endif
+#if defined(WAIT_USE_KQUEUE)
+
+/*
+ * action can be EV_ADD or EV_DELETE.  EV_ADD is used for both adding and
+ * modifying, and EV_DELETE is not used yet.  kqueue treats readable and
+ * writable as separate filters, so a socket waiting for both gets two kevents;
+ * each one's udata points back at the WaitEvent.  NOTE(review): EV_ADD does
+ * not remove a filter that an earlier call added but the new mask omits.
+ */
+static void
+WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int action)
+{
+	struct kevent k_ev[2];
+	int			filters[2];
+	int			nfilters = 0;
+	int			i;
+
+	Assert(event->fd >= 0);
+	if (event->events == WL_LATCH_SET || event->events == WL_POSTMASTER_DEATH)
+	{
+		Assert(event->events != WL_LATCH_SET || set->latch != NULL);
+		filters[nfilters++] = EVFILT_READ;
+	}
+	else
+	{
+		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+		if (event->events & WL_SOCKET_READABLE)
+			filters[nfilters++] = EVFILT_READ;
+		if (event->events & WL_SOCKET_WRITEABLE)
+			filters[nfilters++] = EVFILT_WRITE;
+	}
+
+	for (i = 0; i < nfilters; i++)
+	{
+		k_ev[i].ident = event->fd;
+		k_ev[i].filter = filters[i];
+		k_ev[i].flags = action | EV_CLEAR;
+		k_ev[i].fflags = 0;
+		k_ev[i].data = 0;
+		k_ev[i].udata = event;
+	}
+
+	if (kevent(set->kqueue_fd, k_ev, nfilters, NULL, 0, NULL) < 0)
+		ereport(ERROR,
+				(errcode_for_socket_access(),
+				 errmsg("kevent() failed %d %d: %m", set->kqueue_fd, event->fd)));
+}
+
+#endif
+
#if defined(WAIT_USE_WIN32)
static void
WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
@@ -1060,6 +1136,140 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
return returned_events;
}
+#elif defined(WAIT_USE_KQUEUE)
+
+/*
+ * Wait using kqueue(2)/kevent(2), which is available on FreeBSD and other
+ * BSD-family systems including macOS.
+ *
+ * This is the preferable wait method for systems that have it, as several
+ * readiness notifications are delivered, without having to iterate through
+ * all of set->events.
+ *
+ * cur_timeout is in milliseconds; a negative value means wait indefinitely.
+ * Returns -1 if the timeout expired, otherwise the number of entries stored
+ * in occurred_events (0 if interrupted or if nothing reportable happened).
+ */
+static int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+					  WaitEvent *occurred_events, int nevents)
+{
+	int			returned_events = 0;
+	int			rc;
+	WaitEvent  *cur_event;
+	struct kevent *cur_kqueue_event;
+	struct timespec timeout;
+	struct timespec *timeout_p = NULL;
+
+	/* kevent() takes a timespec; a NULL pointer means block indefinitely */
+	if (cur_timeout >= 0)
+	{
+		timeout.tv_sec = cur_timeout / 1000;
+		timeout.tv_nsec = (cur_timeout % 1000) * 1000000;
+		timeout_p = &timeout;
+	}
+
+	/* Sleep */
+	rc = kevent(set->kqueue_fd, NULL, 0,
+				set->kqueue_ret_events, nevents, timeout_p);
+
+	/* Check return code */
+	if (rc < 0)
+	{
+		/* EINTR is okay, otherwise complain */
+		if (errno != EINTR)
+		{
+			waiting = false;
+			ereport(ERROR,
+					(errcode_for_socket_access(),
+					 errmsg("kevent() failed while trying to wait: %m")));
+		}
+		return 0;
+	}
+	if (rc == 0)
+		return -1;				/* timeout exceeded */
+
+	/*
+	 * At least one event occurred, iterate over the returned kqueue events
+	 * until they're either all processed, or we've returned all the events
+	 * the caller desired.  What fired is identified by each kevent's filter
+	 * field; flags only carries modifiers such as EV_EOF.
+	 */
+	for (cur_kqueue_event = set->kqueue_ret_events;
+		 cur_kqueue_event < (set->kqueue_ret_events + rc) &&
+		 returned_events < nevents;
+		 cur_kqueue_event++)
+	{
+		/* kqueue's udata pointer is set to the associated WaitEvent */
+		cur_event = (WaitEvent *) cur_kqueue_event->udata;
+
+		occurred_events->pos = cur_event->pos;
+		occurred_events->events = 0;
+
+		if (cur_event->events == WL_LATCH_SET &&
+			cur_kqueue_event->filter == EVFILT_READ)
+		{
+			/* There's data in the self-pipe, clear it. */
+			drainSelfPipe();
+
+			if (set->latch->is_set)
+			{
+				occurred_events->fd = PGINVALID_SOCKET;
+				occurred_events->events = WL_LATCH_SET;
+				occurred_events++;
+				returned_events++;
+			}
+		}
+		else if (cur_event->events == WL_POSTMASTER_DEATH &&
+				 cur_kqueue_event->filter == EVFILT_READ)
+		{
+			/*
+			 * The read filter reports EV_EOF when the remote end is closed.
+			 * Because we don't expect the pipe to become readable either,
+			 * treat that case as postmaster death, too.
+			 *
+			 * As explained in the WAIT_USE_SELECT implementation, select(2)
+			 * may spuriously return. Be paranoid about that here too, a
+			 * spurious WL_POSTMASTER_DEATH would be painful.
+			 */
+			if (!PostmasterIsAlive())
+			{
+				occurred_events->fd = PGINVALID_SOCKET;
+				occurred_events->events = WL_POSTMASTER_DEATH;
+				occurred_events++;
+				returned_events++;
+			}
+		}
+		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+		{
+			Assert(cur_event->fd >= 0);
+
+			if ((cur_event->events & WL_SOCKET_READABLE) &&
+				(cur_kqueue_event->filter == EVFILT_READ))
+			{
+				/* readable, or EOF */
+				occurred_events->events |= WL_SOCKET_READABLE;
+			}
+
+			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+				(cur_kqueue_event->filter == EVFILT_WRITE))
+			{
+				/* writable, or EOF */
+				occurred_events->events |= WL_SOCKET_WRITEABLE;
+			}
+
+			if (occurred_events->events != 0)
+			{
+				occurred_events->fd = cur_event->fd;
+				occurred_events++;
+				returned_events++;
+			}
+		}
+	}
+
+	return returned_events;
+}
+
#elif defined(WAIT_USE_POLL)
/*
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index c72635c..e319f1d 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -279,6 +279,9 @@
/* Define to 1 if you have isinf(). */
#undef HAVE_ISINF
+/* Define to 1 if you have the `kqueue' function. */
+#undef HAVE_KQUEUE
+
/* Define to 1 if you have the <langinfo.h> header file. */
#undef HAVE_LANGINFO_H
@@ -533,6 +536,9 @@
/* Define to 1 if you have the <sys/epoll.h> header file. */
#undef HAVE_SYS_EPOLL_H
+/* Define to 1 if you have the <sys/event.h> header file. */
+#undef HAVE_SYS_EVENT_H
+
/* Define to 1 if you have the <sys/ioctl.h> header file. */
#undef HAVE_SYS_IOCTL_H
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment