Skip to content

Instantly share code, notes, and snippets.

@splbio
Last active December 11, 2015 07:58
Show Gist options
  • Save splbio/4570063 to your computer and use it in GitHub Desktop.
Save splbio/4570063 to your computer and use it in GitHub Desktop.
Add code to watchdogd to time the watchdog command program and carp (warn) when the program takes too long.
The purpose of this is to allow system integrators to tune their watchdogs and
get advance notice if they are behaving poorly.
The following facilities are added:
- Warn if the watchdog program takes too long (enabled with -w).
- Disable activation of the system watchdog (dry run, enabled with -n) so that one can test the watchdogd script
without potentially rebooting the system.
Example:
/usr/trees/head/usr.sbin/watchdogd # ./watchdogd -d -n -w -e "sleep 1"
watchdogd: mlockall failed: Cannot allocate memory
watchdogd: Watchdog program: 'sleep 1' took too long: 1.010894 seconds >= 1 seconds threshhold
watchdogd: Watchdog program: 'sleep 1' took too long: 1.010636 seconds >= 1 seconds threshhold
watchdogd: Watchdog program: 'sleep 1' took too long: 1.010700 seconds >= 1 seconds threshhold
^C
/usr/trees/head/usr.sbin/watchdogd # ./watchdogd -d -n -w -e "sleep 0.9"
watchdogd: mlockall failed: Cannot allocate memory
... doesn't complain ...
Index: watchdogd.c
===================================================================
--- watchdogd.c (revision 245626)
+++ watchdogd.c (working copy)
@@ -66,8 +66,13 @@
static u_int timeout = WD_TO_16SEC;
static u_int passive = 0;
static int is_daemon = 0;
+static int is_dry_run = 0; /* do not arm the watchdog, only
+ report on timing of the watch
+ program */
+static int do_timedog = 0;
static int fd = -1;
static int nap = 1;
+static int carp_thresh_seconds = -1;
static char *test_cmd = NULL;
/*
@@ -90,7 +95,7 @@
if (rtprio(RTP_SET, 0, &rtp) == -1)
err(EX_OSERR, "rtprio");
- if (watchdog_init() == -1)
+ if (!is_dry_run && watchdog_init() == -1)
errx(EX_SOFTWARE, "unable to initialize watchdog");
if (is_daemon) {
@@ -156,6 +161,9 @@
watchdog_init(void)
{
+ if (is_dry_run)
+ return 0;
+
fd = open("/dev/" _PATH_WATCHDOG, O_RDWR);
if (fd >= 0)
return (0);
@@ -164,26 +172,90 @@
}
/*
+ * Sample the CLOCK_UPTIME_FAST clock into *tp, but only when timing of
+ * the watchdog check has been requested (do_timedog is set).
+ *
+ * With timing disabled *tp is left untouched and 0 is returned.
+ * Otherwise the result of clock_gettime() is returned, and a warning is
+ * printed when that result is non-zero.
+ */
+static int
+watchdog_getuptime(struct timespec *tp)
+{
+	int rc = 0;
+
+	if (do_timedog) {
+		rc = clock_gettime(CLOCK_UPTIME_FAST, tp);
+		if (rc != 0)
+			warn("clock_gettime");
+	}
+	return (rc);
+}
+
+/*
+ * Complain if the watchdog check took too long.
+ *
+ * tp_start and tp_end are the timestamps taken immediately before and
+ * after the check ran.  The elapsed time is truncated to whole seconds
+ * and a warning is emitted when it is at least carp_thresh_seconds.
+ * Does nothing unless timing was requested (do_timedog is set).
+ */
+static void
+watchdog_check_dogfunction_time(const struct timespec *tp_start,
+    const struct timespec *tp_end)
+{
+	struct timeval tv_start, tv_end, tv;
+	const char *cmd_prefix, *cmd;
+	int sec;
+
+	if (!do_timedog)
+		return;
+
+	TIMESPEC_TO_TIMEVAL(&tv_start, tp_start);
+	TIMESPEC_TO_TIMEVAL(&tv_end, tp_end);
+	timersub(&tv_end, &tv_start, &tv);
+	/* Only whole seconds are compared against the threshold. */
+	sec = (int)tv.tv_sec;
+	if (sec < carp_thresh_seconds)
+		return;
+
+	/* Name what was timed: the -e command, or the built-in stat() check. */
+	if (test_cmd) {
+		cmd_prefix = "Watchdog program";
+		cmd = test_cmd;
+	} else {
+		cmd_prefix = "Watchdog operation";
+		cmd = "stat(\"/etc\", &sb)";
+	}
+	warnx("%s: '%s' took too long: "
+	    "%d.%06ld seconds >= %d seconds threshold",
+	    cmd_prefix, cmd, sec, (long)tv.tv_usec, carp_thresh_seconds);
+}
+
+
+/*
* Main program loop which is iterated every second.
*/
static void
watchdog_loop(void)
{
+ struct timespec ts_start, ts_end;
struct stat sb;
- int failed;
+ int error, failed;
while (end_program != 2) {
failed = 0;
+ error = watchdog_getuptime(&ts_start);
+ if (error) {
+ end_program = 1;
+ goto try_end;
+ }
+
if (test_cmd != NULL)
failed = system(test_cmd);
else
failed = stat("/etc", &sb);
+ error = watchdog_getuptime(&ts_end);
+ if (error) {
+ end_program = 1;
+ goto try_end;
+ }
+
+ watchdog_check_dogfunction_time(&ts_start, &ts_end);
+
if (failed == 0)
watchdog_patpat(timeout|WD_ACTIVE);
sleep(nap);
+try_end:
if (end_program != 0) {
if (watchdog_onoff(0) == 0) {
end_program = 2;
@@ -203,6 +275,9 @@
watchdog_patpat(u_int t)
{
+ if (is_dry_run)
+ return 0;
+
return ioctl(fd, WDIOCPATPAT, &t);
}
@@ -214,6 +289,10 @@
watchdog_onoff(int onoff)
{
+ /* fake successful watchdog op if a dry run */
+ if (is_dry_run)
+ return 0;
+
if (onoff)
return watchdog_patpat((timeout|WD_ACTIVE));
else
@@ -247,7 +326,7 @@
if (argv[0][c - 1] == 'd')
is_daemon = 1;
while ((c = getopt(argc, argv,
- is_daemon ? "I:de:s:t:?" : "dt:?")) != -1) {
+ is_daemon ? "I:de:ns:t:w?" : "dt:?")) != -1) {
switch (c) {
case 'I':
pidfile = optarg;
@@ -258,6 +337,9 @@
case 'e':
test_cmd = strdup(optarg);
break;
+ case 'n':
+ is_dry_run = 1;
+ break;
#ifdef notyet
case 'p':
passive = 1;
@@ -286,12 +368,19 @@
printf("Timeout is 2^%d nanoseconds\n",
timeout);
break;
+ case 'w':
+ do_timedog = 1;
+ break;
case '?':
default:
usage();
/* NOTREACHED */
}
}
+
+ if (carp_thresh_seconds == -1)
+ carp_thresh_seconds = nap;
+
if (argc != optind)
errx(EX_USAGE, "extra arguments.");
if (is_daemon && timeout < WD_TO_1SEC)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment