Last active
December 11, 2015 07:58
-
-
Save splbio/4570063 to your computer and use it in GitHub Desktop.
Add code to watchdogd to time the watchdog command program and carp (warn) when the program takes too long.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
The purpose of this is to allow system integrators to tune their watchdogs and | |
get advance notice if they are behaving poorly. | |
The following facilities are added: | |
- Warn if the watchdog program takes too long (-w); the threshold defaults to the loop's nap interval, 1 second by default. | |
- Disable activation of the system watchdog (-n, dry run) so that one can test the watchdogd script | |
without potentially rebooting the system. | |
Example: | |
/usr/trees/head/usr.sbin/watchdogd # ./watchdogd -d -n -w -e "sleep 1" | |
watchdogd: mlockall failed: Cannot allocate memory | |
watchdogd: Watchdog program: 'sleep 1' took too long: 1.010894 seconds >= 1 seconds threshhold | |
watchdogd: Watchdog program: 'sleep 1' took too long: 1.010636 seconds >= 1 seconds threshhold | |
watchdogd: Watchdog program: 'sleep 1' took too long: 1.010700 seconds >= 1 seconds threshhold | |
^C | |
/usr/trees/head/usr.sbin/watchdogd # ./watchdogd -d -n -w -e "sleep 0.9" | |
watchdogd: mlockall failed: Cannot allocate memory | |
... doesn't complain ... | |
Index: watchdogd.c | |
=================================================================== | |
--- watchdogd.c (revision 245626) | |
+++ watchdogd.c (working copy) | |
@@ -66,8 +66,13 @@ | |
static u_int timeout = WD_TO_16SEC; | |
static u_int passive = 0; | |
static int is_daemon = 0; | |
+static int is_dry_run = 0; /* do not arm the watchdog, only | |
+ report on timing of the watch | |
+ program */ | |
+static int do_timedog = 0; | |
static int fd = -1; | |
static int nap = 1; | |
+static int carp_thresh_seconds = -1; | |
static char *test_cmd = NULL; | |
/* | |
@@ -90,7 +95,7 @@ | |
if (rtprio(RTP_SET, 0, &rtp) == -1) | |
err(EX_OSERR, "rtprio"); | |
- if (watchdog_init() == -1) | |
+ if (!is_dry_run && watchdog_init() == -1) | |
errx(EX_SOFTWARE, "unable to initialize watchdog"); | |
if (is_daemon) { | |
@@ -156,6 +161,9 @@ | |
watchdog_init(void) | |
{ | |
+ if (is_dry_run) | |
+ return 0; | |
+ | |
fd = open("/dev/" _PATH_WATCHDOG, O_RDWR); | |
if (fd >= 0) | |
return (0); | |
@@ -164,26 +172,90 @@ | |
} | |
/* | |
+ * If timing is enabled (-w), read the uptime clock into *tp; otherwise do nothing and return 0. | |
+ */ | |
+static int | |
+watchdog_getuptime(struct timespec *tp) | |
+{ | |
+ int error; | |
+ | |
+ if (!do_timedog) | |
+ return 0; | |
+ | |
+ error = clock_gettime(CLOCK_UPTIME_FAST, tp); | |
+ if (error) | |
+ warn("clock_gettime"); | |
+ return (error); | |
+} | |
+ | |
+static void | |
+watchdog_check_dogfunction_time(struct timespec *tp_start, | |
+ struct timespec *tp_end) | |
+{ | |
+ struct timeval tv_start, tv_end, tv; | |
+ const char *cmd_prefix, *cmd; | |
+ int nsec; | |
+ | |
+ if (!do_timedog) | |
+ return; | |
+ | |
+ TIMESPEC_TO_TIMEVAL(&tv_start, tp_start); | |
+ TIMESPEC_TO_TIMEVAL(&tv_end, tp_end); | |
+ timersub(&tv_end, &tv_start, &tv); | |
+ nsec = tv.tv_sec; | |
+ if (nsec < carp_thresh_seconds) | |
+ return; | |
+ | |
+ if (test_cmd) { | |
+ cmd_prefix = "Watchdog program"; | |
+ cmd = test_cmd; | |
+ } else { | |
+ cmd_prefix = "Watchdog operation"; | |
+ cmd = "stat(\"/etc\", &sb)"; | |
+ } | |
+ warnx("%s: '%s' took too long: " | |
+ "%d.%06ld seconds >= %d seconds threshhold", | |
+ cmd_prefix, cmd, nsec, (long)tv.tv_usec, carp_thresh_seconds); | |
+} | |
+ | |
+ | |
+/* | |
* Main program loop which is iterated every second. | |
*/ | |
static void | |
watchdog_loop(void) | |
{ | |
+ struct timespec ts_start, ts_end; | |
struct stat sb; | |
- int failed; | |
+ int error, failed; | |
while (end_program != 2) { | |
failed = 0; | |
+ error = watchdog_getuptime(&ts_start); | |
+ if (error) { | |
+ end_program = 1; | |
+ goto try_end; | |
+ } | |
+ | |
if (test_cmd != NULL) | |
failed = system(test_cmd); | |
else | |
failed = stat("/etc", &sb); | |
+ error = watchdog_getuptime(&ts_end); | |
+ if (error) { | |
+ end_program = 1; | |
+ goto try_end; | |
+ } | |
+ | |
+ watchdog_check_dogfunction_time(&ts_start, &ts_end); | |
+ | |
if (failed == 0) | |
watchdog_patpat(timeout|WD_ACTIVE); | |
sleep(nap); | |
+try_end: | |
if (end_program != 0) { | |
if (watchdog_onoff(0) == 0) { | |
end_program = 2; | |
@@ -203,6 +275,9 @@ | |
watchdog_patpat(u_int t) | |
{ | |
+ if (is_dry_run) | |
+ return 0; | |
+ | |
return ioctl(fd, WDIOCPATPAT, &t); | |
} | |
@@ -214,6 +289,10 @@ | |
watchdog_onoff(int onoff) | |
{ | |
+ /* fake successful watchdog op if a dry run */ | |
+ if (is_dry_run) | |
+ return 0; | |
+ | |
if (onoff) | |
return watchdog_patpat((timeout|WD_ACTIVE)); | |
else | |
@@ -247,7 +326,7 @@ | |
if (argv[0][c - 1] == 'd') | |
is_daemon = 1; | |
while ((c = getopt(argc, argv, | |
- is_daemon ? "I:de:s:t:?" : "dt:?")) != -1) { | |
+ is_daemon ? "I:de:ns:t:w?" : "dt:?")) != -1) { | |
switch (c) { | |
case 'I': | |
pidfile = optarg; | |
@@ -258,6 +337,9 @@ | |
case 'e': | |
test_cmd = strdup(optarg); | |
break; | |
+ case 'n': | |
+ is_dry_run = 1; | |
+ break; | |
#ifdef notyet | |
case 'p': | |
passive = 1; | |
@@ -286,12 +368,19 @@ | |
printf("Timeout is 2^%d nanoseconds\n", | |
timeout); | |
break; | |
+ case 'w': | |
+ do_timedog = 1; | |
+ break; | |
case '?': | |
default: | |
usage(); | |
/* NOTREACHED */ | |
} | |
} | |
+ | |
+ if (carp_thresh_seconds == -1) | |
+ carp_thresh_seconds = nap; | |
+ | |
if (argc != optind) | |
errx(EX_USAGE, "extra arguments."); | |
if (is_daemon && timeout < WD_TO_1SEC) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment