*
*/
/*
- Copyright (C) 2000, 2001, 2002 Kern Sibbald and John Walker
+ Copyright (C) 2000-2003 Kern Sibbald and John Walker
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
#include "bacula.h"
#include "jcr.h"
+/* This breaks Kern's #include rules, but I don't want to put it into bacula.h
+ * until it has been discussed with him */
+#include "bsd_queue.h"
+
/* Exported globals */
time_t watchdog_time; /* this has granularity of SLEEP_TIME */
-
-#define TIMEOUT_SIGNAL SIGUSR2
-#define SLEEP_TIME 30 /* examine things every 30 seconds */
+#define SLEEP_TIME 1 /* examine things every second */
/* Forward referenced functions */
static void *watchdog_thread(void *arg);
/* Static globals */
-static pthread_mutex_t mutex;
-static pthread_cond_t timer;
-static int quit;
-
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t timer = PTHREAD_COND_INITIALIZER;
+static bool quit;
+static bool wd_is_init = false;
-/*
- * Timeout signal comes here
- */
-static void timeout_handler(int sig)
-{
- return; /* thus interrupting the function */
-}
+/* Id of the watchdog thread: set by start_watchdog(), joined by stop_watchdog() */
+static pthread_t wd_tid;
+/* Static globals */
+static TAILQ_HEAD(/* no struct */, s_watchdog_t) wd_queue =
+ TAILQ_HEAD_INITIALIZER(wd_queue);
+static TAILQ_HEAD(/* no struct */, s_watchdog_t) wd_inactive =
+ TAILQ_HEAD_INITIALIZER(wd_inactive);
/*
* Start watchdog thread
int start_watchdog(void)
{
int stat;
- pthread_t wdid;
- struct sigaction sigtimer;
-
- sigtimer.sa_flags = 0;
- sigtimer.sa_handler = timeout_handler;
- sigfillset(&sigtimer.sa_mask);
- sigaction(TIMEOUT_SIGNAL, &sigtimer, NULL);
+
+ /* Returns 0 on success, or the non-zero status of pthread_create() on failure.
+ * mutex and timer are statically initialised, so nothing is initialised here. */
+ Dmsg0(200, "Initialising NicB-hacked watchdog thread\n");
watchdog_time = time(NULL);
- if ((stat = pthread_mutex_init(&mutex, NULL)) != 0) {
- return stat;
- }
- if ((stat = pthread_cond_init(&timer, NULL)) != 0) {
- pthread_mutex_destroy(&mutex);
- return stat;
- }
- quit = FALSE;
- if ((stat = pthread_create(&wdid, NULL, watchdog_thread, (void *)NULL)) != 0) {
- pthread_mutex_destroy(&mutex);
- pthread_cond_destroy(&timer);
+ quit = false;
+ /* The thread id is kept in wd_tid so that stop_watchdog() can join it */
+ if ((stat = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) {
return stat;
}
+ /* Only now do the other watchdog entry points accept calls; they all test this flag */
+ wd_is_init = true;
return 0;
}
+/*
+ * Stop the watchdog thread and release every watchdog still registered.
+ *
+ * Returns 0 when the watchdog is not running (wd_is_init is false),
+ * otherwise the status returned by pthread_join().
+ */
int stop_watchdog(void)
{
int stat;
+ watchdog_t *p, *n;
- if ((stat = pthread_mutex_lock(&mutex)) != 0) {
- return stat;
+ if (!wd_is_init) {
+ return 0;
}
- quit = TRUE;
- if ((stat = pthread_cond_signal(&timer)) != 0) {
- pthread_mutex_unlock(&mutex);
- return stat;
+ Dmsg0(200, "Sending stop signal to NicB-hacked watchdog thread\n");
+ P(mutex);
+ quit = true;
+ /* NOTE(review): the watchdog_thread in this file sleeps in bmicrosleep()
+ * and does not wait on `timer`, so it appears to notice `quit` only at the
+ * top of its next pass. The status stored here is overwritten by the join. */
+ stat = pthread_cond_signal(&timer);
+ V(mutex);
+
+ wd_is_init = false;
+
+ stat = pthread_join(wd_tid, NULL);
+
+ /* Drain the active queue: unlink each entry, run its destructor if one is set, free it */
+ TAILQ_FOREACH_SAFE(p, &wd_queue, qe, n) {
+ TAILQ_REMOVE(&wd_queue, p, qe);
+ if (p->destructor != NULL) {
+ p->destructor(p);
+ }
+ free(p);
}
- if ((stat = pthread_mutex_unlock(&mutex)) != 0) {
- return stat;
+
+ /* Same clean-up for the inactive list */
+ TAILQ_FOREACH_SAFE(p, &wd_inactive, qe, n) {
+ TAILQ_REMOVE(&wd_inactive, p, qe);
+ if (p->destructor != NULL) {
+ p->destructor(p);
+ }
+ free(p);
}
- return 0;
-}
+ return stat;
+}
-/*
- * This is the actual watchdog thread.
- */
-static void *watchdog_thread(void *arg)
+/*
+ * Allocate a watchdog_t with malloc() and give it default settings:
+ * one_shot = true, interval = 0, callback/destructor/data = NULL.
+ * next_fire is not set here; register_watchdog() assigns it.
+ *
+ * Calling this before start_watchdog() is reported as a BUG through
+ * Emsg0(M_ABORT, ...).
+ *
+ * Returns the new watchdog, or NULL if the allocation failed.
+ */
+watchdog_t *watchdog_new(void)
{
- struct timespec timeout;
- int stat;
- JCR *jcr;
- BSOCK *fd;
+ watchdog_t *wd = (watchdog_t *) malloc(sizeof(watchdog_t));
- Dmsg0(200, "Start watchdog thread\n");
- pthread_detach(pthread_self());
+ if (!wd_is_init) {
+ Emsg0(M_ABORT, 0, "BUG! watchdog_new called before start_watchdog\n");
+ }
- if ((stat = pthread_mutex_lock(&mutex)) != 0) {
+ if (wd == NULL) {
return NULL;
}
+ /* Defaults; callback and interval must be filled in before register_watchdog() accepts wd */
+ wd->one_shot = true;
+ wd->interval = 0;
+ wd->callback = NULL;
+ wd->destructor = NULL;
+ wd->data = NULL;
- for ( ;!quit; ) {
- struct timeval tv;
- struct timezone tz;
- time_t timer_start;
+ return wd;
+}
- Dmsg0(200, "Top of for loop\n");
+/*
+ * Put a configured watchdog on the active queue.
+ *
+ * wd must have a non-NULL callback and a non-zero interval. Violations,
+ * and calls made before start_watchdog(), are reported as BUGs through
+ * Emsg0/Emsg1(M_ABORT, ...).
+ *
+ * With the mutex held, next_fire is set to watchdog_time + interval and
+ * wd is appended to wd_queue.
+ *
+ * NOTE(review): the function returns false even after the watchdog has
+ * been queued -- confirm whether callers expect true on success.
+ */
+bool register_watchdog(watchdog_t *wd)
+{
+ if (!wd_is_init) {
+ Emsg0(M_ABORT, 0, "BUG! register_watchdog called before start_watchdog\n");
+ }
+ if (wd->callback == NULL) {
+ Emsg1(M_ABORT, 0, "BUG! Watchdog %p has NULL callback\n", wd);
+ }
+ if (wd->interval == 0) {
+ Emsg1(M_ABORT, 0, "BUG! Watchdog %p has zero interval\n", wd);
+ }
- watchdog_time = time(NULL); /* update timer */
+ P(mutex);
+ /* First firing is measured from the thread's last refresh of watchdog_time, not from time(NULL) */
+ wd->next_fire = watchdog_time + wd->interval;
+ TAILQ_INSERT_TAIL(&wd_queue, wd, qe);
+ Dmsg3(200, "Registered watchdog %p, interval %d%s\n",
+ wd, wd->interval, wd->one_shot ? " one shot" : "");
+ V(mutex);
- /* Walk through all JCRs checking if any one is
- * blocked for more than specified max time.
- */
- lock_jcr_chain();
- for (jcr=NULL; (jcr=get_next_jcr(jcr)); ) {
- free_locked_jcr(jcr);
- if (jcr->JobId == 0) {
- continue;
- }
- fd = jcr->store_bsock;
- if (fd) {
- timer_start = fd->timer_start;
- if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
- fd->timed_out = TRUE;
- Jmsg(jcr, M_ERROR, 0, _(
-"Watchdog sending kill after %d secs to thread stalled reading Storage daemon.\n"),
- watchdog_time - timer_start);
- pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
- }
- }
- fd = jcr->file_bsock;
- if (fd) {
- timer_start = fd->timer_start;
- if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
- fd->timed_out = TRUE;
- Jmsg(jcr, M_ERROR, 0, _(
-"Watchdog sending kill after %d secs to thread stalled reading File daemon.\n"),
- watchdog_time - timer_start);
- pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
- }
- }
- fd = jcr->dir_bsock;
- if (fd) {
- timer_start = fd->timer_start;
- if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
- fd->timed_out = TRUE;
- Jmsg(jcr, M_ERROR, 0, _(
-"Watchdog sending kill after %d secs to thread stalled reading Director.\n"),
- watchdog_time - timer_start);
- pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
+ return false;
+}
+
+/*
+ * Unlink wd from whichever list currently holds it, without taking the
+ * watchdog mutex (unregister_watchdog() is the locking wrapper).
+ *
+ * The active queue is searched first, then the inactive list. Returns
+ * true if wd was found and unlinked, false otherwise. The watchdog
+ * itself is not freed here.
+ */
+bool unregister_watchdog_unlocked(watchdog_t *wd)
+{
+   watchdog_t *cur, *next;
+   bool found = false;
+
+   if (!wd_is_init) {
+      Emsg0(M_ABORT, 0, "BUG! unregister_watchdog_unlocked called before start_watchdog\n");
+   }
+
+   /* Active watchdogs */
+   TAILQ_FOREACH_SAFE(cur, &wd_queue, qe, next) {
+      if (cur == wd) {
+         TAILQ_REMOVE(&wd_queue, wd, qe);
+         Dmsg1(200, "Unregistered watchdog %p\n", wd);
+         found = true;
+         break;
+      }
+   }
+
+   /* Inactive watchdogs, only consulted when the active queue had no match */
+   if (!found) {
+      TAILQ_FOREACH_SAFE(cur, &wd_inactive, qe, next) {
+         if (cur == wd) {
+            TAILQ_REMOVE(&wd_inactive, wd, qe);
+            Dmsg1(200, "Unregistered inactive watchdog %p\n", wd);
+            found = true;
+            break;
+         }
+      }
+   }
+
+   if (!found) {
+      Dmsg1(200, "Failed to unregister watchdog %p\n", wd);
+   }
+
+   return found;
+}
+
+/*
+ * Locking wrapper around unregister_watchdog_unlocked(): takes the
+ * watchdog mutex, removes wd from the list holding it, releases the mutex.
+ *
+ * Calling this before start_watchdog() is reported as a BUG through
+ * Emsg0(M_ABORT, ...). Returns whatever the unlocked variant returned:
+ * true if wd was found and unlinked, false otherwise.
+ */
+bool unregister_watchdog(watchdog_t *wd)
+{
+   if (!wd_is_init) {
+      Emsg0(M_ABORT, 0, "BUG! unregister_watchdog called before start_watchdog\n");
+   }
+
+   P(mutex);
+   bool removed = unregister_watchdog_unlocked(wd);
+   V(mutex);
+
+   return removed;
+}
+
+/*
+ * Body of the watchdog thread created by start_watchdog().
+ *
+ * Each pass, with the mutex held: leave the loop if quit is set; otherwise
+ * refresh watchdog_time and call the callback of every queued watchdog
+ * whose next_fire is earlier than watchdog_time. A one-shot watchdog is
+ * then moved to wd_inactive; any other is rescheduled to
+ * watchdog_time + interval. The mutex is released and the thread sleeps
+ * in bmicrosleep(SLEEP_TIME, 0) before the next pass.
+ *
+ * Callbacks run with the mutex held. NOTE(review): a callback that calls
+ * register_watchdog() or unregister_watchdog() would try to take the same
+ * mutex -- presumably P()/V() wrap a non-recursive lock; confirm.
+ *
+ * arg is unused. Always returns NULL.
+ */
+static void *watchdog_thread(void *arg)
+{
+ Dmsg0(200, "NicB-reworked watchdog thread entered\n");
+
+ while (true) {
+ watchdog_t *p, *n;
+
+ P(mutex);
+ if (quit) {
+ V(mutex);
+ break;
+ }
+
+ watchdog_time = time(NULL);
+
+ /* The _SAFE walk saves the successor in n, so p may be unlinked below */
+ TAILQ_FOREACH_SAFE(p, &wd_queue, qe, n) {
+ /* Strictly past due only: an entry whose next_fire equals watchdog_time waits for a later pass */
+ if (p->next_fire < watchdog_time) {
+ /* Run the callback */
+ p->callback(p);
+
+ /* Reschedule (or move to inactive list if it's a one-shot timer) */
+ if (p->one_shot) {
+ TAILQ_REMOVE(&wd_queue, p, qe);
+ TAILQ_INSERT_TAIL(&wd_inactive, p, qe);
+ } else {
+ p->next_fire = watchdog_time + p->interval;
}
}
-
}
- unlock_jcr_chain();
-
- gettimeofday(&tv, &tz);
- timeout.tv_nsec = 0;
- timeout.tv_sec = tv.tv_sec + SLEEP_TIME;
-
- Dmsg1(200, "pthread_cond_timedwait sec=%d\n", timeout.tv_sec);
-#ifdef xxxxxxxxxxxxxxx_was_HAVE_CYGWIN
- /* CYGWIN dies with a page fault the second
- * time that pthread_cond_timedwait() is called
- * so fake it out.
- */
- sleep(SLEEP_TIME);
-#else
- stat = pthread_cond_timedwait(&timer, &mutex, &timeout);
- Dmsg1(200, "pthread_cond_timedwait stat=%d\n", stat);
-#endif
-
- } /* end of big for loop */
-
- Dmsg0(200, "End watchdog\n");
+ V(mutex);
+ /* Sleep with the mutex released so other threads can register/unregister watchdogs */
+ bmicrosleep(SLEEP_TIME, 0);
+ }
+
+ Dmsg0(200, "NicB-reworked watchdog thread exited\n");
+
return NULL;
}