/*
- * Bacula thread watchdog routine. General routine that monitors
- * the daemon and signals a thread if it is blocked on a BSOCK
- * too long. This prevents catastropic long waits -- generally
- * due to Windows "hanging" the app.
- *
- * Kern Sibbald, January MMII
- *
- */
-/*
- Copyright (C) 2000, 2001, 2002 Kern Sibbald and John Walker
+ Bacula® - The Network Backup Solution
+
+ Copyright (C) 2002-2011 Free Software Foundation Europe e.V.
- This program is free software; you can redistribute it and/or
- modify it under the terms of the GNU General Public License as
- published by the Free Software Foundation; either version 2 of
- the License, or (at your option) any later version.
+ The main author of Bacula is Kern Sibbald, with contributions from
+ many others, a complete list can be found in the file AUTHORS.
+ This program is Free Software; you can redistribute it and/or
+ modify it under the terms of version three of the GNU Affero General Public
+ License as published by the Free Software Foundation and included
+ in the file LICENSE.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
- MA 02111-1307, USA.
+ You should have received a copy of the GNU Affero General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+ Bacula® is a registered trademark of Kern Sibbald.
+ The licensor of Bacula is the Free Software Foundation Europe
+ (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
+ Switzerland, email:ftf@fsfeurope.org.
+*/
+/*
+ * Bacula thread watchdog routine. General routine that
+ * allows setting a watchdog timer with a callback that is
+ * called when the timer goes off.
+ *
+ * Kern Sibbald, January MMII
+ *
*/
#include "bacula.h"
#include "jcr.h"
/* Exported globals */
-time_t watchdog_time; /* this has granularity of SLEEP_TIME */
-
+utime_t watchdog_time = 0;          /* refreshed on every watchdog pass, so it has the granularity of watchdog_sleep_time */
+utime_t watchdog_sleep_time = 60;   /* examine things every 60 seconds */
-#define TIMEOUT_SIGNAL SIGUSR2
-#define SLEEP_TIME 30 /* examine things every 30 seconds */
+/* Locals */
+static pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER;   /* held around waits/signals on timer */
+static pthread_cond_t timer = PTHREAD_COND_INITIALIZER;           /* signalled to wake the watchdog thread */
/* Forward referenced functions */
-static void *watchdog_thread(void *arg);
+extern "C" void *watchdog_thread(void *arg);
+
+static void wd_lock();
+static void wd_unlock();
/* Static globals */
-static pthread_mutex_t mutex;
-static pthread_cond_t timer;
-static int quit;
+static bool quit = false;           /* set by stop_watchdog() to end the thread loop */
+static bool wd_is_init = false;     /* set by start_watchdog(), cleared by stop_watchdog() */
+static brwlock_t lock;              /* watchdog lock */
+static pthread_t wd_tid;            /* thread id filled in by pthread_create() */
+static dlist *wd_queue;             /* registered (active) watchdogs */
+static dlist *wd_inactive;          /* one-shot watchdogs that have already fired */
-/*
- * Timeout signal comes here
+/*
+ * Tell whether the calling thread is the watchdog thread.
+ *
+ * Returns: true  if the watchdog is initialized and the caller's
+ *                thread id equals the watchdog thread id
+ *          false otherwise
*/
-static void timeout_handler(int sig)
+bool is_watchdog()
{
- return; /* thus interrupting the function */
+   return wd_is_init && pthread_equal(pthread_self(), wd_tid);
}
-
-/*
- * Initialize watchdog thread
+/*
+ * Start watchdog thread
+ *
+ * Initializes the watchdog lock and the active/inactive queues and
+ * then creates the thread that runs watchdog_thread().  Calling it
+ * while the watchdog is already initialized does nothing.
+ *
+ * If the thread cannot be created, everything set up here is torn
+ * down again so that the watchdog is left uninitialized.
*
* Returns: 0 on success
- * errno on failure
+ *          errno on failure (the pthread_create() status)
*/
-int init_watchdog(void)
+int start_watchdog(void)
{
int stat;
- pthread_t wdid;
- struct sigaction sigtimer;
-
- sigtimer.sa_flags = 0;
- sigtimer.sa_handler = timeout_handler;
- sigfillset(&sigtimer.sa_mask);
- sigaction(TIMEOUT_SIGNAL, &sigtimer, NULL);
- watchdog_time = time(NULL);
- if ((stat = pthread_mutex_init(&mutex, NULL)) != 0) {
- return stat;
+   watchdog_t *dummy = NULL;         /* only used to describe the link member to dlist */
+   int errstat;
+
+   if (wd_is_init) {
+      return 0;
}
- if ((stat = pthread_cond_init(&timer, NULL)) != 0) {
- pthread_mutex_destroy(&mutex);
- return stat;
+   Dmsg0(800, "Initialising NicB-hacked watchdog thread\n");
+   watchdog_time = time(NULL);
+
+   if ((errstat=rwl_init(&lock)) != 0) {
+      berrno be;
+      Jmsg1(NULL, M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"),
+            be.bstrerror(errstat));
}
- quit = FALSE;
- if ((stat = pthread_create(&wdid, NULL, watchdog_thread, (void *)NULL)) != 0) {
- pthread_mutex_destroy(&mutex);
- pthread_cond_destroy(&timer);
+   wd_queue = New(dlist(dummy, &dummy->link));
+   wd_inactive = New(dlist(dummy, &dummy->link));
+   /*
+    * stop_watchdog() leaves quit set; clear it so that a watchdog
+    * started again after a stop does not exit immediately.
+    */
+   quit = false;
+   wd_is_init = true;
+
+   if ((stat = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) {
+      /*
+       * No thread was created: undo the setup so that wd_is_init does
+       * not claim a running watchdog (stop_watchdog() would otherwise
+       * join an invalid thread id) and nothing is leaked.
+       */
+      delete wd_queue;
+      wd_queue = NULL;
+      delete wd_inactive;
+      wd_inactive = NULL;
+      rwl_destroy(&lock);
+      wd_is_init = false;
return stat;
}
return 0;
}
+/*
+ * Wake watchdog timer thread so that it walks the
+ * queue and adjusts its wait time (or exits).
+ *
+ * Signals the timer condition while holding timer_mutex, the same
+ * mutex the watchdog thread passes to pthread_cond_timedwait(), and
+ * then calls bmicrosleep(0, 100).
+ * NOTE(review): the short sleep presumably gives the watchdog thread
+ * a chance to run before the caller continues -- confirm.
+ */
+static void ping_watchdog()
+{
+ P(timer_mutex);
+ pthread_cond_signal(&timer);
+ V(timer_mutex);
+ bmicrosleep(0, 100);
+}
+
/*
* Terminate the watchdog thread
*
* Returns: 0 on success
- * errno on failure
+ *          errno on failure (the status returned by pthread_join())
*/
-int term_watchdog(void)
+int stop_watchdog(void)
{
int stat;
+   watchdog_t *wd;
- if ((stat = pthread_mutex_lock(&mutex)) != 0) {
- return stat;
+   if (!wd_is_init) {
+      return 0;
}
- quit = TRUE;
- if ((stat = pthread_cond_signal(&timer)) != 0) {
- pthread_mutex_unlock(&mutex);
- return stat;
+   /* Ask the watchdog thread to stop, wake it, and wait for it */
+   quit = true;
+   ping_watchdog();
+
+   stat = pthread_join(wd_tid, NULL);
+
+   /* Release every watchdog still on the active queue */
+   while (!wd_queue->empty()) {
+      wd = (watchdog_t *)wd_queue->first();
+      wd_queue->remove(wd);
+      if (wd->destructor) {
+         wd->destructor(wd);
+      }
+      free(wd);
}
- if ((stat = pthread_mutex_unlock(&mutex)) != 0) {
- return stat;
+   delete wd_queue;
+   wd_queue = NULL;
+
+   /* Same for the one-shot watchdogs parked on the inactive queue */
+   while (!wd_inactive->empty()) {
+      wd = (watchdog_t *)wd_inactive->first();
+      wd_inactive->remove(wd);
+      if (wd->destructor) {
+         wd->destructor(wd);
+      }
+      free(wd);
}
- return 0;
-}
+   delete wd_inactive;
+   wd_inactive = NULL;
+   rwl_destroy(&lock);
+   wd_is_init = false;
+   return stat;
+}
-/*
- * This is the actual watchdog thread.
- */
-static void *watchdog_thread(void *arg)
+/*
+ * Allocate a watchdog_t with malloc() and give it default settings:
+ * one shot, zero interval, no callback, no destructor and no data.
+ * Also calls start_watchdog() if the watchdog is not initialized yet.
+ *
+ * Returns: the new watchdog
+ *          NULL if the allocation failed
+ */
+watchdog_t *new_watchdog(void)
{
- struct timespec timeout;
- int stat;
- JCR *jcr;
- BSOCK *fd;
+   watchdog_t *wd = (watchdog_t *)malloc(sizeof(*wd));
- Dmsg0(200, "Start watchdog thread\n");
- pthread_detach(pthread_self());
+   if (!wd_is_init) {
+      start_watchdog();
+   }
- if ((stat = pthread_mutex_lock(&mutex)) != 0) {
+   if (!wd) {
return NULL;
}
+   wd->data = NULL;
+   wd->destructor = NULL;
+   wd->callback = NULL;
+   wd->interval = 0;
+   wd->one_shot = true;
- for ( ;!quit; ) {
- struct timeval tv;
- struct timezone tz;
+   return wd;
+}
- Dmsg0(200, "Top of for loop\n");
+/*
+ * Put a watchdog on the active queue and wake the watchdog thread.
+ *
+ * The first firing time is watchdog_time + wd->interval.  An M_ABORT
+ * message is raised if the watchdog has not been started, or if wd
+ * has no callback or a zero interval.
+ *
+ * Returns: true once the watchdog has been queued
+ */
+bool register_watchdog(watchdog_t *wd)
+{
+   if (!wd_is_init) {
+      Jmsg0(NULL, M_ABORT, 0, _("BUG! register_watchdog called before start_watchdog\n"));
+   }
+   if (wd->callback == NULL) {
+      Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has NULL callback\n"), wd);
+   }
+   if (wd->interval == 0) {
+      Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has zero interval\n"), wd);
+   }
+
+   wd_lock();
+   wd->next_fire = watchdog_time + wd->interval;
+   wd_queue->append(wd);
+   Dmsg3(800, "Registered watchdog %p, interval %d%s\n",
+         wd, wd->interval, wd->one_shot ? " one shot" : "");
+   wd_unlock();
+   /* Let the watchdog thread recompute its wait time */
+   ping_watchdog();
+
+   /*
+    * Reaching this point means the watchdog was queued; report success
+    * the same way unregister_watchdog() does (this used to return false
+    * unconditionally).
+    */
+   return true;
+}
+
+/*
+ * Remove a watchdog from whichever queue (active or inactive) holds
+ * it, then wake the watchdog thread.  The watchdog itself is not
+ * freed here.
+ *
+ * Returns: true  if wd was found on one of the queues and removed
+ *          false if it was on neither queue
+ */
+bool unregister_watchdog(watchdog_t *wd)
+{
+   watchdog_t *p;
+   bool ok = false;
- watchdog_time = time(NULL); /* update timer */
+   if (!wd_is_init) {
+      Jmsg0(NULL, M_ABORT, 0, _("BUG! unregister_watchdog called before start_watchdog\n"));
+   }
- /* Walk through all JCRs checking if any one is
- * blocked for more than specified max time.
+   wd_lock();
+   foreach_dlist(p, wd_queue) {
+      if (wd == p) {
+         wd_queue->remove(wd);
+         Dmsg1(800, "Unregistered watchdog %p\n", wd);
+         ok = true;
+         goto get_out;
+      }
+   }
+
+   foreach_dlist(p, wd_inactive) {
+      if (wd == p) {
+         wd_inactive->remove(wd);
+         Dmsg1(800, "Unregistered inactive watchdog %p\n", wd);
+         ok = true;
+         goto get_out;
+      }
+   }
+
+   Dmsg1(800, "Failed to unregister watchdog %p\n", wd);
+
+get_out:
+   wd_unlock();
+   ping_watchdog();
+   return ok;
+}
+
+/*
+ * This is the thread that walks the watchdog queue
+ * and when a queue item fires, the callback is
+ * invoked. If it is a one shot, the queue item
+ * is moved to the inactive queue.
+ *
+ * The loop runs until quit is set.  After each pass over the queue
+ * the thread waits on the timer condition until the earliest
+ * next_fire time (at most watchdog_sleep_time seconds ahead) or
+ * until the condition is signalled.
+ *
+ * Returns: NULL
+ */
+extern "C" void *watchdog_thread(void *arg)
+{
+   struct timespec timeout;
+   struct timeval tv;
+   struct timezone tz;
+   utime_t next_time;
+
+   set_jcr_in_tsd(INVALID_JCR);
+   Dmsg0(800, "NicB-reworked watchdog thread entered\n");
+
+   while (!quit) {
+      watchdog_t *p;
+
+      /*
+       *
+       * NOTE. lock_jcr_chain removed, but the message below
+       * was left until we are sure there are no deadlocks.
+       *
+       * We lock the jcr chain here because a good number of the
+       * callback routines lock the jcr chain. We need to lock
+       * it here *before* the watchdog lock because the SD message
+       * thread first locks the jcr chain, then when closing the
+       * job locks the watchdog chain. If the two threads do not
+       * lock in the same order, we get a deadlock -- each holds
+       * the other's needed lock.
*/
- lock_jcr_chain();
- for (jcr=NULL; (jcr=get_next_jcr(jcr)); ) {
- free_locked_jcr(jcr);
- if (jcr->JobId == 0) {
- continue;
- }
- fd = jcr->store_bsock;
- if (fd && fd->timer_start && (watchdog_time - fd->timer_start) > fd->timeout) {
- fd->timed_out = TRUE;
- Jmsg(jcr, M_ERROR, 0, "Watchdog sending kill to thread stalled reading Storage daemon.\n");
- pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
- }
- fd = jcr->file_bsock;
- if (fd && fd->timer_start && (watchdog_time - fd->timer_start) > fd->timeout) {
- fd->timed_out = TRUE;
- Jmsg(jcr, M_ERROR, 0, "Watchdog sending kill to thread stalled reading File daemon.\n");
- pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
- }
- fd = jcr->dir_bsock;
- if (fd && fd->timer_start && (watchdog_time - fd->timer_start) > fd->timeout) {
- fd->timed_out = TRUE;
- Jmsg(jcr, M_ERROR, 0, "Watchdog sending kill to thread stalled reading Director.\n");
- pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
- }
+      wd_lock();
+walk_list:
+      watchdog_time = time(NULL);
+      next_time = watchdog_time + watchdog_sleep_time;
+      foreach_dlist(p, wd_queue) {
+         if (p->next_fire <= watchdog_time) {
+            /* Run the callback */
+            Dmsg2(3400, "Watchdog callback p=0x%p fire=%d\n", p, p->next_fire);
+            p->callback(p);
+
+            /* Reschedule (or move to inactive list if it's a one-shot timer) */
+            if (p->one_shot) {
+               wd_queue->remove(p);
+               wd_inactive->append(p);
+               /* The queue changed under the iteration: start the walk over */
+               goto walk_list;
+            } else {
+               p->next_fire = watchdog_time + p->interval;
+            }
+         }
+         /* Track the earliest pending firing time for the wait below */
+         if (p->next_fire <= next_time) {
+            next_time = p->next_fire;
+         }
}
- unlock_jcr_chain();
+      wd_unlock();
- gettimeofday(&tv, &tz);
- timeout.tv_nsec = 0;
- timeout.tv_sec = tv.tv_sec + SLEEP_TIME;
-
- Dmsg1(200, "pthread_cond_timedwait sec=%d\n", timeout.tv_sec);
-#ifdef xxxxxxxxxxxxxxx_was_HAVE_CYGWIN
- /* CYGWIN dies with a page fault the second
- * time that pthread_cond_timedwait() is called
- * so fake it out.
+      /*
+       * Wait sleep time or until someone wakes us
*/
- sleep(SLEEP_TIME);
-#else
- stat = pthread_cond_timedwait(&timer, &mutex, &timeout);
- Dmsg1(200, "pthread_cond_timedwait stat=%d\n", stat);
-#endif
-
- } /* end of big for loop */
-
- Dmsg0(200, "End watchdog\n");
+      gettimeofday(&tv, &tz);
+      timeout.tv_nsec = tv.tv_usec * 1000;
+      timeout.tv_sec = tv.tv_sec + next_time - time(NULL);
+      while (timeout.tv_nsec >= 1000000000) {
+         timeout.tv_nsec -= 1000000000;
+         timeout.tv_sec++;
+      }
+
+      Dmsg1(1900, "pthread_cond_timedwait %d\n", timeout.tv_sec - tv.tv_sec);
+      P(timer_mutex);
+      /*
+       * Re-check quit now that timer_mutex is held.  quit is set before
+       * the stopping thread takes timer_mutex to signal, so a stop
+       * request issued after the loop test above is either seen here or
+       * its signal arrives while we are waiting.  Without this test the
+       * signal could be missed and the thread would sleep for the whole
+       * timeout before exiting.
+       */
+      if (!quit) {
+         /* Note, this unlocks mutex during the sleep */
+         pthread_cond_timedwait(&timer, &timer_mutex, &timeout);
+      }
+      V(timer_mutex);
+   }
+
+   Dmsg0(800, "NicB-reworked watchdog thread exited\n");
return NULL;
}
+
+/*
+ * Take the watchdog lock (a write lock on "lock").  The same thread
+ * may take it several times without blocking, but has to release it
+ * once for every time it was taken.  A failure to lock raises an
+ * M_ABORT message.
+ */
+static void wd_lock()
+{
+   int errstat = rwl_writelock(&lock);
+
+   if (errstat == 0) {
+      return;
+   }
+   berrno be;
+   Jmsg1(NULL, M_ABORT, 0, _("rwl_writelock failure. ERR=%s\n"),
+         be.bstrerror(errstat));
+}
+
+/*
+ * Release the watchdog lock.  A thread may call this as many times
+ * as it called wd_lock().  A failure to unlock raises an M_ABORT
+ * message.
+ */
+static void wd_unlock()
+{
+   int errstat = rwl_writeunlock(&lock);
+
+   if (errstat == 0) {
+      return;
+   }
+   berrno be;
+   Jmsg1(NULL, M_ABORT, 0, _("rwl_writeunlock failure. ERR=%s\n"),
+         be.bstrerror(errstat));
+}