*
*/
/*
- Copyright (C) 2000-2003 Kern Sibbald and John Walker
+ Copyright (C) 2000-2004 Kern Sibbald and John Walker
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
#include "jcr.h"
/* Exported globals */
-time_t watchdog_time; /* this has granularity of SLEEP_TIME */
+time_t watchdog_time = 0; /* this has granularity of SLEEP_TIME */
-#define SLEEP_TIME 30 /* examine things every 30 seconds */
+#define SLEEP_TIME 1 /* examine things every second */
/* Forward referenced functions */
-static void *btimer_thread(void *arg);
-static void stop_btimer(btimer_id wid);
-static btimer_id btimer_start_common(uint32_t wait);
+static void *watchdog_thread(void *arg); /* thread entry point handed to pthread_create() */
+static void wd_lock(); /* takes the watchdog write lock */
+static void wd_unlock(); /* releases the watchdog write lock */
/* Static globals */
-static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t timer = PTHREAD_COND_INITIALIZER;
-static int quit;
-static btimer_t *timer_chain = NULL;
-
-
-/*
- * Timeout signal comes here
- */
-void timeout_handler(int sig)
-{
- return; /* thus interrupting the function */
-}
+static bool quit = false; /* set by stop_watchdog() to end the thread loop */
+static bool wd_is_init = false; /* true between start_watchdog() and stop_watchdog() */
+static brwlock_t lock; /* watchdog lock */
+static pthread_t wd_tid; /* watchdog thread id, joined in stop_watchdog() */
+static dlist *wd_queue; /* registered watchdogs waiting to fire */
+static dlist *wd_inactive; /* one-shot watchdogs that have already fired */
/*
* Start watchdog thread
int start_watchdog(void)
{
int stat;
- pthread_t wdid;
- struct sigaction sigtimer;
-
- sigtimer.sa_flags = 0;
- sigtimer.sa_handler = timeout_handler;
- sigfillset(&sigtimer.sa_mask);
- sigaction(TIMEOUT_SIGNAL, &sigtimer, NULL);
+ watchdog_t *dummy = NULL; /* never dereferenced; presumably only gives dlist the link member's offset */
+ int errstat;
+
+ /* Already started: nothing to do */
+ if (wd_is_init) {
+ return 0;
+ }
+ Dmsg0(200, "Initialising NicB-hacked watchdog thread\n");
watchdog_time = time(NULL);
- quit = FALSE;
- if ((stat = pthread_create(&wdid, NULL, btimer_thread, (void *)NULL)) != 0) {
+ /*
+ * stop_watchdog() leaves quit set to true. Clear it here so that a
+ * watchdog started again after a stop does not exit immediately.
+ */
+ quit = false;
+
+ if ((errstat=rwl_init(&lock)) != 0) {
+ Emsg1(M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"),
+ strerror(errstat));
+ }
+ wd_queue = new dlist(wd_queue, &dummy->link);
+ wd_inactive = new dlist(wd_inactive, &dummy->link);
+
+ if ((stat = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) {
+ /* The thread never started: release what was set up above */
+ delete wd_queue;
+ wd_queue = NULL;
+ delete wd_inactive;
+ wd_inactive = NULL;
+ rwl_destroy(&lock);
return stat;
}
+ wd_is_init = true;
return 0;
}
int stop_watchdog(void)
{
int stat;
+ watchdog_t *p;
- quit = TRUE;
- P(mutex);
- if ((stat = pthread_cond_signal(&timer)) != 0) {
- V(mutex);
- return stat;
+ /* Never started (or already stopped): nothing to tear down */
+ if (!wd_is_init) {
+ return 0;
}
- V(mutex);
- return 0;
-}
+ quit = true; /* notify watchdog thread to stop */
+ wd_is_init = false;
-/*
- * This is the actual watchdog thread.
- */
-static void *btimer_thread(void *arg)
-{
- JCR *jcr;
- BSOCK *fd;
- btimer_t *wid;
-
- Dmsg0(200, "Start watchdog thread\n");
- pthread_detach(pthread_self());
+ /* Wait for the watchdog thread to leave its loop; the result is returned below */
+ stat = pthread_join(wd_tid, NULL);
- for ( ;!quit; ) {
- time_t timer_start, now;
+ /*
+ * Release every watchdog still on the active queue. Each entry is
+ * unlinked before it is freed and the walk is restarted from the head,
+ * so the list iterator never advances from a freed node.
+ */
+drain_queue:
+ foreach_dlist(p, wd_queue) {
+ if (p->destructor != NULL) {
+ p->destructor(p);
+ }
+ wd_queue->remove(p);
+ free(p);
+ goto drain_queue;
+ }
+ delete wd_queue;
+ wd_queue = NULL;
- Dmsg0(200, "Top of watchdog loop\n");
+ /* Same teardown for the one-shot watchdogs that have already fired */
+drain_inactive:
+ foreach_dlist(p, wd_inactive) {
+ if (p->destructor != NULL) {
+ p->destructor(p);
+ }
+ wd_inactive->remove(p);
+ free(p);
+ goto drain_inactive;
+ }
- watchdog_time = time(NULL); /* update timer */
+ delete wd_inactive;
+ wd_inactive = NULL;
+ rwl_destroy(&lock);
- /* Walk through all JCRs checking if any one is
- * blocked for more than specified max time.
- */
- lock_jcr_chain();
- for (jcr=NULL; (jcr=get_next_jcr(jcr)); ) {
- free_locked_jcr(jcr);
- if (jcr->JobId == 0) {
- continue;
- }
- fd = jcr->store_bsock;
- if (fd) {
- timer_start = fd->timer_start;
- if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
- fd->timer_start = 0; /* turn off timer */
- fd->timed_out = TRUE;
- Jmsg(jcr, M_ERROR, 0, _(
-"Watchdog sending kill after %d secs to thread stalled reading Storage daemon.\n"),
- watchdog_time - timer_start);
- pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
- }
- }
- fd = jcr->file_bsock;
- if (fd) {
- timer_start = fd->timer_start;
- if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
- fd->timer_start = 0; /* turn off timer */
- fd->timed_out = TRUE;
- Jmsg(jcr, M_ERROR, 0, _(
-"Watchdog sending kill after %d secs to thread stalled reading File daemon.\n"),
- watchdog_time - timer_start);
- pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
- }
- }
- fd = jcr->dir_bsock;
- if (fd) {
- timer_start = fd->timer_start;
- if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
- fd->timer_start = 0; /* turn off timer */
- fd->timed_out = TRUE;
- Jmsg(jcr, M_ERROR, 0, _(
-"Watchdog sending kill after %d secs to thread stalled reading Director.\n"),
- watchdog_time - timer_start);
- pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
- }
- }
+ return stat;
+}
- }
- unlock_jcr_chain();
+watchdog_t *new_watchdog(void) /* allocate a watchdog_t with default fields; returns NULL if malloc fails */
+{
+ watchdog_t *wd = (watchdog_t *)malloc(sizeof(watchdog_t));
- Dmsg0(200, "Watchdog sleep.\n");
- bmicrosleep(SLEEP_TIME, 0);
- now = time(NULL);
+ if (!wd_is_init) { /* start the watchdog on first use */
+ start_watchdog(); /* NOTE(review): return status is not checked here */
+ }
- /*
- * Now handle child and thread timers set by the code.
- */
- /* Walk child chain killing off any process overdue */
- P(mutex);
- for (wid = timer_chain; wid; wid=wid->next) {
- int killed = FALSE;
- /* First ask him politely to go away */
- if (!wid->killed && now > (wid->start_time + wid->wait)) {
-// Dmsg1(000, "Watchdog sigterm pid=%d\n", wid->pid);
- if (wid->type == TYPE_CHILD) {
- kill(wid->pid, SIGTERM);
- killed = TRUE;
- } else {
- Dmsg1(200, "watchdog kill thread %d\n", wid->tid);
- pthread_kill(wid->tid, TIMEOUT_SIGNAL);
- wid->killed = TRUE;
- }
- }
- /* If we asked a child to die, wait 3 seconds and slam him */
- if (killed) {
- btimer_t *wid1;
- bmicrosleep(3, 0);
- for (wid1 = timer_chain; wid1; wid1=wid1->next) {
- if (wid->type == TYPE_CHILD &&
- !wid1->killed && now > (wid1->start_time + wid1->wait)) {
- kill(wid1->pid, SIGKILL);
-// Dmsg1(000, "Watchdog killed pid=%d\n", wid->pid);
- wid1->killed = TRUE;
- }
- }
- }
- }
- V(mutex);
- } /* end of big for loop */
+ if (wd == NULL) { /* allocation failed */
+ return NULL;
+ }
+ wd->one_shot = true; /* default: fire once */
+ wd->interval = 0; /* must be set non-zero before register_watchdog() */
+ wd->callback = NULL; /* must be set before register_watchdog() */
+ wd->destructor = NULL; /* optional; stop_watchdog() calls it when non-NULL */
+ wd->data = NULL; /* not read anywhere in this file; presumably caller-owned -- confirm */
- Dmsg0(200, "End watchdog\n");
- return NULL;
+ return wd; /* not yet on any list */
}
-/*
- * Start a timer on a child process of pid, kill it after wait seconds.
- * NOTE! Granularity is SLEEP_TIME (i.e. 30 seconds)
- *
- * Returns: btimer_id (pointer to btimer_t struct) on success
- * NULL on failure
- */
-btimer_id start_child_timer(pid_t pid, uint32_t wait)
+bool register_watchdog(watchdog_t *wd) /* schedule wd and append it to the active queue */
{
- btimer_t *wid;
- wid = btimer_start_common(wait);
- wid->pid = pid;
- wid->type = TYPE_CHILD;
- Dmsg2(200, "Start child timer 0x%x for %d secs.\n", wid, wait);
- return wid;
+ if (!wd_is_init) { /* sanity checks: each failure is reported with M_ABORT */
+ Emsg0(M_ABORT, 0, "BUG! register_watchdog called before start_watchdog\n");
+ }
+ if (wd->callback == NULL) {
+ Emsg1(M_ABORT, 0, "BUG! Watchdog %p has NULL callback\n", wd);
+ }
+ if (wd->interval == 0) {
+ Emsg1(M_ABORT, 0, "BUG! Watchdog %p has zero interval\n", wd);
+ }
+
+ wd_lock();
+ wd->next_fire = watchdog_time + wd->interval; /* based on the last sampled watchdog_time, not time(NULL) */
+ wd_queue->append(wd);
+ Dmsg3(200, "Registered watchdog %p, interval %d%s\n",
+ wd, wd->interval, wd->one_shot ? " one shot" : "");
+ wd_unlock();
+
+ return false; /* NOTE(review): false is returned even though the append succeeded -- confirm what callers expect */
}
-/*
- * Start a timer on a thread. kill it after wait seconds.
- * NOTE! Granularity is SLEEP_TIME (i.e. 30 seconds)
- *
- * Returns: btimer_id (pointer to btimer_t struct) on success
- * NULL on failure
- */
-btimer_id start_thread_timer(pthread_t tid, uint32_t wait)
+bool unregister_watchdog_unlocked(watchdog_t *wd) /* unlink wd from whichever list holds it; takes no lock itself */
{
- btimer_t *wid;
- wid = btimer_start_common(wait);
- wid->tid = tid;
- wid->type = TYPE_PTHREAD;
- Dmsg2(200, "Start thread timer 0x%x for %d secs.\n", wid, wait);
- return wid;
+ watchdog_t *p;
+
+ if (!wd_is_init) {
+ Emsg0(M_ABORT, 0, "BUG! unregister_watchdog_unlocked called before start_watchdog\n");
+ }
+
+ foreach_dlist(p, wd_queue) { /* first look on the active queue */
+ if (wd == p) {
+ wd_queue->remove(wd); /* unlinked only; wd is not freed here */
+ Dmsg1(200, "Unregistered watchdog %p\n", wd);
+ return true; /* returns at once, so the walk does not continue past the removed node */
+ }
+ }
+
+ foreach_dlist(p, wd_inactive) { /* then among the fired one-shot watchdogs */
+ if (wd == p) {
+ wd_inactive->remove(wd);
+ Dmsg1(200, "Unregistered inactive watchdog %p\n", wd);
+ return true;
+ }
+ }
+
+ Dmsg1(200, "Failed to unregister watchdog %p\n", wd);
+ return false; /* wd was on neither list */
}
-static btimer_id btimer_start_common(uint32_t wait)
+bool unregister_watchdog(watchdog_t *wd) /* locked wrapper around unregister_watchdog_unlocked() */
{
- btimer_id wid = (btimer_id)malloc(sizeof(btimer_t));
-
- P(mutex);
- /* Chain it into timer_chain as the first item */
- wid->prev = NULL;
- wid->next = timer_chain;
- if (timer_chain) {
- timer_chain->prev = wid;
+ bool ret; /* result of the unlocked helper */
+
+ if (!wd_is_init) {
+ Emsg0(M_ABORT, 0, "BUG! unregister_watchdog called before start_watchdog\n");
}
- timer_chain = wid;
- wid->start_time = time(NULL);
- wid->wait = wait;
- wid->killed = FALSE;
- V(mutex);
- return wid;
+
+ wd_lock(); /* hold the watchdog lock while the lists are searched and modified */
+ ret = unregister_watchdog_unlocked(wd);
+ wd_unlock();
+
+ return ret; /* true if wd was found and unlinked, false otherwise */
}
-/*
- * Stop child timer
- */
-void stop_child_timer(btimer_id wid)
+static void *watchdog_thread(void *arg)
{
- Dmsg2(200, "Stop child timer 0x%x for %d secs.\n", wid, wid->wait);
- stop_btimer(wid);
+ Dmsg0(200, "NicB-reworked watchdog thread entered\n");
+
+ /* One pass per SLEEP_TIME seconds until stop_watchdog() sets quit */
+ while (!quit) {
+ watchdog_t *p;
+
+ /*
+ * We lock the jcr chain here because a good number of the
+ * callback routines lock the jcr chain. We need to lock
+ * it here *before* the watchdog lock because the SD message
+ * thread first locks the jcr chain, then when closing the
+ * job locks the watchdog chain. If the two threads do not
+ * lock in the same order, we get a deadlock -- each holds
+ * the other's needed lock.
+ */
+ lock_jcr_chain();
+ wd_lock();
+ watchdog_time = time(NULL);
+
+ /*
+ * Fire every watchdog whose time has passed. Moving a one-shot
+ * entry to wd_inactive relinks it into another list, so the walk
+ * of wd_queue is restarted from the head after each such move.
+ * Entries that already fired in this pass are not fired again:
+ * periodic ones have had next_fire pushed past watchdog_time and
+ * one-shot ones are no longer on wd_queue.
+ */
+walk_list:
+ foreach_dlist(p, wd_queue) {
+ if (p->next_fire < watchdog_time) {
+ /* Run the callback */
+ p->callback(p);
+
+ /* Reschedule (or move to inactive list if it's a one-shot timer) */
+ if (p->one_shot) {
+ wd_queue->remove(p);
+ wd_inactive->append(p);
+ goto walk_list;
+ } else {
+ p->next_fire = watchdog_time + p->interval;
+ }
+ }
+ }
+ wd_unlock();
+ unlock_jcr_chain();
+ bmicrosleep(SLEEP_TIME, 0);
+ }
+
+ Dmsg0(200, "NicB-reworked watchdog thread exited\n");
+ return NULL;
}
/*
- * Stop thread timer
+ * Watchdog lock, this can be called multiple times by the same
+ * thread without blocking, but must be unlocked the number of
+ * times it was locked.
*/
-void stop_thread_timer(btimer_id wid)
+static void wd_lock() /* take the watchdog lock in write mode */
{
- if (!wid) {
- return;
+ int errstat; /* status returned by rwl_writelock() */
+ if ((errstat=rwl_writelock(&lock)) != 0) { /* non-zero status is reported with M_ABORT */
+ Emsg1(M_ABORT, 0, "rwl_writelock failure. ERR=%s\n",
+ strerror(errstat));
}
- Dmsg2(200, "Stop thread timer 0x%x for %d secs.\n", wid, wid->wait);
- stop_btimer(wid);
-}
-
+}
/*
- * Stop btimer
+ * Unlock the watchdog. This can be called multiple times by the
+ * same thread up to the number of times that thread called
+ * wd_lock().
*/
-static void stop_btimer(btimer_id wid)
+static void wd_unlock() /* release one write hold on the watchdog lock */
{
- if (wid == NULL) {
- Emsg0(M_ABORT, 0, _("NULL btimer_id.\n"));
- }
- P(mutex);
- /* Remove wid from timer_chain */
- if (!wid->prev) { /* if no prev */
- timer_chain = wid->next; /* set new head */
- } else {
- wid->prev->next = wid->next; /* update prev */
+ int errstat; /* status returned by rwl_writeunlock() */
+ if ((errstat=rwl_writeunlock(&lock)) != 0) { /* non-zero status is reported with M_ABORT */
+ Emsg1(M_ABORT, 0, "rwl_writeunlock failure. ERR=%s\n",
+ strerror(errstat));
}
- if (wid->next) {
- wid->next->prev = wid->prev; /* unlink it */
- }
- V(mutex);
- free(wid);
-}
+}