git.sur5r.net Git - bacula/bacula/blobdiff - bacula/src/lib/watchdog.c
This commit was manufactured by cvs2svn to create tag
[bacula/bacula] / bacula / src / lib / watchdog.c
index 7d345d5fe8fd8698d27e2e51a9e1eab0bccce99f..14fda2273b566fd3b420e4bbe555b994509f5568 100755 (executable)
@@ -8,7 +8,7 @@
  *
  */
 /*
-   Copyright (C) 2000, 2001, 2002 Kern Sibbald and John Walker
+   Copyright (C) 2000-2003 Kern Sibbald and John Walker
 
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
 /* Exported globals */
 time_t watchdog_time;                /* this has granularity of SLEEP_TIME */
 
-
-#define TIMEOUT_SIGNAL SIGUSR2
 #define SLEEP_TIME 30                /* examine things every 30 seconds */
 
 /* Forward referenced functions */
-static void *watchdog_thread(void *arg);
+static void *btimer_thread(void *arg);
+static void stop_btimer(btimer_id wid);
+static btimer_id btimer_start_common(uint32_t wait);
 
 /* Static globals */
-static pthread_mutex_t mutex;
-static pthread_cond_t  timer;
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t  timer = PTHREAD_COND_INITIALIZER;
 static int quit;
+static btimer_t *timer_chain = NULL;
 
 
 /*
  * Timeout signal comes here
  */
-static void timeout_handler(int sig)
+void timeout_handler(int sig)
 {
    return;                           /* thus interrupting the function */
 }
@@ -72,17 +73,8 @@ int start_watchdog(void)
    sigfillset(&sigtimer.sa_mask);
    sigaction(TIMEOUT_SIGNAL, &sigtimer, NULL);
    watchdog_time = time(NULL);
-   if ((stat = pthread_mutex_init(&mutex, NULL)) != 0) {
-      return stat;
-   }
-   if ((stat = pthread_cond_init(&timer, NULL)) != 0) {
-      pthread_mutex_destroy(&mutex);
-      return stat;
-   }
    quit = FALSE;
-   if ((stat = pthread_create(&wdid, NULL, watchdog_thread, (void *)NULL)) != 0) {
-      pthread_mutex_destroy(&mutex);
-      pthread_cond_destroy(&timer);
+   if ((stat = pthread_create(&wdid, NULL, btimer_thread, (void *)NULL)) != 0) {
       return stat;
    }
    return 0;
@@ -98,18 +90,13 @@ int stop_watchdog(void)
 {
    int stat;
 
-   if ((stat = pthread_mutex_lock(&mutex)) != 0) {
-      return stat;
-   }
    quit = TRUE;
-
+   P(mutex);
    if ((stat = pthread_cond_signal(&timer)) != 0) {
-      pthread_mutex_unlock(&mutex);
-      return stat;
-   }
-   if ((stat = pthread_mutex_unlock(&mutex)) != 0) {
+      V(mutex);
       return stat;
    }
+   V(mutex);
    return 0;
 }
 
@@ -117,26 +104,19 @@ int stop_watchdog(void)
 /* 
  * This is the actual watchdog thread.
  */
-static void *watchdog_thread(void *arg)
+static void *btimer_thread(void *arg)
 {
-   struct timespec timeout;
-   int stat;
    JCR *jcr;
    BSOCK *fd;
+   btimer_t *wid;
 
    Dmsg0(200, "Start watchdog thread\n");
    pthread_detach(pthread_self());
 
-   if ((stat = pthread_mutex_lock(&mutex)) != 0) {
-      return NULL;
-   }
-
    for ( ;!quit; ) {
-      struct timeval tv;
-      struct timezone tz;
-      time_t timer_start;
+      time_t timer_start, now;
 
-      Dmsg0(200, "Top of for loop\n");
+      Dmsg0(200, "Top of watchdog loop\n");
 
       watchdog_time = time(NULL);     /* update timer */
 
@@ -150,54 +130,182 @@ static void *watchdog_thread(void *arg)
            continue;
         }
         fd = jcr->store_bsock;
-        timer_start = fd->timer_start;
-        if (fd && timer_start && (watchdog_time - timer_start) > fd->timeout) {
-           fd->timed_out = TRUE;
-           Jmsg(jcr, M_ERROR, 0, _(
+        if (fd) {
+           timer_start = fd->timer_start;
+           if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
+              fd->timer_start = 0;   /* turn off timer */
+              fd->timed_out = TRUE;
+              Jmsg(jcr, M_ERROR, 0, _(
 "Watchdog sending kill after %d secs to thread stalled reading Storage daemon.\n"),
-                watchdog_time - timer_start);
-           pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
+                   watchdog_time - timer_start);
+              pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
+           }
         }
         fd = jcr->file_bsock;
-        timer_start = fd->timer_start;
-        if (fd && timer_start && (watchdog_time - timer_start) > fd->timeout) {
-           fd->timed_out = TRUE;
-           Jmsg(jcr, M_ERROR, 0, _(
+        if (fd) {
+           timer_start = fd->timer_start;
+           if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
+              fd->timer_start = 0;   /* turn off timer */
+              fd->timed_out = TRUE;
+              Jmsg(jcr, M_ERROR, 0, _(
 "Watchdog sending kill after %d secs to thread stalled reading File daemon.\n"),
-                watchdog_time - timer_start);
-           pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
+                   watchdog_time - timer_start);
+              pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
+           }
         }
         fd = jcr->dir_bsock;
-        timer_start = fd->timer_start;
-        if (fd && timer_start && (watchdog_time - timer_start) > fd->timeout) {
-           fd->timed_out = TRUE;
-           Jmsg(jcr, M_ERROR, 0, _(
+        if (fd) {
+           timer_start = fd->timer_start;
+           if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
+              fd->timer_start = 0;   /* turn off timer */
+              fd->timed_out = TRUE;
+              Jmsg(jcr, M_ERROR, 0, _(
 "Watchdog sending kill after %d secs to thread stalled reading Director.\n"),
-                watchdog_time - timer_start);
-           pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
+                   watchdog_time - timer_start);
+              pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
+           }
         }
 
       }
       unlock_jcr_chain();
 
-      gettimeofday(&tv, &tz);
-      timeout.tv_nsec = 0;
-      timeout.tv_sec = tv.tv_sec + SLEEP_TIME;
+      Dmsg0(200, "Watchdog sleep.\n");
+      bmicrosleep(SLEEP_TIME, 0);
+      now = time(NULL);
 
-      Dmsg1(200, "pthread_cond_timedwait sec=%d\n", timeout.tv_sec);
-#ifdef xxxxxxxxxxxxxxx_was_HAVE_CYGWIN
-      /* CYGWIN dies with a page fault the second
-       * time that pthread_cond_timedwait() is called
-       * so fake it out.
+      /* 
+       * Now handle child and thread timers set by the code.
        */
-      sleep(SLEEP_TIME); 
-#else
-      stat = pthread_cond_timedwait(&timer, &mutex, &timeout);
-      Dmsg1(200, "pthread_cond_timedwait stat=%d\n", stat);
-#endif
-      
+      /* Walk child chain killing off any process overdue */
+      P(mutex);
+      for (wid = timer_chain; wid; wid=wid->next) {
+        int killed = FALSE;
+        /* First ask him politely to go away */
+        if (!wid->killed && now > (wid->start_time + wid->wait)) {
+//          Dmsg1(000, "Watchdog sigterm pid=%d\n", wid->pid);
+           if (wid->type == TYPE_CHILD) {
+              kill(wid->pid, SIGTERM);
+              killed = TRUE;
+           } else {
+               Dmsg1(200, "watchdog kill thread %d\n", wid->tid);
+              pthread_kill(wid->tid, TIMEOUT_SIGNAL);
+              wid->killed = TRUE;
+           }
+        }
+        /* If we asked a child to die, wait 3 seconds and slam him */
+        if (killed) {
+           btimer_t *wid1;
+           bmicrosleep(3, 0);
+           for (wid1 = timer_chain; wid1; wid1=wid1->next) {
+              if (wid->type == TYPE_CHILD &&
+                  !wid1->killed && now > (wid1->start_time + wid1->wait)) {
+                 kill(wid1->pid, SIGKILL);
+//                Dmsg1(000, "Watchdog killed pid=%d\n", wid->pid);
+                 wid1->killed = TRUE;
+              }
+           }
+        }
+      }
+      V(mutex);
    } /* end of big for loop */
 
    Dmsg0(200, "End watchdog\n");
    return NULL;
 }
+
+/*
+ * Start a timer on the child process pid; once wait seconds have
+ *   elapsed the watchdog thread signals the child.
+ *   NOTE!  Granularity is SLEEP_TIME (i.e. 30 seconds)
+ *
+ *  Returns: btimer_id (pointer to btimer_t struct) on success
+ *          NULL on failure
+ *
+ *  The returned id is released by stop_child_timer().
+ */
+btimer_id start_child_timer(pid_t pid, uint32_t wait)
+{
+   btimer_t *wid;
+
+   wid = btimer_start_common(wait);
+   if (wid == NULL) {                 /* honour the documented failure return */
+      return NULL;
+   }
+   /*
+    * NOTE(review): pid and type are stored after btimer_start_common()
+    *   has already linked the entry into the chain and dropped the mutex.
+    */
+   wid->pid = pid;
+   wid->type = TYPE_CHILD;
+   Dmsg2(200, "Start child timer 0x%x for %d secs.\n", wid, wait);
+   return wid;
+}
+
+/*
+ * Start a timer on the thread tid; once wait seconds have elapsed the
+ *   watchdog thread sends it TIMEOUT_SIGNAL.
+ *   NOTE!  Granularity is SLEEP_TIME (i.e. 30 seconds)
+ *
+ *  Returns: btimer_id (pointer to btimer_t struct) on success
+ *          NULL on failure
+ *
+ *  The returned id is released by stop_thread_timer().
+ */
+btimer_id start_thread_timer(pthread_t tid, uint32_t wait)
+{
+   btimer_t *wid;
+
+   wid = btimer_start_common(wait);
+   if (wid == NULL) {                 /* honour the documented failure return */
+      return NULL;
+   }
+   /*
+    * NOTE(review): tid and type are stored after btimer_start_common()
+    *   has already linked the entry into the chain and dropped the mutex.
+    */
+   wid->tid = tid;
+   wid->type = TYPE_PTHREAD;
+   Dmsg2(200, "Start thread timer 0x%x for %d secs.\n", wid, wait);
+   return wid;
+}
+
+/*
+ * Allocate a btimer_t, stamp it with the current time and the wait
+ *   interval, and link it at the head of timer_chain.
+ *
+ *  Returns: the new entry, or NULL if the allocation fails.
+ *   The type and the pid or tid are left for the caller to fill in.
+ */
+static btimer_id btimer_start_common(uint32_t wait)
+{
+   btimer_id wid = (btimer_id)malloc(sizeof(btimer_t));
+
+   if (wid == NULL) {                 /* out of memory: nothing to chain */
+      return NULL;
+   }
+   /* Fill in the entry before it becomes visible on the chain */
+   wid->start_time = time(NULL);
+   wid->wait = wait;
+   wid->killed = FALSE;
+   wid->prev = NULL;
+
+   P(mutex);
+   /* Chain it into timer_chain as the first item */
+   wid->next = timer_chain;
+   if (timer_chain) {
+      timer_chain->prev = wid;
+   }
+   timer_chain = wid;
+   V(mutex);
+   return wid;
+}
+
+/*
+ * Stop child timer: the entry is unlinked from the timer chain and
+ *   freed by stop_btimer().
+ *   A NULL wid is not dereferenced here; it is handed to stop_btimer(),
+ *   which reports it with M_ABORT.
+ */
+void stop_child_timer(btimer_id wid)
+{
+   if (wid) {                         /* the trace must not touch a NULL id */
+      Dmsg2(200, "Stop child timer 0x%x for %d secs.\n", wid, wid->wait);
+   }
+   stop_btimer(wid);
+}
+
+/*
+ * Stop thread timer.  A NULL wid is silently ignored; otherwise the
+ *   timer is traced and handed to stop_btimer(), which unlinks it
+ *   from the chain and frees it.
+ */
+void stop_thread_timer(btimer_id wid)
+{
+   if (wid != NULL) {
+      Dmsg2(200, "Stop thread timer 0x%x for %d secs.\n", wid, wid->wait);
+      stop_btimer(wid);
+   }
+}
+
+
+/*
+ * Stop btimer: detach wid from the doubly linked timer_chain while
+ *   holding the mutex, then release its memory.  A NULL wid is
+ *   reported through Emsg0 with M_ABORT.
+ */
+static void stop_btimer(btimer_id wid)
+{
+   if (!wid) {
+      Emsg0(M_ABORT, 0, _("NULL btimer_id.\n"));
+   }
+   P(mutex);
+   /* Point the successor, if any, back at our predecessor */
+   if (wid->next) {
+      wid->next->prev = wid->prev;
+   }
+   /* Point the predecessor, or else the chain head, at our successor */
+   if (wid->prev) {
+      wid->prev->next = wid->next;
+   } else {
+      timer_chain = wid->next;
+   }
+   V(mutex);
+   free(wid);
+}