2 * Bacula thread watchdog routine. General routine that monitors
3 * the daemon and signals a thread if it is blocked on a BSOCK
4 * too long. This prevents catastrophic long waits -- generally
5 * due to Windows "hanging" the app.
7 * Kern Sibbald, January MMII
11 Copyright (C) 2000, 2001, 2002 Kern Sibbald and John Walker
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of
16 the License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public
24 License along with this program; if not, write to the Free
25 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
33 /* Exported globals */
/*
 * watchdog_time is stamped with time(NULL) once by start_watchdog() and
 * again by the watchdog thread each time it updates its timer, so other
 * code reading it sees a clock that only advances when the watchdog runs.
 */
34 time_t watchdog_time; /* this has granularity of SLEEP_TIME */
/*
 * TIMEOUT_SIGNAL is the signal start_watchdog() installs timeout_handler()
 * for, and the signal the watchdog thread sends with pthread_kill() to a
 * job thread whose socket timer has run past its timeout.
 * SLEEP_TIME is added to the current time (in seconds) to form the
 * deadline passed to pthread_cond_timedwait() in the watchdog thread.
 */
37 #define TIMEOUT_SIGNAL SIGUSR2
38 #define SLEEP_TIME 30 /* examine things every 30 seconds */
40 /* Forward referenced functions */
41 static void *watchdog_thread(void *arg);
/*
 * mutex and timer form the mutex/condition-variable pair used by this
 * file: start_watchdog() initialises both, the watchdog thread locks mutex
 * and waits on timer with a deadline, and stop_watchdog() locks mutex and
 * signals timer.
 */
44 static pthread_mutex_t mutex;
45 static pthread_cond_t timer;
50 * Timeout signal comes here
/*
 * Signal handler that start_watchdog() registers for TIMEOUT_SIGNAL.
 *
 * sig: the delivered signal number; it is not used.
 *
 * The handler performs no work and simply returns.  start_watchdog()
 * registers it with sa_flags = 0, i.e. without SA_RESTART.
 * NOTE(review): presumably the point is that a blocking call in the
 * signalled thread is interrupted (fails with EINTR) rather than being
 * restarted -- confirm against the socket read code.
 */
52 static void timeout_handler(int sig)
54 return; /* thus interrupting the function */
59 * Start watchdog thread
61 * Returns: 0 on success
/*
 * Sets up everything the watchdog needs and launches it:
 *   1. installs timeout_handler() as the action for TIMEOUT_SIGNAL,
 *   2. stamps watchdog_time with the current time,
 *   3. initialises mutex and the timer condition variable,
 *   4. creates the thread running watchdog_thread() with a NULL argument.
 *
 * When a later step fails, the objects initialised by the earlier steps
 * are destroyed again (mutex after a failed cond init; mutex and timer
 * after a failed thread creation).
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the failing pthread status held in stat is returned --
 * confirm against the full file.
 * NOTE(review): the result of sigaction() is not checked.
 */
64 int start_watchdog(void)
68 struct sigaction sigtimer;
/* No SA_* flags are requested; sigfillset() blocks every signal while the handler runs. */
70 sigtimer.sa_flags = 0;
71 sigtimer.sa_handler = timeout_handler;
72 sigfillset(&sigtimer.sa_mask);
73 sigaction(TIMEOUT_SIGNAL, &sigtimer, NULL);
/* Give watchdog_time a valid value before the thread has made its first pass. */
74 watchdog_time = time(NULL);
75 if ((stat = pthread_mutex_init(&mutex, NULL)) != 0) {
78 if ((stat = pthread_cond_init(&timer, NULL)) != 0) {
79 pthread_mutex_destroy(&mutex);
/* Default thread attributes; the new thread detaches itself in watchdog_thread(). */
83 if ((stat = pthread_create(&wdid, NULL, watchdog_thread, (void *)NULL)) != 0) {
84 pthread_mutex_destroy(&mutex);
85 pthread_cond_destroy(&timer);
92 * Terminate the watchdog thread
94 * Returns: 0 on success
/*
 * Locks mutex, signals the timer condition variable, and unlocks mutex.
 * The signal wakes a thread blocked in pthread_cond_timedwait() on timer,
 * which is where watchdog_thread() sleeps between passes.  If the signal
 * call fails, mutex is still released before leaving that branch.
 *
 * NOTE(review): this excerpt does not show how the watchdog thread is told
 * to exit rather than start another pass, nor the return statements;
 * presumably a quit flag is set while mutex is held and stat is returned
 * on failure -- confirm against the full file.
 */
97 int stop_watchdog(void)
101 if ((stat = pthread_mutex_lock(&mutex)) != 0) {
106 if ((stat = pthread_cond_signal(&timer)) != 0) {
107 pthread_mutex_unlock(&mutex);
110 if ((stat = pthread_mutex_unlock(&mutex)) != 0) {
118 * This is the actual watchdog thread.
/*
 * Thread body created by start_watchdog(), which passes NULL as arg; arg
 * is not used in the code visible here.
 *
 * The thread detaches itself, takes mutex, and then repeats these steps:
 *   - refresh watchdog_time with time(NULL);
 *   - walk the job control records with get_next_jcr() and, for each job,
 *     examine jcr->store_bsock, jcr->file_bsock and one further socket
 *     whose message text names the Director.  For each socket, when
 *     fd->timer_start is non-zero and more than fd->timeout seconds have
 *     passed since then, it sets fd->timed_out, logs an M_ERROR job
 *     message, and sends TIMEOUT_SIGNAL to jcr->my_thread_id;
 *   - build a deadline of now + SLEEP_TIME seconds and sleep in
 *     pthread_cond_timedwait() on timer, which stop_watchdog() signals.
 * After the loop it unlocks mutex.
 *
 * NOTE(review): free_locked_jcr(jcr) is called before the jcr fields are
 * read in the same pass; presumably it only drops the reference taken by
 * get_next_jcr() and the record stays valid -- confirm.
 * NOTE(review): the action taken when jcr->JobId == 0 is not visible in
 * this excerpt (presumably the record is skipped), and neither is the
 * assignment of fd for the third (Director) check, the loop header with
 * its exit condition, or the setting of timeout.tv_nsec.
 * NOTE(review): "%d" is used for (watchdog_time - timer_start) and for
 * timeout.tv_sec; watchdog_time is a time_t, so where time_t is wider
 * than int the argument does not match the conversion, assuming Jmsg and
 * Dmsg1 are printf-style -- confirm and cast or change the format.
 */
120 static void *watchdog_thread(void *arg)
122 struct timespec timeout;
127 Dmsg0(200, "Start watchdog thread\n");
/* Detach so the thread's resources are reclaimed at exit without a join. */
128 pthread_detach(pthread_self());
/* mutex is held across each scan; pthread_cond_timedwait() releases it while sleeping. */
130 if ((stat = pthread_mutex_lock(&mutex)) != 0) {
139 Dmsg0(200, "Top of for loop\n");
141 watchdog_time = time(NULL); /* update timer */
143 /* Walk through all JCRs checking if any one is
144 * blocked for more than specified max time.
147 for (jcr=NULL; (jcr=get_next_jcr(jcr)); ) {
148 free_locked_jcr(jcr);
149 if (jcr->JobId == 0) {
152 fd = jcr->store_bsock;
154 timer_start = fd->timer_start;
155 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
156 fd->timed_out = TRUE;
157 Jmsg(jcr, M_ERROR, 0, _(
158 "Watchdog sending kill after %d secs to thread stalled reading Storage daemon.\n"),
159 watchdog_time - timer_start);
160 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
163 fd = jcr->file_bsock;
165 timer_start = fd->timer_start;
166 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
167 fd->timed_out = TRUE;
168 Jmsg(jcr, M_ERROR, 0, _(
169 "Watchdog sending kill after %d secs to thread stalled reading File daemon.\n"),
170 watchdog_time - timer_start);
171 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
176 timer_start = fd->timer_start;
177 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
178 fd->timed_out = TRUE;
179 Jmsg(jcr, M_ERROR, 0, _(
180 "Watchdog sending kill after %d secs to thread stalled reading Director.\n"),
181 watchdog_time - timer_start);
182 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
189 gettimeofday(&tv, &tz);
191 timeout.tv_sec = tv.tv_sec + SLEEP_TIME;
193 Dmsg1(200, "pthread_cond_timedwait sec=%d\n", timeout.tv_sec);
194 stat = pthread_cond_timedwait(&timer, &mutex, &timeout);
195 Dmsg1(200, "pthread_cond_timedwait stat=%d\n", stat);
197 } /* end of big for loop */
199 pthread_mutex_unlock(&mutex); /* for good form */
200 Dmsg0(200, "End watchdog\n");