2 * Bacula thread watchdog routine. General routine that monitors
3 * the daemon and signals a thread if it is blocked on a BSOCK
4 * too long. This prevents catastrophic long waits -- generally
5 * due to Windows "hanging" the app.
7 * Kern Sibbald, January MMII
11 Copyright (C) 2000-2003 Kern Sibbald and John Walker
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of
16 the License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public
24 License along with this program; if not, write to the Free
25 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
33 /* Exported globals */
/* Wall-clock time sampled by the watchdog: set in start_watchdog() and
 * refreshed at the top of every pass of the watchdog loop. */
34 time_t watchdog_time; /* this has granularity of SLEEP_TIME */
/* Signal sent with pthread_kill() to a thread judged to be stalled;
 * start_watchdog() installs timeout_handler() for it. */
37 #define TIMEOUT_SIGNAL SIGUSR2
38 #define SLEEP_TIME 30 /* examine things every 30 seconds */
40 /* Forward referenced functions */
41 static void *btimer_thread(void *arg);
42 static void stop_btimer(btimer_id wid);
43 static btimer_id btimer_start_common(uint32_t wait);
/* Mutex handed to pthread_cond_timedwait() together with the condition
 * variable below; both are initialized in start_watchdog(). */
46 static pthread_mutex_t mutex;
/* Condition variable the watchdog thread sleeps on between passes;
 * stop_watchdog() signals it. */
47 static pthread_cond_t timer;
/* Head of the doubly linked list (next/prev) of active btimer_t entries. */
49 static btimer_t *timer_chain = NULL;
53 * Timeout signal comes here
/* Handler that start_watchdog() installs for TIMEOUT_SIGNAL. It does no
 * work of its own: it only returns, so the sole effect of the signal is
 * to interrupt whatever the receiving thread was doing.
 *   sig - signal number; not used by the handler */
55 static void timeout_handler(int sig)
57 return; /* thus interrupting the function */
62 * Start watchdog thread
64 * Returns: 0 on success
/* Install the TIMEOUT_SIGNAL handler, initialize the mutex and the
 * condition variable, and create the watchdog thread (btimer_thread).
 * Each pthread call stores its result in stat and is tested against 0.
 * NOTE(review): the result of sigaction() is not checked. */
67 int start_watchdog(void)
71 struct sigaction sigtimer;
/* sa_flags is 0, so SA_RESTART is not requested for this handler. */
73 sigtimer.sa_flags = 0;
74 sigtimer.sa_handler = timeout_handler;
/* Block every other signal while the handler runs. */
75 sigfillset(&sigtimer.sa_mask);
76 sigaction(TIMEOUT_SIGNAL, &sigtimer, NULL);
/* Seed the exported timestamp before the watchdog thread exists. */
77 watchdog_time = time(NULL);
78 if ((stat = pthread_mutex_init(&mutex, NULL)) != 0) {
81 if ((stat = pthread_cond_init(&timer, NULL)) != 0) {
/* Undo the mutex initialization done just above. */
82 pthread_mutex_destroy(&mutex);
/* The thread is created with default attributes and a NULL argument. */
86 if ((stat = pthread_create(&wdid, NULL, btimer_thread, (void *)NULL)) != 0) {
/* Thread creation failed: release both synchronization objects. */
87 pthread_mutex_destroy(&mutex);
88 pthread_cond_destroy(&timer);
95 * Terminate the watchdog thread
97 * Returns: 0 on success
/* Wake the watchdog thread by signalling the condition variable it
 * sleeps on. The result of pthread_cond_signal() is kept in stat and a
 * non-zero value takes the failure branch. */
100 int stop_watchdog(void)
106 if ((stat = pthread_cond_signal(&timer)) != 0) {
116 * This is the actual watchdog thread.
/*
 * Body of the watchdog thread created by start_watchdog().
 *
 * Each pass of the main loop:
 *   1. refreshes watchdog_time;
 *   2. walks every JCR returned by get_next_jcr() (entries whose JobId
 *      is 0 are tested separately) and, for the Storage daemon, File
 *      daemon and Director sockets, compares the time elapsed since
 *      timer_start with the socket timeout; an overdue socket is marked
 *      timed_out, an error is logged through Jmsg(), and TIMEOUT_SIGNAL
 *      is sent to the thread that owns the job;
 *   3. sleeps on the condition variable for up to SLEEP_TIME seconds;
 *   4. walks timer_chain and deals with expired timers: SIGTERM for a
 *      child process, TIMEOUT_SIGNAL for a thread, then a follow-up
 *      SIGKILL for child processes that are still overdue.
 *
 *   arg - not used by the code visible here
 *
 * Fixes relative to the previous version:
 *   - The SIGKILL pass tested wid->type (the entry of the OUTER loop)
 *     while checking and signalling wid1. An overdue thread timer could
 *     therefore have its pid field handed to kill(). The test now uses
 *     wid1->type.
 *   - The elapsed times given to Jmsg() for a %d conversion are time_t
 *     values; they are now cast to int so the argument matches the
 *     conversion on platforms where time_t is wider than int.
 *
 * NOTE(review): jcr is still dereferenced after free_locked_jcr(jcr);
 * presumably that call only drops a reference -- confirm.
 */
118 static void *btimer_thread(void *arg)
120 struct timespec timeout;
126 Dmsg0(200, "Start watchdog thread\n");
127 pthread_detach(pthread_self());
133 time_t timer_start, now;
135 Dmsg0(200, "Top of for loop\n");
137 watchdog_time = time(NULL); /* update timer */
139 /* Walk through all JCRs checking if any one is
140 * blocked for more than specified max time.
143 for (jcr=NULL; (jcr=get_next_jcr(jcr)); ) {
144 free_locked_jcr(jcr);
145 if (jcr->JobId == 0) {
148 fd = jcr->store_bsock;
150 timer_start = fd->timer_start;
151 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
152 fd->timed_out = TRUE;
153 Jmsg(jcr, M_ERROR, 0, _(
154 "Watchdog sending kill after %d secs to thread stalled reading Storage daemon.\n"),
155 (int)(watchdog_time - timer_start));
156 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
159 fd = jcr->file_bsock;
161 timer_start = fd->timer_start;
162 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
163 fd->timed_out = TRUE;
164 Jmsg(jcr, M_ERROR, 0, _(
165 "Watchdog sending kill after %d secs to thread stalled reading File daemon.\n"),
166 (int)(watchdog_time - timer_start));
167 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
172 timer_start = fd->timer_start;
173 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
174 fd->timed_out = TRUE;
175 Jmsg(jcr, M_ERROR, 0, _(
176 "Watchdog sending kill after %d secs to thread stalled reading Director.\n"),
177 (int)(watchdog_time - timer_start));
178 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
185 gettimeofday(&tv, &tz);
187 timeout.tv_sec = tv.tv_sec + SLEEP_TIME;
189 Dmsg1(200, "pthread_cond_timedwait sec=%d\n", timeout.tv_sec);
190 /* Note, this unlocks mutex during the sleep */
191 stat = pthread_cond_timedwait(&timer, &mutex, &timeout);
192 Dmsg1(200, "pthread_cond_timedwait stat=%d\n", stat);
196 /* Walk child chain killing off any process overdue */
197 for (wid = timer_chain; wid; wid=wid->next) {
199 /* First ask him politely to go away */
200 if (!wid->killed && now > (wid->start_time + wid->wait)) {
201 // Dmsg1(000, "Watchdog sigterm pid=%d\n", wid->pid);
202 if (wid->type == TYPE_CHILD) {
203 kill(wid->pid, SIGTERM);
206 Dmsg1(200, "watchdog kill thread %d\n", wid->tid);
207 pthread_kill(wid->tid, TIMEOUT_SIGNAL);
211 /* If we asked a child to die, wait 3 seconds and slam him */
215 for (wid1 = timer_chain; wid1; wid1=wid1->next) {
/* Test the entry being signalled (wid1), not the outer-loop entry:
 * SIGKILL must only ever go to timers of type TYPE_CHILD. */
216 if (wid1->type == TYPE_CHILD &&
217 !wid1->killed && now > (wid1->start_time + wid1->wait)) {
218 kill(wid1->pid, SIGKILL);
219 // Dmsg1(000, "Watchdog killed pid=%d\n", wid->pid);
226 } /* end of big for loop */
229 Dmsg0(200, "End watchdog\n");
234 * Start a timer on a child process of pid, kill it after wait seconds.
235 * NOTE! Granularity is SLEEP_TIME (i.e. 30 seconds)
237 * Returns: btimer_id (pointer to btimer_t struct) on success
/* Create a timer entry through btimer_start_common() and tag it as a
 * child-process timer (TYPE_CHILD), the kind the watchdog thread signals
 * with kill().
 *   pid  - child process the timer is for
 *   wait - seconds allowed before the timer is considered expired
 * NOTE(review): wid is a pointer but is printed with 0x%x -- confirm the
 * Dmsg formatter copes with this on 64-bit builds. */
240 btimer_id start_child_timer(pid_t pid, uint32_t wait)
243 wid = btimer_start_common(wait);
245 wid->type = TYPE_CHILD;
246 Dmsg2(200, "Start child timer 0x%x for %d secs.\n", wid, wait);
251 * Start a timer on a thread; kill it after wait seconds.
252 * NOTE! Granularity is SLEEP_TIME (i.e. 30 seconds)
254 * Returns: btimer_id (pointer to btimer_t struct) on success
/* Create a timer entry through btimer_start_common() and tag it as a
 * thread timer (TYPE_PTHREAD), the kind the watchdog thread signals with
 * pthread_kill() and TIMEOUT_SIGNAL.
 *   tid  - thread the timer is for
 *   wait - seconds allowed before the timer is considered expired
 * NOTE(review): wid is a pointer but is printed with 0x%x -- confirm the
 * Dmsg formatter copes with this on 64-bit builds. */
257 btimer_id start_thread_timer(pthread_t tid, uint32_t wait)
260 wid = btimer_start_common(wait);
262 wid->type = TYPE_PTHREAD;
263 Dmsg2(200, "Start thread timer 0x%x for %d secs.\n", wid, wait);
/* Allocate a btimer_t, push it on the front of timer_chain and stamp it
 * with the current time. Shared by start_child_timer() and
 * start_thread_timer().
 *   wait - timeout in seconds requested by the caller
 * NOTE(review): the result of malloc() is dereferenced with no visible
 * NULL check -- confirm the allocator in use aborts on failure. */
267 static btimer_id btimer_start_common(uint32_t wait)
269 btimer_id wid = (btimer_id)malloc(sizeof(btimer_t));
272 /* Chain it into timer_chain as the first item */
/* The new entry points forward at the current head ... */
274 wid->next = timer_chain;
/* ... and the old head points back at the new entry. */
276 timer_chain->prev = wid;
/* The watchdog thread judges expiry against start_time + wait. */
279 wid->start_time = time(NULL);
/* Stop-side counterpart of start_child_timer(). The code visible here
 * logs the timer id and its wait value at debug level 200.
 *   wid - timer id returned by start_child_timer()
 * NOTE(review): wid->wait is read for the log message with no NULL guard
 * in view -- confirm callers never pass NULL. */
289 void stop_child_timer(btimer_id wid)
291 Dmsg2(200, "Stop child timer 0x%x for %d secs.\n", wid, wid->wait);
/* Stop-side counterpart of start_thread_timer(). The code visible here
 * logs the timer id and its wait value at debug level 200.
 *   wid - timer id returned by start_thread_timer() */
298 void stop_thread_timer(btimer_id wid)
303 Dmsg2(200, "Stop thread timer 0x%x for %d secs.\n", wid, wid->wait);
/* Unlink a timer entry from the doubly linked timer_chain.
 *   wid - entry to remove
 * The Emsg0() call reports a NULL btimer_id with severity M_ABORT. */
311 static void stop_btimer(btimer_id wid)
314 Emsg0(M_ABORT, 0, _("NULL btimer_id.\n"));
317 /* Remove wid from timer_chain */
/* An entry without a predecessor is the current head of the chain. */
318 if (!wid->prev) { /* if no prev */
319 timer_chain = wid->next; /* set new head */
321 wid->prev->next = wid->next; /* update prev */
/* Point the successor back at the predecessor of wid. */
324 wid->next->prev = wid->prev; /* unlink it */