2 * Bacula thread watchdog routine. General routine that monitors
3 * the daemon and signals a thread if it is blocked on a BSOCK
4 * too long. This prevents catastrophic long waits -- generally
5 * due to Windows "hanging" the app.
7 * Kern Sibbald, January MMII
11 Copyright (C) 2000-2004 Kern Sibbald and John Walker
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of
16 the License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public
24 License along with this program; if not, write to the Free
25 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
33 /* Exported globals */
34 time_t watchdog_time = 0; /* this has granularity of SLEEP_TIME */
35 time_t watchdog_sleep_time = 60; /* examine things every 60 seconds */
38 static pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER;
39 static pthread_cond_t timer = PTHREAD_COND_INITIALIZER;
41 /* Forward referenced functions */
42 extern "C" void *watchdog_thread(void *arg);
44 static void wd_lock();
45 static void wd_unlock();
48 static bool quit = false;;
49 static bool wd_is_init = false;
50 static brwlock_t lock; /* watchdog lock */
52 static pthread_t wd_tid;
53 static dlist *wd_queue;
54 static dlist *wd_inactive;
57 * Start watchdog thread
59 * Returns: 0 on success
62 int start_watchdog(void)
65 watchdog_t *dummy = NULL;
71 Dmsg0(400, "Initialising NicB-hacked watchdog thread\n");
72 watchdog_time = time(NULL);
74 if ((errstat=rwl_init(&lock)) != 0) {
75 Emsg1(M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"),
78 wd_queue = New(dlist(dummy, &dummy->link));
79 wd_inactive = New(dlist(dummy, &dummy->link));
81 if ((stat = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) {
89 * Wake watchdog timer thread so that it walks the
90 * queue and adjusts its wait time (or exits).
92 static void ping_watchdog()
95 pthread_cond_signal(&timer);
100 * Terminate the watchdog thread
102 * Returns: 0 on success
105 int stop_watchdog(void)
114 quit = true; /* notify watchdog thread to stop */
118 stat = pthread_join(wd_tid, NULL);
120 while (!wd_queue->empty()) {
121 void *item = wd_queue->first();
122 wd_queue->remove(item);
123 p = (watchdog_t *)item;
124 if (p->destructor != NULL) {
132 while (!wd_inactive->empty()) {
133 void *item = wd_inactive->first();
134 wd_inactive->remove(item);
135 p = (watchdog_t *)item;
136 if (p->destructor != NULL) {
148 watchdog_t *new_watchdog(void)
150 watchdog_t *wd = (watchdog_t *)malloc(sizeof(watchdog_t));
162 wd->destructor = NULL;
168 bool register_watchdog(watchdog_t *wd)
171 Emsg0(M_ABORT, 0, "BUG! register_watchdog called before start_watchdog\n");
173 if (wd->callback == NULL) {
174 Emsg1(M_ABORT, 0, "BUG! Watchdog %p has NULL callback\n", wd);
176 if (wd->interval == 0) {
177 Emsg1(M_ABORT, 0, "BUG! Watchdog %p has zero interval\n", wd);
181 wd->next_fire = watchdog_time + wd->interval;
182 wd_queue->append(wd);
183 Dmsg3(400, "Registered watchdog %p, interval %d%s\n",
184 wd, wd->interval, wd->one_shot ? " one shot" : "");
191 bool unregister_watchdog(watchdog_t *wd)
197 Emsg0(M_ABORT, 0, "BUG! unregister_watchdog_unlocked called before start_watchdog\n");
201 foreach_dlist(p, wd_queue) {
203 wd_queue->remove(wd);
204 Dmsg1(400, "Unregistered watchdog %p\n", wd);
210 foreach_dlist(p, wd_inactive) {
212 wd_inactive->remove(wd);
213 Dmsg1(400, "Unregistered inactive watchdog %p\n", wd);
219 Dmsg1(400, "Failed to unregister watchdog %p\n", wd);
227 extern "C" void *watchdog_thread(void *arg)
229 struct timespec timeout;
234 Dmsg0(400, "NicB-reworked watchdog thread entered\n");
240 * We lock the jcr chain here because a good number of the
241 * callback routines lock the jcr chain. We need to lock
242 * it here *before* the watchdog lock because the SD message
243 * thread first locks the jcr chain, then when closing the
244 * job locks the watchdog chain. If the two threads do not
245 * lock in the same order, we get a deadlock -- each holds
246 * the other's needed lock.
252 watchdog_time = time(NULL);
253 next_time = watchdog_time + watchdog_sleep_time;
254 foreach_dlist(p, wd_queue) {
255 if (p->next_fire <= watchdog_time) {
256 /* Run the callback */
259 /* Reschedule (or move to inactive list if it's a one-shot timer) */
262 wd_inactive->append(p);
265 p->next_fire = watchdog_time + p->interval;
268 if (p->next_fire < next_time) {
269 next_time = p->next_fire;
276 * Wait sleep time or until someone wakes us
278 gettimeofday(&tv, &tz);
279 timeout.tv_nsec = tv.tv_usec * 1000;
280 timeout.tv_sec = tv.tv_sec + next_time - time(NULL);
281 while (timeout.tv_nsec >= 1000000000) {
282 timeout.tv_nsec -= 1000000000;
286 Dmsg1(900, "pthread_cond_timedwait %d\n", timeout.tv_sec - tv.tv_sec);
287 /* Note, this unlocks mutex during the sleep */
289 pthread_cond_timedwait(&timer, &timer_mutex, &timeout);
293 Dmsg0(400, "NicB-reworked watchdog thread exited\n");
298 * Watchdog lock, this can be called multiple times by the same
299 * thread without blocking, but must be unlocked the number of
300 * times it was locked.
302 static void wd_lock()
305 if ((errstat=rwl_writelock(&lock)) != 0) {
306 Emsg1(M_ABORT, 0, "rwl_writelock failure. ERR=%s\n",
312 * Unlock the watchdog. This can be called multiple times by the
313 * same thread up to the number of times that thread called
316 static void wd_unlock()
319 if ((errstat=rwl_writeunlock(&lock)) != 0) {
320 Emsg1(M_ABORT, 0, "rwl_writeunlock failure. ERR=%s\n",