2 * Bacula thread watchdog routine. General routine that
3 * allows setting a watchdog timer with a callback that is
4 * called when the timer goes off.
6 * Kern Sibbald, January MMII
10 Copyright (C) 2000-2005 Kern Sibbald
12 This program is free software; you can redistribute it and/or
13 modify it under the terms of the GNU General Public License as
14 published by the Free Software Foundation; either version 2 of
15 the License, or (at your option) any later version.
17 This program is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public
23 License along with this program; if not, write to the Free
24 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
32 /* Exported globals */
33 time_t watchdog_time = 0; /* this has granularity of SLEEP_TIME */
34 time_t watchdog_sleep_time = 60; /* examine things every 60 seconds */
/* Condition variable and its mutex: ping_watchdog() signals `timer`, and
 * watchdog_thread() sleeps on it with pthread_cond_timedwait(). */
37 static pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER;
38 static pthread_cond_t timer = PTHREAD_COND_INITIALIZER;
40 /* Forward referenced functions */
41 extern "C" void *watchdog_thread(void *arg);
43 static void wd_lock();
44 static void wd_unlock();
/* Set to true by stop_watchdog() to tell the watchdog thread to stop. */
47 static bool quit = false;
48 static bool wd_is_init = false;
49 static brwlock_t lock; /* watchdog lock */
/* Thread id filled in by pthread_create() in start_watchdog() and joined
 * in stop_watchdog(). */
51 static pthread_t wd_tid;
/* wd_queue receives every registered watchdog; one-shot entries are moved
 * to wd_inactive by watchdog_thread() once they have fired. */
52 static dlist *wd_queue;
53 static dlist *wd_inactive;
56 * Start watchdog thread
58 * Returns: 0 on success
/* Sets up the watchdog machinery: seeds the clock, initializes the lock,
 * creates both watchdog lists and spawns the watchdog thread. */
61 int start_watchdog(void)
/* dummy is never allocated; it only supplies &dummy->link to the dlist
 * constructor below (presumably so dlist can locate the link member
 * inside a watchdog_t -- confirm against dlist). */
64 watchdog_t *dummy = NULL;
70 Dmsg0(800, "Initialising NicB-hacked watchdog thread\n");
/* Seed the exported clock; register_watchdog() computes next_fire from it. */
71 watchdog_time = time(NULL);
/* A non-zero status from rwl_init() is reported through Emsg1 with M_ABORT. */
73 if ((errstat=rwl_init(&lock)) != 0) {
74 Emsg1(M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"),
/* Two lists of watchdog_t: the active queue and the inactive list. */
77 wd_queue = New(dlist(dummy, &dummy->link));
78 wd_inactive = New(dlist(dummy, &dummy->link));
/* Launch watchdog_thread(); its id is kept in wd_tid so that
 * stop_watchdog() can join it. */
80 if ((stat = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) {
88 * Wake watchdog timer thread so that it walks the
89 * queue and adjusts its wait time (or exits).
91 static void ping_watchdog()
/* Signals the condition variable that watchdog_thread() waits on in
 * pthread_cond_timedwait(), ending its sleep early. */
94 pthread_cond_signal(&timer);
99 * Terminate the watchdog thread
101 * Returns: 0 on success
/* Shuts the watchdog down: raises the quit flag, joins the watchdog
 * thread, then empties both the active and the inactive list. */
104 int stop_watchdog(void)
113 quit = true; /* notify watchdog thread to stop */
/* Block until the watchdog thread has exited; the status lands in stat. */
117 stat = pthread_join(wd_tid, NULL);
/* Drain the active queue one entry at a time, unlinking each entry before
 * it is examined. Entries with a non-NULL destructor get extra handling
 * (presumably the destructor is invoked -- confirm). */
119 while (!wd_queue->empty()) {
120 void *item = wd_queue->first();
121 wd_queue->remove(item);
122 p = (watchdog_t *)item;
123 if (p->destructor != NULL) {
/* Same teardown for the inactive list. */
131 while (!wd_inactive->empty()) {
132 void *item = wd_inactive->first();
133 wd_inactive->remove(item);
134 p = (watchdog_t *)item;
135 if (p->destructor != NULL) {
/* Allocates a watchdog_t with malloc() and clears its destructor field. */
147 watchdog_t *new_watchdog(void)
149 watchdog_t *wd = (watchdog_t *)malloc(sizeof(watchdog_t));
/* No destructor by default; stop_watchdog() tests this field for NULL. */
161 wd->destructor = NULL;
/* Schedules wd on the active queue. The visible sanity checks report a
 * NULL callback, a zero interval, or use before start_watchdog() through
 * Emsg with M_ABORT. */
167 bool register_watchdog(watchdog_t *wd)
170 Emsg0(M_ABORT, 0, _("BUG! register_watchdog called before start_watchdog\n"));
172 if (wd->callback == NULL) {
173 Emsg1(M_ABORT, 0, _("BUG! Watchdog %p has NULL callback\n"), wd);
175 if (wd->interval == 0) {
176 Emsg1(M_ABORT, 0, _("BUG! Watchdog %p has zero interval\n"), wd);
/* First firing is one interval after watchdog_time, the cached clock set
 * in start_watchdog() and watchdog_thread(), not a fresh time() call. */
180 wd->next_fire = watchdog_time + wd->interval;
181 wd_queue->append(wd);
182 Dmsg3(800, "Registered watchdog %p, interval %d%s\n",
183 wd, wd->interval, wd->one_shot ? " one shot" : "");
/* Removes wd from whichever list holds it: the active queue is searched
 * first, then the inactive list. Each outcome is logged at debug level 800. */
190 bool unregister_watchdog(watchdog_t *wd)
/* The message names this function itself, matching the equivalent check
 * in register_watchdog(). */
196 Emsg0(M_ABORT, 0, _("BUG! unregister_watchdog called before start_watchdog\n"));
200 foreach_dlist(p, wd_queue) {
202 wd_queue->remove(wd);
203 Dmsg1(800, "Unregistered watchdog %p\n", wd);
209 foreach_dlist(p, wd_inactive) {
211 wd_inactive->remove(wd);
212 Dmsg1(800, "Unregistered inactive watchdog %p\n", wd);
/* Presumably reached only when wd was found in neither list -- confirm. */
218 Dmsg1(800, "Failed to unregister watchdog %p\n", wd);
227 * This is the thread that walks the watchdog queue
228 * and when a queue item fires, the callback is
229 * invoked. If it is a one shot, the queue item
230 * is moved to the inactive queue.
/* Thread entry point: on each pass it refreshes watchdog_time, walks
 * wd_queue for entries that are due, then sleeps on the timer condition
 * variable until the next deadline or until it is signalled. */
232 extern "C" void *watchdog_thread(void *arg)
234 struct timespec timeout;
239 Dmsg0(800, "NicB-reworked watchdog thread entered\n");
246 * NOTE. lock_jcr_chain removed, but the message below
247 * was left until we are sure there are no deadlocks.
249 * We lock the jcr chain here because a good number of the
250 * callback routines lock the jcr chain. We need to lock
251 * it here *before* the watchdog lock because the SD message
252 * thread first locks the jcr chain, then when closing the
253 * job locks the watchdog chain. If the two threads do not
254 * lock in the same order, we get a deadlock -- each holds
255 * the other's needed lock.
/* Refresh the cached clock; by default wake again after watchdog_sleep_time. */
260 watchdog_time = time(NULL);
261 next_time = watchdog_time + watchdog_sleep_time;
262 foreach_dlist(p, wd_queue) {
263 if (p->next_fire <= watchdog_time) {
264 /* Run the callback */
267 /* Reschedule (or move to inactive list if it's a one-shot timer) */
270 wd_inactive->append(p);
273 p->next_fire = watchdog_time + p->interval;
/* Wake early enough for the soonest pending watchdog. */
276 if (p->next_fire < next_time) {
277 next_time = p->next_fire;
283 * Wait sleep time or until someone wakes us
/* Build the absolute deadline for pthread_cond_timedwait(): the current
 * time of day plus the seconds remaining until next_time. */
285 gettimeofday(&tv, &tz);
286 timeout.tv_nsec = tv.tv_usec * 1000;
287 timeout.tv_sec = tv.tv_sec + next_time - time(NULL);
288 while (timeout.tv_nsec >= 1000000000) {
289 timeout.tv_nsec -= 1000000000;
/* The difference of two tv_sec values is a time_t, which need not be an
 * int; cast it so the argument matches the %d conversion. */
293 Dmsg1(1900, "pthread_cond_timedwait %d\n", (int)(timeout.tv_sec - tv.tv_sec));
294 /* Note, this unlocks mutex during the sleep */
296 pthread_cond_timedwait(&timer, &timer_mutex, &timeout);
300 Dmsg0(800, "NicB-reworked watchdog thread exited\n");
305 * Watchdog lock, this can be called multiple times by the same
306 * thread without blocking, but must be unlocked the number of
307 * times it was locked.
309 static void wd_lock()
/* Takes the write side of the watchdog rwlock; a non-zero status is
 * reported through Emsg1 with M_ABORT. */
312 if ((errstat=rwl_writelock(&lock)) != 0) {
313 Emsg1(M_ABORT, 0, _("rwl_writelock failure. ERR=%s\n"),
319 * Unlock the watchdog. This can be called multiple times by the
320 * same thread up to the number of times that thread called wd_lock().
323 static void wd_unlock()
326 if ((errstat=rwl_writeunlock(&lock)) != 0) {
327 Emsg1(M_ABORT, 0, _("rwl_writeunlock failure. ERR=%s\n"),