2 Bacula® - The Network Backup Solution
4 Copyright (C) 2002-2014 Free Software Foundation Europe e.V.
6 The main author of Bacula is Kern Sibbald, with contributions from many
7 others, a complete list can be found in the file AUTHORS.
9 You may use this file and others of this release according to the
10 license defined in the LICENSE file, which includes the Affero General
11 Public License, v3.0 ("AGPLv3") and some additional permissions and
12 terms pursuant to its AGPLv3 Section 7.
14 Bacula® is a registered trademark of Kern Sibbald.
17 * Bacula thread watchdog routine. General routine that
18 * allows setting a watchdog timer with a callback that is
19 * called when the timer goes off.
21 * Kern Sibbald, January MMII
28 /* Exported globals */
/* Refreshed from time(NULL) by start_watchdog() and by the watchdog thread. */
29 utime_t watchdog_time = 0; /* this has granularity of SLEEP_TIME */
/* Added to watchdog_time to obtain the default wake-up time of the thread. */
30 utime_t watchdog_sleep_time = 60; /* examine things every 60 seconds */
/*
 * Mutex/condition pair handed to pthread_cond_timedwait() by the watchdog
 * thread; ping_watchdog() signals the condition to wake the thread early.
 */
33 static pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER;
34 static pthread_cond_t timer = PTHREAD_COND_INITIALIZER;
36 /* Forward referenced functions */
37 extern "C" void *watchdog_thread(void *arg);
39 static void wd_lock();
40 static void wd_unlock();
/* Set to true by stop_watchdog() to tell the watchdog thread to stop. */
43 static bool quit = false;
/* Tested together with the thread id to decide whether the caller is the watchdog thread. */
44 static bool wd_is_init = false;
/* Initialised with rwl_init(); taken and released through wd_lock()/wd_unlock(). */
45 static brwlock_t lock; /* watchdog lock */
/* Thread id filled in by pthread_create() and later passed to pthread_join(). */
47 static pthread_t wd_tid;
/* Active watchdogs: register_watchdog() appends here and the thread walks this list. */
48 static dlist *wd_queue;
/* Watchdogs moved off the active list (one-shot entries after they fire). */
49 static dlist *wd_inactive;
52 * Returns: 0 if the current thread is NOT the watchdog
53 * 1 if the current thread is the watchdog
57 if (wd_is_init && pthread_equal(pthread_self(), wd_tid)) {
65 * Start watchdog thread
67 * Returns: 0 on success
/*
 * start_watchdog: set up the watchdog subsystem and launch its thread.
 *
 * From the statements visible in this excerpt it:
 *   - stamps watchdog_time with the current time(NULL),
 *   - initialises the watchdog lock with rwl_init() and issues an M_ABORT
 *     message carrying the error text when that fails,
 *   - allocates the active list (wd_queue) and the inactive list
 *     (wd_inactive),
 *   - creates the thread that runs watchdog_thread(), storing its id in
 *     wd_tid.
 *
 * NOTE(review): parts of the body (any early-exit check, the handling of a
 * pthread_create() failure, the return statements) are not visible in this
 * excerpt -- confirm them before relying on this summary.
 */
70 int start_watchdog(void)
/*
 * dummy stays NULL; it is only used in the expression &dummy->link passed
 * to the dlist constructor. Presumably that yields the position of the link
 * member inside a watchdog_t -- TODO confirm against the dlist class.
 */
73 watchdog_t *dummy = NULL;
79 Dmsg0(800, "Initialising NicB-hacked watchdog thread\n");
/* Seed the shared clock before any watchdog can be registered against it. */
80 watchdog_time = time(NULL);
82 if ((errstat=rwl_init(&lock)) != 0) {
84 Jmsg1(NULL, M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"),
85 be.bstrerror(errstat));
87 wd_queue = New(dlist(dummy, &dummy->link));
88 wd_inactive = New(dlist(dummy, &dummy->link));
/* Default thread attributes (NULL) and no argument for the thread routine. */
91 if ((stat = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) {
98 * Wake watchdog timer thread so that it walks the
99 * queue and adjusts its wait time (or exits).
/*
 * ping_watchdog: signal the timer condition variable.
 *
 * The watchdog thread sleeps in pthread_cond_timedwait() on this same
 * condition, so the signal ends that sleep before its deadline.
 * NOTE(review): any locking around the signal is not visible in this
 * excerpt.
 */
101 static void ping_watchdog()
104 pthread_cond_signal(&timer);
110 * Terminate the watchdog thread
112 * Returns: 0 on success
/*
 * stop_watchdog: shut the watchdog thread down and empty both lists.
 *
 * Visible steps: raise the quit flag, wait for the thread with
 * pthread_join(), then repeatedly take the first entry off wd_queue and
 * off wd_inactive until each list is empty, testing every entry for a
 * non-NULL destructor.
 *
 * NOTE(review): what happens inside the destructor branches, how the
 * entries and lists are released, and the return statements are not
 * visible in this excerpt.
 */
115 int stop_watchdog(void)
124 quit = true; /* notify watchdog thread to stop */
/* Block until the watchdog thread has finished; its return value is not collected (NULL). */
127 stat = pthread_join(wd_tid, NULL);
/* Drain the active list front to back. */
129 while (!wd_queue->empty()) {
130 void *item = wd_queue->first();
131 wd_queue->remove(item);
132 p = (watchdog_t *)item;
133 if (p->destructor != NULL) {
/* Same draining procedure for the inactive list. */
141 while (!wd_inactive->empty()) {
142 void *item = wd_inactive->first();
143 wd_inactive->remove(item);
144 p = (watchdog_t *)item;
145 if (p->destructor != NULL) {
/*
 * new_watchdog: allocate a watchdog_t with malloc() and hand it back to
 * the caller.
 *
 * The only field initialisation visible in this excerpt clears the
 * destructor pointer.
 * NOTE(review): the remaining initialisation, any check of the malloc()
 * result and the return statement are not visible here.
 */
158 watchdog_t *new_watchdog(void)
160 watchdog_t *wd = (watchdog_t *)malloc(sizeof(watchdog_t));
/* No destructor by default; stop_watchdog() tests this field for NULL. */
172 wd->destructor = NULL;
/*
 * register_watchdog: put a caller-prepared watchdog on the active list.
 *
 * wd must have a callback and a non-zero interval; either omission is
 * reported with an M_ABORT message. The first firing time is set to
 * watchdog_time + wd->interval and wd is appended to wd_queue.
 *
 * NOTE(review): the condition guarding the first M_ABORT message, the
 * locking, any wake-up of the watchdog thread and the return value are
 * not visible in this excerpt.
 */
178 bool register_watchdog(watchdog_t *wd)
181 Jmsg0(NULL, M_ABORT, 0, _("BUG! register_watchdog called before start_watchdog\n"));
183 if (wd->callback == NULL) {
184 Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has NULL callback\n"), wd);
186 if (wd->interval == 0) {
187 Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has zero interval\n"), wd);
/* Schedule relative to the shared watchdog clock, not a fresh time() call. */
191 wd->next_fire = watchdog_time + wd->interval;
192 wd_queue->append(wd);
193 Dmsg3(800, "Registered watchdog %p, interval %d%s\n",
194 wd, wd->interval, wd->one_shot ? " one shot" : "");
/*
 * unregister_watchdog: take a watchdog off whichever list holds it.
 *
 * The active list (wd_queue) is scanned first, then the inactive list
 * (wd_inactive); when wd is found it is removed from that list and a debug
 * message is written. If neither scan finds it, a failure debug message is
 * written instead.
 *
 * The abort diagnostic now names this function (it previously named
 * unregister_watchdog_unlocked, which is not the function emitting it).
 *
 * NOTE(review): the condition guarding the abort message, the comparison
 * inside each scan, the locking and the return values are not visible in
 * this excerpt.
 */
201 bool unregister_watchdog(watchdog_t *wd)
207 Jmsg0(NULL, M_ABORT, 0, _("BUG! unregister_watchdog called before start_watchdog\n"));
/* First look on the active list. */
211 foreach_dlist(p, wd_queue) {
213 wd_queue->remove(wd);
214 Dmsg1(800, "Unregistered watchdog %p\n", wd);
/* Then look on the inactive list. */
220 foreach_dlist(p, wd_inactive) {
222 wd_inactive->remove(wd);
223 Dmsg1(800, "Unregistered inactive watchdog %p\n", wd);
229 Dmsg1(800, "Failed to unregister watchdog %p\n", wd);
238 * This is the thread that walks the watchdog queue
239 * and when a queue item fires, the callback is
240 * invoked. If it is a one shot, the queue item
241 * is moved to the inactive queue.
/*
 * watchdog_thread: routine run by the thread that start_watchdog() creates.
 *
 * What the visible statements do on a pass over the queue:
 *   - refresh watchdog_time from time(NULL) and take
 *     watchdog_time + watchdog_sleep_time as the default wake-up time,
 *   - walk wd_queue; an entry whose next_fire is <= watchdog_time is due,
 *     and after it has been handled it is either appended to wd_inactive
 *     (one-shot entries, per the existing comment) or rescheduled to
 *     watchdog_time + interval,
 *   - move the wake-up time earlier when an entry fires before it,
 *   - sleep in pthread_cond_timedwait() on timer/timer_mutex until that
 *     absolute deadline or until the condition is signalled.
 *
 * arg is not referenced in any visible line.
 *
 * NOTE(review): the loop header, the callback invocation, the removal of
 * one-shot entries from wd_queue, every lock/unlock call and the return
 * statement are not visible in this excerpt -- confirm them in the full
 * file before changing anything here.
 */
243 extern "C" void *watchdog_thread(void *arg)
245 struct timespec timeout;
250 set_jcr_in_tsd(INVALID_JCR);
251 Dmsg0(800, "NicB-reworked watchdog thread entered\n");
258 * NOTE. lock_jcr_chain removed, but the message below
259 * was left until we are sure there are no deadlocks.
261 * We lock the jcr chain here because a good number of the
262 * callback routines lock the jcr chain. We need to lock
263 * it here *before* the watchdog lock because the SD message
264 * thread first locks the jcr chain, then when closing the
265 * job locks the watchdog chain. If the two threads do not
266 * lock in the same order, we get a deadlock -- each holds
267 * the other's needed lock.
272 watchdog_time = time(NULL);
273 next_time = watchdog_time + watchdog_sleep_time;
274 foreach_dlist(p, wd_queue) {
275 if (p->next_fire <= watchdog_time) {
276 /* Run the callback */
277 Dmsg2(3400, "Watchdog callback p=0x%p fire=%d\n", p, p->next_fire);
280 /* Reschedule (or move to inactive list if it's a one-shot timer) */
283 wd_inactive->append(p);
286 p->next_fire = watchdog_time + p->interval;
/* Keep the earliest pending firing time as the next wake-up time. */
289 if (p->next_fire <= next_time) {
290 next_time = p->next_fire;
296 * Wait sleep time or until someone wakes us
/*
 * Build the absolute deadline expected by pthread_cond_timedwait(): the
 * current wall-clock time plus the seconds remaining until next_time.
 */
298 gettimeofday(&tv, &tz);
299 timeout.tv_nsec = tv.tv_usec * 1000;
300 timeout.tv_sec = tv.tv_sec + next_time - time(NULL);
/* Bring tv_nsec back under one second (the matching seconds carry is not visible in this excerpt). */
301 while (timeout.tv_nsec >= 1000000000) {
302 timeout.tv_nsec -= 1000000000;
/* NOTE(review): %d is fed a difference of tv_sec values -- confirm the Dmsg formatter accepts that width. */
306 Dmsg1(1900, "pthread_cond_timedwait %d\n", timeout.tv_sec - tv.tv_sec);
307 /* Note, this unlocks mutex during the sleep */
/* The result of the wait is discarded, so a timeout and a signal are handled the same way. */
309 pthread_cond_timedwait(&timer, &timer_mutex, &timeout);
313 Dmsg0(800, "NicB-reworked watchdog thread exited\n");
318 * Watchdog lock. This can be called multiple times by the same
319 * thread without blocking, but it must be unlocked the same number
320 * of times it was locked.
/*
 * wd_lock: acquire the watchdog lock for writing via rwl_writelock().
 *
 * A non-zero result from rwl_writelock() is reported with an M_ABORT
 * message that carries the error text from be.bstrerror().
 */
322 static void wd_lock()
325 if ((errstat=rwl_writelock(&lock)) != 0) {
327 Jmsg1(NULL, M_ABORT, 0, _("rwl_writelock failure. ERR=%s\n"),
328 be.bstrerror(errstat));
333 * Unlock the watchdog. This can be called multiple times by the
334 * same thread, up to the number of times that thread called wd_lock().
/*
 * wd_unlock: release the watchdog write lock via rwl_writeunlock().
 *
 * A non-zero result from rwl_writeunlock() is reported with an M_ABORT
 * message that carries the error text from be.bstrerror().
 */
337 static void wd_unlock()
340 if ((errstat=rwl_writeunlock(&lock)) != 0) {
342 Jmsg1(NULL, M_ABORT, 0, _("rwl_writeunlock failure. ERR=%s\n"),
343 be.bstrerror(errstat));