2 Bacula® - The Network Backup Solution
4 Copyright (C) 2002-2011 Free Software Foundation Europe e.V.
6 The main author of Bacula is Kern Sibbald, with contributions from
7 many others, a complete list can be found in the file AUTHORS.
8 This program is Free Software; you can redistribute it and/or
9 modify it under the terms of version three of the GNU Affero General Public
10 License as published by the Free Software Foundation and included
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU Affero General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Bacula® is a registered trademark of Kern Sibbald.
24 The licensor of Bacula is the Free Software Foundation Europe
25 (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
26 Switzerland, email:ftf@fsfeurope.org.
29 * Bacula thread watchdog routine. General routine that
30 * allows setting a watchdog timer with a callback that is
31 * called when the timer goes off.
33 * Kern Sibbald, January MMII
40 /* Exported globals */
/* Time most recently sampled with time(NULL) by the watchdog code: it is set
 * in start_watchdog() and again where watchdog_thread() walks the queue. */
41 utime_t watchdog_time = 0; /* this has granularity of SLEEP_TIME */
/* Number of seconds added to watchdog_time to obtain the default time of the
 * watchdog thread's next pass (next_time). */
42 utime_t watchdog_sleep_time = 60; /* examine things every 60 seconds */
/* Mutex/condition pair passed to pthread_cond_timedwait() by the watchdog
 * thread; ping_watchdog() signals the condition. */
45 static pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER;
46 static pthread_cond_t timer = PTHREAD_COND_INITIALIZER;
48 /* Forward referenced functions */
/* Thread entry point handed to pthread_create() in start_watchdog();
 * declared with C linkage. */
49 extern "C" void *watchdog_thread(void *arg);
/* Take / release the watchdog lock (write-lock and write-unlock on `lock`). */
51 static void wd_lock();
52 static void wd_unlock();
/* Set to true by stop_watchdog() to notify the watchdog thread to stop. */
55 static bool quit = false;
/* Checked before the calling thread's id is compared against wd_tid. */
56 static bool wd_is_init = false;
57 static brwlock_t lock; /* watchdog lock */
/* Id of the thread created in start_watchdog() and joined in stop_watchdog(). */
59 static pthread_t wd_tid;
/* Active watchdogs; register_watchdog() appends to this list. */
60 static dlist *wd_queue;
/* Watchdogs the watchdog thread has moved off the active list (one-shot timers). */
61 static dlist *wd_inactive;
64 * Returns: 0 if the current thread is NOT the watchdog
65 * 1 if the current thread is the watchdog
69 if (wd_is_init && pthread_equal(pthread_self(), wd_tid)) {
77 * Start watchdog thread
79 * Returns: 0 on success
/*
 * Initialise the watchdog subsystem and launch the watchdog thread.
 *
 * Steps visible here: sample the current time into watchdog_time, initialise
 * the watchdog lock with rwl_init() (a failure is reported through Jmsg1 with
 * M_ABORT), allocate the active and inactive dlists, then create the thread
 * that runs watchdog_thread() and store its id in wd_tid.
 *
 * NOTE(review): only part of the body is visible in this listing; the return
 * statements and the handling of a pthread_create() failure are not shown --
 * confirm against the complete source.
 */
82 int start_watchdog(void)
/* NULL pointer used only to name the link member in the dlist constructor
 * calls below -- presumably so dlist can locate the link field; confirm. */
85 watchdog_t *dummy = NULL;
91 Dmsg0(800, "Initialising NicB-hacked watchdog thread\n");
92 watchdog_time = time(NULL);
94 if ((errstat=rwl_init(&lock)) != 0) {
96 Jmsg1(NULL, M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"),
97 be.bstrerror(errstat));
/* Both lists are allocated before the watchdog thread is created. */
99 wd_queue = New(dlist(dummy, &dummy->link));
100 wd_inactive = New(dlist(dummy, &dummy->link));
103 if ((stat = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) {
110 * Wake watchdog timer thread so that it walks the
111 * queue and adjusts its wait time (or exits).
/*
 * Signal the `timer` condition variable so that a thread blocked on it in
 * pthread_cond_timedwait() wakes up before its timeout expires.
 * NOTE(review): any locking around the signal is not visible in this listing.
 */
113 static void ping_watchdog()
116 pthread_cond_signal(&timer);
122 * Terminate the watchdog thread
124 * Returns: 0 on success
/*
 * Shut the watchdog down: raise the quit flag, join the watchdog thread, then
 * empty the active list and the inactive list one item at a time.
 *
 * NOTE(review): what happens to each removed item after the destructor check
 * (and how the thread is woken so it notices `quit`) is not visible in this
 * listing -- confirm against the complete source.
 */
127 int stop_watchdog(void)
136 quit = true; /* notify watchdog thread to stop */
/* Wait for the watchdog thread to finish before the lists are drained. */
139 stat = pthread_join(wd_tid, NULL);
/* Drain the active list: unlink the first item until the list is empty. */
141 while (!wd_queue->empty()) {
142 void *item = wd_queue->first();
143 wd_queue->remove(item);
144 p = (watchdog_t *)item;
145 if (p->destructor != NULL) {
/* Same drain for watchdogs parked on the inactive list. */
153 while (!wd_inactive->empty()) {
154 void *item = wd_inactive->first();
155 wd_inactive->remove(item);
156 p = (watchdog_t *)item;
157 if (p->destructor != NULL) {
/*
 * Allocate a watchdog_t with malloc() and clear its destructor pointer.
 *
 * NOTE(review): no check of the malloc() result is visible here, and the
 * initialisation of the other fields (between the two visible statements) is
 * not shown in this listing -- confirm against the complete source.
 */
170 watchdog_t *new_watchdog(void)
172 watchdog_t *wd = (watchdog_t *)malloc(sizeof(watchdog_t));
184 wd->destructor = NULL;
/*
 * Put a watchdog on the active queue.
 *
 * A NULL callback or a zero interval is reported as a bug through Jmsg1 with
 * M_ABORT. The first firing time is set to watchdog_time + wd->interval and
 * the watchdog is appended to wd_queue.
 *
 * NOTE(review): the condition guarding the "called before start_watchdog"
 * message, the locking, and the return value are not visible in this listing.
 */
190 bool register_watchdog(watchdog_t *wd)
193 Jmsg0(NULL, M_ABORT, 0, _("BUG! register_watchdog called before start_watchdog\n"));
195 if (wd->callback == NULL) {
196 Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has NULL callback\n"), wd);
198 if (wd->interval == 0) {
199 Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has zero interval\n"), wd);
/* Schedule relative to the last sampled watchdog_time, not the current clock. */
203 wd->next_fire = watchdog_time + wd->interval;
204 wd_queue->append(wd);
/* NOTE(review): interval is printed with %d; its declared type is not visible
 * here -- if it is wider than int (it is added to a utime_t above) the
 * specifier should be checked. */
205 Dmsg3(800, "Registered watchdog %p, interval %d%s\n",
206 wd, wd->interval, wd->one_shot ? " one shot" : "");
/*
 * Take a watchdog off whichever list holds it.
 *
 * The active list (wd_queue) is walked first, then the inactive list
 * (wd_inactive); wd is removed from the list it is found on. If neither walk
 * removes it, a "Failed to unregister" debug message is emitted.
 *
 * NOTE(review): the match tests inside the two loops, the locking and the
 * return value are not visible in this listing.
 * NOTE(review): the abort message below names "unregister_watchdog_unlocked",
 * but the function visible here is unregister_watchdog -- the text looks
 * stale; confirm before changing it, since it is a translatable string.
 */
213 bool unregister_watchdog(watchdog_t *wd)
219 Jmsg0(NULL, M_ABORT, 0, _("BUG! unregister_watchdog_unlocked called before start_watchdog\n"));
223 foreach_dlist(p, wd_queue) {
225 wd_queue->remove(wd);
226 Dmsg1(800, "Unregistered watchdog %p\n", wd);
232 foreach_dlist(p, wd_inactive) {
234 wd_inactive->remove(wd);
235 Dmsg1(800, "Unregistered inactive watchdog %p\n", wd);
241 Dmsg1(800, "Failed to unregister watchdog %p\n", wd);
250 * This is the thread that walks the watchdog queue
251 * and when a queue item fires, the callback is
252 * invoked. If it is a one shot, the queue item
253 * is moved to the inactive queue.
/*
 * Body of the watchdog thread (started by pthread_create() in
 * start_watchdog()).
 *
 * Visible behaviour: the thread marks itself as having no JCR, then samples
 * the time, walks wd_queue looking for entries whose next_fire has been
 * reached, reschedules them or appends them to wd_inactive, tracks the
 * earliest upcoming next_fire in next_time, and finally sleeps on the `timer`
 * condition until that time or until it is signalled.
 *
 * NOTE(review): the enclosing loop, the callback invocation itself, the
 * one-shot test, and the lock/unlock calls are not visible in this listing --
 * confirm against the complete source.
 */
255 extern "C" void *watchdog_thread(void *arg)
257 struct timespec timeout;
262 set_jcr_in_tsd(INVALID_JCR);
263 Dmsg0(800, "NicB-reworked watchdog thread entered\n");
270 * NOTE. lock_jcr_chain removed, but the message below
271 * was left until we are sure there are no deadlocks.
273 * We lock the jcr chain here because a good number of the
274 * callback routines lock the jcr chain. We need to lock
275 * it here *before* the watchdog lock because the SD message
276 * thread first locks the jcr chain, then when closing the
277 * job locks the watchdog chain. If the two threads do not
278 * lock in the same order, we get a deadlock -- each holds
279 * the other's needed lock.
/* Refresh the shared clock; by default the next pass is one sleep period away. */
284 watchdog_time = time(NULL);
285 next_time = watchdog_time + watchdog_sleep_time;
286 foreach_dlist(p, wd_queue) {
287 if (p->next_fire <= watchdog_time) {
288 /* Run the callback */
/* NOTE(review): next_fire is printed with %d; its declared type is not
 * visible here -- check the specifier if it is wider than int. */
289 Dmsg2(3400, "Watchdog callback p=0x%p fire=%d\n", p, p->next_fire);
292 /* Reschedule (or move to inactive list if it's a one-shot timer) */
295 wd_inactive->append(p);
298 p->next_fire = watchdog_time + p->interval;
/* Pull the wake-up time forward to the earliest pending watchdog. */
301 if (p->next_fire <= next_time) {
302 next_time = p->next_fire;
308 * Wait sleep time or until someone wakes us
/* Build an absolute deadline: the current time plus the seconds remaining
 * until next_time. */
310 gettimeofday(&tv, &tz);
311 timeout.tv_nsec = tv.tv_usec * 1000;
312 timeout.tv_sec = tv.tv_sec + next_time - time(NULL);
/* Bring tv_nsec back below one second. */
313 while (timeout.tv_nsec >= 1000000000) {
314 timeout.tv_nsec -= 1000000000;
/* NOTE(review): the tv_sec difference is printed with %d; time_t is often
 * wider than int -- check the specifier. */
318 Dmsg1(1900, "pthread_cond_timedwait %d\n", timeout.tv_sec - tv.tv_sec);
319 /* Note, this unlocks mutex during the sleep */
/* The return value is not examined: a timeout and a signal are treated alike. */
321 pthread_cond_timedwait(&timer, &timer_mutex, &timeout);
325 Dmsg0(800, "NicB-reworked watchdog thread exited\n");
330 * Watchdog lock. This can be called multiple times by the same
331 * thread without blocking, but must be unlocked the same number of
332 * times it was locked.
/*
 * Take the watchdog lock by write-locking `lock` with rwl_writelock().
 * A non-zero status is reported through Jmsg1 with M_ABORT, with the status
 * rendered via be.bstrerror().
 */
334 static void wd_lock()
337 if ((errstat=rwl_writelock(&lock)) != 0) {
339 Jmsg1(NULL, M_ABORT, 0, _("rwl_writelock failure. ERR=%s\n"),
340 be.bstrerror(errstat));
345 * Unlock the watchdog. This can be called multiple times by the
346 * same thread, up to the number of times that thread called wd_lock().
/*
 * Release the watchdog lock by calling rwl_writeunlock() on `lock`.
 * A non-zero status is reported through Jmsg1 with M_ABORT, with the status
 * rendered via be.bstrerror().
 */
349 static void wd_unlock()
352 if ((errstat=rwl_writeunlock(&lock)) != 0) {
354 Jmsg1(NULL, M_ABORT, 0, _("rwl_writeunlock failure. ERR=%s\n"),
355 be.bstrerror(errstat));