2 Bacula® - The Network Backup Solution
4 Copyright (C) 2002-2008 Free Software Foundation Europe e.V.
6 The main author of Bacula is Kern Sibbald, with contributions from
7 many others, a complete list can be found in the file AUTHORS.
8 This program is Free Software; you can redistribute it and/or
9 modify it under the terms of version two of the GNU General Public
10 License as published by the Free Software Foundation and included
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Bacula® is a registered trademark of Kern Sibbald.
24 The licensor of Bacula is the Free Software Foundation Europe
25 (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
26 Switzerland, email:ftf@fsfeurope.org.
29 * Bacula thread watchdog routine. General routine that
30 * allows setting a watchdog timer with a callback that is
31 * called when the timer goes off.
33 * Kern Sibbald, January MMII
40 /* Exported globals */
/* Clock sample shared with the rest of the program: start_watchdog and the watchdog thread assign time(NULL) to it. */
41 utime_t watchdog_time = 0; /* this has granularity of SLEEP_TIME */
/* Added to watchdog_time by the watchdog thread to get its default next wake-up time. */
42 utime_t watchdog_sleep_time = 60; /* examine things every 60 seconds */
/* Mutex and condition variable pair passed to pthread_cond_timedwait by the watchdog thread; ping_watchdog signals the condition. */
45 static pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER;
46 static pthread_cond_t timer = PTHREAD_COND_INITIALIZER;
48 /* Forward referenced functions */
/* Thread entry point handed to pthread_create in start_watchdog. */
49 extern "C" void *watchdog_thread(void *arg);
/* Helpers that take and release the watchdog lock through rwl_writelock and rwl_writeunlock. */
51 static void wd_lock();
52 static void wd_unlock();
/*
 * File-local watchdog state.
 */
/* Set to true by stop_watchdog to tell the watchdog thread to stop. */
55 static bool quit = false;
/* NOTE(review): presumably true once start_watchdog has completed; the code that sets and tests it is not visible in this listing. */
56 static bool wd_is_init = false;
/* Initialised with rwl_init in start_watchdog; taken and released by wd_lock and wd_unlock. */
57 static brwlock_t lock; /* watchdog lock */
/* Thread id filled in by pthread_create in start_watchdog and joined in stop_watchdog. */
59 static pthread_t wd_tid;
/* Active list: register_watchdog appends to it and the watchdog thread walks it. */
60 static dlist *wd_queue;
/* Inactive list: the watchdog thread appends entries here; stop_watchdog drains it. */
61 static dlist *wd_inactive;
64 * Start watchdog thread
66 * Returns: 0 on success
/*
 * Set up the watchdog subsystem and launch its worker thread.
 *
 * Visible steps: stamp watchdog_time with the current time, initialise
 * the watchdog lock, allocate the active and inactive queues, then
 * create the thread that runs watchdog_thread.
 *
 * NOTE(review): this listing skips some lines of the body (local
 * declarations and the return statements are not shown), so the exact
 * return values cannot be confirmed from here.
 */
69 int start_watchdog(void)
/* Never dereferenced as an object in the visible code; it only supplies &dummy->link to the dlist constructor (presumably the position of the link member -- confirm against dlist). */
72 watchdog_t *dummy = NULL;
78 Dmsg0(800, "Initialising NicB-hacked watchdog thread\n");
79 watchdog_time = time(NULL);
/* A non-zero status from rwl_init is reported through Jmsg1 with M_ABORT. */
81 if ((errstat=rwl_init(&lock)) != 0) {
83 Jmsg1(NULL, M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"),
84 be.bstrerror(errstat));
/* wd_queue receives registered watchdogs; wd_inactive receives entries moved there by the watchdog thread. */
86 wd_queue = New(dlist(dummy, &dummy->link));
87 wd_inactive = New(dlist(dummy, &dummy->link));
/* pthread_create stores the new thread id in wd_tid; a non-zero result means the thread was not created. */
90 if ((stat = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) {
97 * Wake watchdog timer thread so that it walks the
98 * queue and adjusts its wait time (or exits).
/*
 * Signal the timer condition variable, which is the one the watchdog
 * thread blocks on in pthread_cond_timedwait.
 *
 * NOTE(review): any locking of timer_mutex around the signal is not
 * visible in this listing.
 */
100 static void ping_watchdog()
103 pthread_cond_signal(&timer);
109 * Terminate the watchdog thread
111 * Returns: 0 on success
/*
 * Shut the watchdog down: raise the quit flag, wait for the watchdog
 * thread to finish, then empty both queues.
 *
 * NOTE(review): lines are missing from this listing. The code between
 * setting quit and pthread_join (presumably a wake-up of the thread),
 * the bodies of the destructor checks, and the return statement are
 * not shown.
 */
114 int stop_watchdog(void)
123 quit = true; /* notify watchdog thread to stop */
/* Block until the watchdog thread has exited; stat keeps the join result. */
126 stat = pthread_join(wd_tid, NULL);
/* Drain the active queue one item at a time, always taking the head. */
128 while (!wd_queue->empty()) {
129 void *item = wd_queue->first();
130 wd_queue->remove(item);
131 p = (watchdog_t *)item;
/* Only items that carry a destructor get the extra handling guarded here. */
132 if (p->destructor != NULL) {
/* Same drain, applied to the inactive queue. */
140 while (!wd_inactive->empty()) {
141 void *item = wd_inactive->first();
142 wd_inactive->remove(item);
143 p = (watchdog_t *)item;
144 if (p->destructor != NULL) {
/*
 * Allocate storage for one watchdog_t with malloc.
 * The visible initialisation clears the destructor pointer.
 *
 * NOTE(review): the remaining field initialisation, any NULL check on
 * the allocation, and the return statement are not visible in this
 * listing.
 */
157 watchdog_t *new_watchdog(void)
159 watchdog_t *wd = (watchdog_t *)malloc(sizeof(watchdog_t));
171 wd->destructor = NULL;
/*
 * Put a caller-prepared watchdog on the active queue.
 *
 * wd must already have a callback and a non-zero interval; each
 * violation is reported through Jmsg1 with M_ABORT. The first fire
 * time is watchdog_time plus the interval.
 *
 * NOTE(review): the condition guarding the first message, the locking,
 * and the return value are on lines this listing does not show.
 */
177 bool register_watchdog(watchdog_t *wd)
180 Jmsg0(NULL, M_ABORT, 0, _("BUG! register_watchdog called before start_watchdog\n"));
182 if (wd->callback == NULL) {
183 Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has NULL callback\n"), wd);
185 if (wd->interval == 0) {
186 Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has zero interval\n"), wd);
/* Scheduled relative to watchdog_time (the last stored clock sample), not a fresh time() call. */
190 wd->next_fire = watchdog_time + wd->interval;
191 wd_queue->append(wd);
/* NOTE(review): interval is printed with %d; confirm that its type matches the format. */
192 Dmsg3(800, "Registered watchdog %p, interval %d%s\n",
193 wd, wd->interval, wd->one_shot ? " one shot" : "");
/*
 * Take wd off whichever watchdog list it is on.
 *
 * The active queue is scanned first, then the inactive queue; a debug
 * message records which list it was removed from, or that the removal
 * failed.
 *
 * NOTE(review): the match test inside each loop, the locking, and the
 * return values are not visible in this listing. The abort text below
 * names unregister_watchdog_unlocked, which differs from the name of
 * this function.
 */
200 bool unregister_watchdog(watchdog_t *wd)
206 Jmsg0(NULL, M_ABORT, 0, _("BUG! unregister_watchdog_unlocked called before start_watchdog\n"));
/* First look on the active queue. */
210 foreach_dlist(p, wd_queue) {
212 wd_queue->remove(wd);
213 Dmsg1(800, "Unregistered watchdog %p\n", wd);
/* Then look on the inactive queue. */
219 foreach_dlist(p, wd_inactive) {
221 wd_inactive->remove(wd);
222 Dmsg1(800, "Unregistered inactive watchdog %p\n", wd);
228 Dmsg1(800, "Failed to unregister watchdog %p\n", wd);
237 * This is the thread that walks the watchdog queue
238 * and when a queue item fires, the callback is
239 * invoked. If it is a one shot, the queue item
240 * is moved to the inactive queue.
/*
 * Body of the watchdog thread.
 *
 * Each pass samples the clock into watchdog_time, walks wd_queue, and
 * handles every entry whose next_fire has been reached. It then waits
 * on the timer condition variable until the earliest upcoming
 * next_fire, capped at watchdog_sleep_time seconds ahead, or until the
 * condition is signalled.
 *
 * NOTE(review): this listing omits lines, including the loop header,
 * the callback invocation, the one-shot test, and the locking calls.
 */
242 extern "C" void *watchdog_thread(void *arg)
244 struct timespec timeout;
249 set_jcr_in_tsd(INVALID_JCR);
250 Dmsg0(800, "NicB-reworked watchdog thread entered\n");
257 * NOTE. lock_jcr_chain removed, but the message below
258 * was left until we are sure there are no deadlocks.
260 * We lock the jcr chain here because a good number of the
261 * callback routines lock the jcr chain. We need to lock
262 * it here *before* the watchdog lock because the SD message
263 * thread first locks the jcr chain, then when closing the
264 * job locks the watchdog chain. If the two threads do not
265 * lock in the same order, we get a deadlock -- each holds
266 * the other's needed lock.
271 watchdog_time = time(NULL);
/* Start from the default wake-up time; entries due sooner pull it earlier below. */
272 next_time = watchdog_time + watchdog_sleep_time;
273 foreach_dlist(p, wd_queue) {
274 if (p->next_fire <= watchdog_time) {
275 /* Run the callback */
/* NOTE(review): next_fire is printed with %d and the pointer with 0x%p; confirm that the formats match the argument types. */
276 Dmsg2(3400, "Watchdog callback p=0x%p fire=%d\n", p, p->next_fire);
279 /* Reschedule (or move to inactive list if it's a one-shot timer) */
/* NOTE(review): p is put on wd_inactive while foreach_dlist is still walking wd_queue; confirm the walk stays valid (the matching removal is not visible here). */
282 wd_inactive->append(p);
/* Re-arm the entry one interval after the clock sample taken for this pass. */
285 p->next_fire = watchdog_time + p->interval;
/* Track the soonest fire time seen across the queue. */
288 if (p->next_fire <= next_time) {
289 next_time = p->next_fire;
295 * Wait sleep time or until someone wakes us
/* pthread_cond_timedwait takes an absolute deadline, so it is built from the current wall-clock time plus the seconds remaining until next_time. */
297 gettimeofday(&tv, &tz);
298 timeout.tv_nsec = tv.tv_usec * 1000;
299 timeout.tv_sec = tv.tv_sec + next_time - time(NULL);
/* Bring tv_nsec below one second (the matching seconds adjustment is presumably on an omitted line). */
300 while (timeout.tv_nsec >= 1000000000) {
301 timeout.tv_nsec -= 1000000000;
305 Dmsg1(1900, "pthread_cond_timedwait %d\n", timeout.tv_sec - tv.tv_sec);
306 /* Note, this unlocks mutex during the sleep */
308 pthread_cond_timedwait(&timer, &timer_mutex, &timeout);
312 Dmsg0(800, "NicB-reworked watchdog thread exited\n");
317 * Watchdog lock, this can be called multiple times by the same
318 * thread without blocking, but must be unlocked the number of
319 * times it was locked.
/*
 * Take the watchdog lock through rwl_writelock. A non-zero status is
 * reported through Jmsg1 with M_ABORT.
 */
321 static void wd_lock()
324 if ((errstat=rwl_writelock(&lock)) != 0) {
326 Jmsg1(NULL, M_ABORT, 0, _("rwl_writelock failure. ERR=%s\n"),
327 be.bstrerror(errstat));
332 * Unlock the watchdog. This can be called multiple times by the
333 * same thread up to the number of times that thread called wd_lock().
/*
 * Release the watchdog lock through rwl_writeunlock. A non-zero status
 * is reported through Jmsg1 with M_ABORT.
 */
336 static void wd_unlock()
339 if ((errstat=rwl_writeunlock(&lock)) != 0) {
341 Jmsg1(NULL, M_ABORT, 0, _("rwl_writeunlock failure. ERR=%s\n"),
342 be.bstrerror(errstat));