2 * Bacula thread watchdog routine. General routine that
3 * allows setting a watchdog timer with a callback that is
4 * called when the timer goes off.
6 * Kern Sibbald, January MMII
10 Bacula® - The Network Backup Solution
12 Copyright (C) 2002-2006 Free Software Foundation Europe e.V.
14 The main author of Bacula is Kern Sibbald, with contributions from
15 many others, a complete list can be found in the file AUTHORS.
16 This program is Free Software; you can redistribute it and/or
17 modify it under the terms of version two of the GNU General Public
18 License as published by the Free Software Foundation plus additions
19 that are listed in the file LICENSE.
21 This program is distributed in the hope that it will be useful, but
22 WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 General Public License for more details.
26 You should have received a copy of the GNU General Public License
27 along with this program; if not, write to the Free Software
28 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
31 Bacula® is a registered trademark of John Walker.
32 The licensor of Bacula is the Free Software Foundation Europe
33 (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
34 Switzerland, email:ftf@fsfeurope.org.
40 /* Exported globals */
41 time_t watchdog_time = 0; /* this has granularity of SLEEP_TIME */
42 time_t watchdog_sleep_time = 60; /* examine things every 60 seconds */
45 static pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER; /* mutex passed to pthread_cond_timedwait() together with timer */
46 static pthread_cond_t timer = PTHREAD_COND_INITIALIZER; /* signalled by ping_watchdog() to wake the watchdog thread */
48 /* Forward referenced functions */
49 extern "C" void *watchdog_thread(void *arg);
51 static void wd_lock();
52 static void wd_unlock();
55 static bool quit = false; /* set by stop_watchdog() to notify the watchdog thread to stop */
56 static bool wd_is_init = false;
57 static brwlock_t lock; /* watchdog lock */
59 static pthread_t wd_tid; /* watchdog thread id: created in start_watchdog(), joined in stop_watchdog() */
60 static dlist *wd_queue; /* registered watchdogs walked by watchdog_thread() */
61 static dlist *wd_inactive; /* one-shot watchdogs that have already fired */
64 * Start watchdog thread
66 * Returns: 0 on success
/*
 * start_watchdog: record the current time in watchdog_time, initialise the
 * watchdog rwlock and the active/inactive queues, then create the watchdog
 * thread (its id is stored in wd_tid).
 * NOTE(review): the declarations of stat/errstat and the return statements
 * are not visible in this excerpt -- confirm against the full file.
 */
69 int start_watchdog(void)
72 watchdog_t *dummy = NULL; /* only used to form &dummy->link for the dlist constructor; presumably a member-offset idiom -- confirm against dlist */
78 Dmsg0(800, "Initialising NicB-hacked watchdog thread\n");
79 watchdog_time = time(NULL);
81 if ((errstat=rwl_init(&lock)) != 0) { /* a non-zero return is reported through an M_ABORT message */
82 Emsg1(M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"),
85 wd_queue = New(dlist(dummy, &dummy->link));
86 wd_inactive = New(dlist(dummy, &dummy->link));
89 if ((stat = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) { /* start the timer thread; non-zero stat means creation failed */
96 * Wake watchdog timer thread so that it walks the
97 * queue and adjusts its wait time (or exits).
/*
 * ping_watchdog: signal the timer condition variable that watchdog_thread()
 * waits on in pthread_cond_timedwait().
 * NOTE(review): whether timer_mutex is held around the signal is not visible
 * in this excerpt -- confirm against the full file.
 */
99 static void ping_watchdog()
102 pthread_cond_signal(&timer);
107 * Terminate the watchdog thread
109 * Returns: 0 on success
/*
 * stop_watchdog: set the quit flag, join the watchdog thread, then empty
 * both the active queue (wd_queue) and the inactive queue (wd_inactive).
 * Each removed item is checked for a non-NULL destructor.
 * NOTE(review): what is done with the destructor and the item afterwards, and
 * the value returned, are not visible in this excerpt.
 */
112 int stop_watchdog(void)
121 quit = true; /* notify watchdog thread to stop */
125 stat = pthread_join(wd_tid, NULL); /* wait for the thread to finish before the queues are drained below */
127 while (!wd_queue->empty()) { /* drain the active queue from the front */
128 void *item = wd_queue->first();
129 wd_queue->remove(item);
130 p = (watchdog_t *)item;
131 if (p->destructor != NULL) { /* destructor is optional; new_watchdog() initialises it to NULL */
139 while (!wd_inactive->empty()) { /* same drain for watchdogs that already fired */
140 void *item = wd_inactive->first();
141 wd_inactive->remove(item);
142 p = (watchdog_t *)item;
143 if (p->destructor != NULL) {
/*
 * new_watchdog: allocate a watchdog_t with malloc() and clear its destructor.
 * NOTE(review): no check of the malloc() result and no initialisation of the
 * other fields is visible in this excerpt -- confirm against the full file.
 */
155 watchdog_t *new_watchdog(void)
157 watchdog_t *wd = (watchdog_t *)malloc(sizeof(watchdog_t)); /* the cast is required because this file is compiled as C++ */
169 wd->destructor = NULL;
/*
 * register_watchdog: validate a caller-filled watchdog and append it to the
 * active queue. A NULL callback or a zero interval is reported as a BUG
 * through an M_ABORT message. The first firing time is one interval after
 * the current watchdog_time.
 * NOTE(review): the condition guarding the "called before start_watchdog"
 * message, any locking, and the return value are not visible in this excerpt.
 */
175 bool register_watchdog(watchdog_t *wd)
178 Emsg0(M_ABORT, 0, _("BUG! register_watchdog called before start_watchdog\n"));
180 if (wd->callback == NULL) {
181 Emsg1(M_ABORT, 0, _("BUG! Watchdog %p has NULL callback\n"), wd);
183 if (wd->interval == 0) {
184 Emsg1(M_ABORT, 0, _("BUG! Watchdog %p has zero interval\n"), wd);
188 wd->next_fire = watchdog_time + wd->interval; /* based on the cached watchdog_time, not a fresh time(NULL) */
189 wd_queue->append(wd);
190 Dmsg3(800, "Registered watchdog %p, interval %d%s\n",
191 wd, wd->interval, wd->one_shot ? " one shot" : ""); /* NOTE(review): confirm %d matches the declared type of interval */
/*
 * unregister_watchdog: look for wd first in the active queue (wd_queue) and
 * then in the inactive queue (wd_inactive), remove it from the queue where
 * it is found, and log the outcome at debug level 800.
 * NOTE(review): the comparison inside each loop, how the loops are left after
 * a removal, any locking, and the return values are not visible in this
 * excerpt -- confirm against the full file.
 */
198 bool unregister_watchdog(watchdog_t *wd)
204 Emsg0(M_ABORT, 0, _("BUG! unregister_watchdog called before start_watchdog\n"));
208 foreach_dlist(p, wd_queue) {
210 wd_queue->remove(wd);
211 Dmsg1(800, "Unregistered watchdog %p\n", wd);
217 foreach_dlist(p, wd_inactive) {
219 wd_inactive->remove(wd);
220 Dmsg1(800, "Unregistered inactive watchdog %p\n", wd);
226 Dmsg1(800, "Failed to unregister watchdog %p\n", wd);
235 * This is the thread that walks the watchdog queue
236 * and when a queue item fires, the callback is
237 * invoked. If it is a one shot, the queue item
238 * is moved to the inactive queue.
/*
 * watchdog_thread: body of the watchdog thread created by start_watchdog().
 * On each pass it refreshes watchdog_time, walks wd_queue, handles every
 * entry whose next_fire has been reached, works out the earliest upcoming
 * fire time (at most watchdog_sleep_time from now), and then waits on the
 * timer condition until that deadline or until it is signalled.
 * NOTE(review): the outer loop, the quit test, the callback invocation and
 * the lock/unlock calls are not visible in this excerpt -- confirm against
 * the full file.
 */
240 extern "C" void *watchdog_thread(void *arg)
242 struct timespec timeout;
247 Dmsg0(800, "NicB-reworked watchdog thread entered\n");
254 * NOTE. lock_jcr_chain removed, but the message below
255 * was left until we are sure there are no deadlocks.
257 * We lock the jcr chain here because a good number of the
258 * callback routines lock the jcr chain. We need to lock
259 * it here *before* the watchdog lock because the SD message
260 * thread first locks the jcr chain, then when closing the
261 * job locks the watchdog chain. If the two threads do not
262 * lock in the same order, we get a deadlock -- each holds
263 * the other's needed lock.
268 watchdog_time = time(NULL);
269 next_time = watchdog_time + watchdog_sleep_time; /* default wake-up: one sleep period from now */
270 foreach_dlist(p, wd_queue) {
271 if (p->next_fire <= watchdog_time) {
272 /* Run the callback */
273 Dmsg2(3400, "Watchdog callback p=0x%p fire=%d\n", p, p->next_fire);
276 /* Reschedule (or move to inactive list if it's a one-shot timer) */
279 wd_inactive->append(p);
282 p->next_fire = watchdog_time + p->interval;
285 if (p->next_fire <= next_time) {
286 next_time = p->next_fire; /* keep the earliest pending fire time as the wake-up time */
292 * Wait sleep time or until someone wakes us
294 gettimeofday(&tv, &tz);
295 timeout.tv_nsec = tv.tv_usec * 1000; /* microseconds to nanoseconds */
296 timeout.tv_sec = tv.tv_sec + next_time - time(NULL); /* absolute deadline: now plus the seconds left until next_time */
297 while (timeout.tv_nsec >= 1000000000) { /* keep tv_nsec below one second */
298 timeout.tv_nsec -= 1000000000; /* NOTE(review): the matching tv_sec adjustment is not visible in this excerpt */
302 Dmsg1(1900, "pthread_cond_timedwait %d\n", timeout.tv_sec - tv.tv_sec); /* NOTE(review): confirm %d matches the width of a tv_sec difference on the target platforms */
303 /* Note, this unlocks mutex during the sleep */
305 pthread_cond_timedwait(&timer, &timer_mutex, &timeout); /* return value is not examined on this line */
309 Dmsg0(800, "NicB-reworked watchdog thread exited\n");
314 * Watchdog lock, this can be called multiple times by the same
315 * thread without blocking, but must be unlocked the number of
316 * times it was locked.
/*
 * wd_lock: take the watchdog lock for writing via rwl_writelock(); a non-zero
 * return is reported through an M_ABORT message.
 */
318 static void wd_lock()
321 if ((errstat=rwl_writelock(&lock)) != 0) {
322 Emsg1(M_ABORT, 0, _("rwl_writelock failure. ERR=%s\n"),
328 * Unlock the watchdog. This can be called multiple times by the
329 * same thread up to the number of times that thread called wd_lock().
/*
 * wd_unlock: release the watchdog write lock via rwl_writeunlock(); a
 * non-zero return is reported through an M_ABORT message.
 */
332 static void wd_unlock()
335 if ((errstat=rwl_writeunlock(&lock)) != 0) {
336 Emsg1(M_ABORT, 0, _("rwl_writeunlock failure. ERR=%s\n"),