2 * Bacula thread watchdog routine. General routine that
3 * allows setting a watchdog timer with a callback that is
4 * called when the timer goes off.
6 * Kern Sibbald, January MMII
10 Copyright (C) 2000-2006 Kern Sibbald
12 This program is free software; you can redistribute it and/or
13 modify it under the terms of the GNU General Public License
14 version 2 as amended with additional clauses defined in the
15 file LICENSE in the main source directory.
17 This program is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 file LICENSE for additional details.
27 /* Exported globals */
28 time_t watchdog_time = 0; /* this has granularity of SLEEP_TIME; refreshed with time(NULL) by start_watchdog() and on each pass of watchdog_thread() */
29 time_t watchdog_sleep_time = 60; /* examine things every 60 seconds */
32 static pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER; /* mutex handed to pthread_cond_timedwait() together with `timer` */
33 static pthread_cond_t timer = PTHREAD_COND_INITIALIZER; /* signalled by ping_watchdog(), waited on by watchdog_thread() */
35 /* Forward referenced functions */
36 extern "C" void *watchdog_thread(void *arg);
38 static void wd_lock();
39 static void wd_unlock();
42 static bool quit = false; /* set to true by stop_watchdog() to notify the watchdog thread to stop */
43 static bool wd_is_init = false; /* NOTE(review): the code that reads or sets this flag is not visible in this listing */
44 static brwlock_t lock; /* watchdog lock; set up with rwl_init(), taken and released through wd_lock()/wd_unlock() */
46 static pthread_t wd_tid; /* watchdog thread id: filled in by pthread_create(), joined by stop_watchdog() */
47 static dlist *wd_queue; /* registered watchdogs; register_watchdog() appends, watchdog_thread() walks it */
48 static dlist *wd_inactive; /* watchdogs that watchdog_thread() appended here rather than rescheduling */
51 * Start watchdog thread
53 * Returns: 0 on success
56 int start_watchdog(void) /* Set up the watchdog lock and both lists, then spawn watchdog_thread(). NOTE(review): several lines of this body, including its returns, are missing from this listing. */
59 watchdog_t *dummy = NULL; /* never points at a real object here; only used to form &dummy->link for the dlist constructors below */
65 Dmsg0(800, "Initialising NicB-hacked watchdog thread\n");
66 watchdog_time = time(NULL); /* seed the coarse clock that next_fire values are computed from */
68 if ((errstat=rwl_init(&lock)) != 0) { /* non-zero errstat: the watchdog lock could not be initialised */
69 Emsg1(M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"), /* reported at M_ABORT severity */
72 wd_queue = New(dlist(dummy, &dummy->link)); /* list that register_watchdog() appends to */
73 wd_inactive = New(dlist(dummy, &dummy->link)); /* list that watchdog_thread() appends fired entries to */
75 if ((stat = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) { /* non-zero stat: thread creation failed; the handling is not visible here */
83 * Wake watchdog timer thread so that it walks the
84 * queue and adjusts its wait time (or exits).
86 static void ping_watchdog() /* Wake the watchdog thread before its timed wait expires. */
89 pthread_cond_signal(&timer); /* watchdog_thread() waits on `timer` in pthread_cond_timedwait(); NOTE(review): the surrounding lines are missing from this listing, so whether timer_mutex is held around the signal cannot be confirmed */
94 * Terminate the watchdog thread
96 * Returns: 0 on success
99 int stop_watchdog(void) /* Ask the watchdog thread to stop, join it, then empty both watchdog lists. NOTE(review): parts of this body, including its returns, are missing from this listing. */
108 quit = true; /* notify watchdog thread to stop */
112 stat = pthread_join(wd_tid, NULL); /* wait for watchdog_thread() to finish before the lists are drained */
114 while (!wd_queue->empty()) { /* drain the list of registered watchdogs, one entry per iteration */
115 void *item = wd_queue->first();
116 wd_queue->remove(item); /* unlink the head entry before it is inspected */
117 p = (watchdog_t *)item;
118 if (p->destructor != NULL) { /* destructor is optional (new_watchdog() sets it to NULL); what is done with it is not visible here */
126 while (!wd_inactive->empty()) { /* same drain, applied to the inactive list */
127 void *item = wd_inactive->first();
128 wd_inactive->remove(item);
129 p = (watchdog_t *)item;
130 if (p->destructor != NULL) { /* same optional-destructor test as for wd_queue */
142 watchdog_t *new_watchdog(void) /* Allocate a watchdog_t with malloc() and give it default field values. NOTE(review): most of this body is missing from this listing. */
144 watchdog_t *wd = (watchdog_t *)malloc(sizeof(watchdog_t)); /* NOTE(review): no check of the malloc() result is visible in this listing */
156 wd->destructor = NULL; /* no destructor by default; stop_watchdog() tests this field against NULL */
162 bool register_watchdog(watchdog_t *wd) /* Validate wd, compute its first fire time and append it to wd_queue. NOTE(review): the guard conditions, locking and return are on lines missing from this listing. */
165 Emsg0(M_ABORT, 0, _("BUG! register_watchdog called before start_watchdog\n")); /* the condition guarding this abort is not visible here */
167 if (wd->callback == NULL) { /* a callback is mandatory */
168 Emsg1(M_ABORT, 0, _("BUG! Watchdog %p has NULL callback\n"), wd);
170 if (wd->interval == 0) { /* a zero interval is rejected */
171 Emsg1(M_ABORT, 0, _("BUG! Watchdog %p has zero interval\n"), wd);
175 wd->next_fire = watchdog_time + wd->interval; /* based on the coarse watchdog_time, not on a fresh time(NULL) */
176 wd_queue->append(wd); /* watchdog_thread() walks wd_queue */
177 Dmsg3(800, "Registered watchdog %p, interval %d%s\n",
178 wd, wd->interval, wd->one_shot ? " one shot" : "");
185 bool unregister_watchdog(watchdog_t *wd) /* Remove wd from wd_queue or, failing that, from wd_inactive. NOTE(review): the guard condition, the per-entry comparisons, locking and returns are on lines missing from this listing. */
191 Emsg0(M_ABORT, 0, _("BUG! unregister_watchdog called before start_watchdog\n")); /* message names this function, matching the wording used by register_watchdog() */
195 foreach_dlist(p, wd_queue) { /* search the list of registered watchdogs first */
197 wd_queue->remove(wd);
198 Dmsg1(800, "Unregistered watchdog %p\n", wd);
204 foreach_dlist(p, wd_inactive) { /* then search the inactive list */
206 wd_inactive->remove(wd);
207 Dmsg1(800, "Unregistered inactive watchdog %p\n", wd);
213 Dmsg1(800, "Failed to unregister watchdog %p\n", wd); /* logged after both list searches */
222 * This is the thread that walks the watchdog queue
223 * and when a queue item fires, the callback is
224 * invoked. If it is a one shot, the queue item
225 * is moved to the inactive queue.
227 extern "C" void *watchdog_thread(void *arg) /* Thread body: fire due watchdogs from wd_queue, then sleep until the next one is due or the thread is woken. NOTE(review): the loop header, locking calls and callback invocation are on lines missing from this listing. */
229 struct timespec timeout; /* deadline handed to pthread_cond_timedwait() */
234 Dmsg0(800, "NicB-reworked watchdog thread entered\n");
241 * NOTE. lock_jcr_chain removed, but the message below
242 * was left until we are sure there are no deadlocks.
244 * We lock the jcr chain here because a good number of the
245 * callback routines lock the jcr chain. We need to lock
246 * it here *before* the watchdog lock because the SD message
247 * thread first locks the jcr chain, then when closing the
248 * job locks the watchdog chain. If the two threads do not
249 * lock in the same order, we get a deadlock -- each holds
250 * the other's needed lock.
255 watchdog_time = time(NULL); /* refresh the coarse clock for this pass */
256 next_time = watchdog_time + watchdog_sleep_time; /* latest wake-up time; lowered below when a watchdog is due sooner */
257 foreach_dlist(p, wd_queue) {
258 if (p->next_fire <= watchdog_time) { /* this watchdog is due */
259 /* Run the callback */
260 Dmsg2(3400, "Watchdog callback p=0x%p fire=%d\n", p, p->next_fire); /* NOTE(review): %d is used for next_fire; confirm it matches the field's type */
263 /* Reschedule (or move to inactive list if it's a one-shot timer) */
266 wd_inactive->append(p);
269 p->next_fire = watchdog_time + p->interval; /* next firing is one interval after this pass's clock value */
272 if (p->next_fire <= next_time) { /* track the earliest upcoming fire time */
273 next_time = p->next_fire;
279 * Wait sleep time or until someone wakes us
281 gettimeofday(&tv, &tz);
282 timeout.tv_nsec = tv.tv_usec * 1000; /* microseconds to nanoseconds */
283 timeout.tv_sec = tv.tv_sec + next_time - time(NULL); /* current second plus the seconds remaining until next_time */
284 while (timeout.tv_nsec >= 1000000000) { /* bring tv_nsec back under one second */
285 timeout.tv_nsec -= 1000000000;
289 Dmsg1(1900, "pthread_cond_timedwait %d\n", timeout.tv_sec - tv.tv_sec); /* logs the wait length in seconds */
290 /* Note, this unlocks mutex during the sleep */
292 pthread_cond_timedwait(&timer, &timer_mutex, &timeout); /* returns at the deadline or earlier, e.g. when ping_watchdog() signals `timer` */
296 Dmsg0(800, "NicB-reworked watchdog thread exited\n");
301 * Watchdog lock. This can be called multiple times by the same
302 * thread without blocking, but must be unlocked the same number of
303 * times it was locked.
305 static void wd_lock() /* Take the watchdog lock in write mode through rwl_writelock(). */
308 if ((errstat=rwl_writelock(&lock)) != 0) { /* a non-zero return is reported as a failure */
309 Emsg1(M_ABORT, 0, _("rwl_writelock failure. ERR=%s\n"), /* reported at M_ABORT severity; the remaining argument is on a line missing from this listing */
315 * Unlock the watchdog. This can be called multiple times by the
316 * same thread up to the number of times that thread called wd_lock().
319 static void wd_unlock() /* Release one write hold on the watchdog lock through rwl_writeunlock(). */
322 if ((errstat=rwl_writeunlock(&lock)) != 0) { /* a non-zero return is reported as a failure */
323 Emsg1(M_ABORT, 0, _("rwl_writeunlock failure. ERR=%s\n"), /* reported at M_ABORT severity; the remaining argument is on a line missing from this listing */