2 * Bacula thread watchdog routine. General routine that monitors
3 * the daemon and signals a thread if it is blocked on a BSOCK
4 * too long. This prevents catastrophic long waits -- generally
5 * due to Windows "hanging" the app.
7 * Kern Sibbald, January MMII
11 Copyright (C) 2000-2004 Kern Sibbald and John Walker
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of
16 the License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public
24 License along with this program; if not, write to the Free
25 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
33 /* Exported globals */
34 time_t watchdog_time = 0; /* time(NULL) as last sampled by the watchdog; this has granularity of SLEEP_TIME */
36 #define SLEEP_TIME 1 /* examine things every second */
38 /* Forward referenced functions */
39 static void *watchdog_thread(void *arg);
40 static void wd_lock();
41 static void wd_unlock();
44 static bool quit = false; /* set true by stop_watchdog() to notify the watchdog thread to stop */
45 static bool wd_is_init = false; /* NOTE(review): presumably set once start_watchdog() has run -- the assignment is not visible here, confirm */
46 static brwlock_t lock; /* watchdog lock */
48 static pthread_t wd_tid; /* watchdog thread id: created in start_watchdog(), joined in stop_watchdog() */
49 static dlist *wd_queue; /* registered watchdogs; walked by the watchdog thread */
50 static dlist *wd_inactive; /* one-shot watchdogs are appended here by the watchdog thread instead of being rescheduled */
53 * Start watchdog thread
55 * Returns: 0 on success
58 int start_watchdog(void)
61 watchdog_t *dummy = NULL;
67 Dmsg0(400, "Initialising NicB-hacked watchdog thread\n");
68 watchdog_time = time(NULL);
70 if ((errstat=rwl_init(&lock)) != 0) {
71 Emsg1(M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"),
74 wd_queue = new dlist(wd_queue, &dummy->link);
75 wd_inactive = new dlist(wd_inactive, &dummy->link);
77 if ((stat = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) {
85 * Terminate the watchdog thread
87 * Returns: 0 on success
90 int stop_watchdog(void)
99 quit = true; /* notify watchdog thread to stop */
102 stat = pthread_join(wd_tid, NULL);
104 while (!wd_queue->empty()) {
105 void *item = wd_queue->first();
106 wd_queue->remove(item);
107 p = (watchdog_t *)item;
108 if (p->destructor != NULL) {
116 while (!wd_inactive->empty()) {
117 void *item = wd_inactive->first();
118 wd_inactive->remove(item);
119 p = (watchdog_t *)item;
120 if (p->destructor != NULL) {
132 watchdog_t *new_watchdog(void)
134 watchdog_t *wd = (watchdog_t *)malloc(sizeof(watchdog_t));
146 wd->destructor = NULL;
152 bool register_watchdog(watchdog_t *wd)
155 Emsg0(M_ABORT, 0, "BUG! register_watchdog called before start_watchdog\n");
157 if (wd->callback == NULL) {
158 Emsg1(M_ABORT, 0, "BUG! Watchdog %p has NULL callback\n", wd);
160 if (wd->interval == 0) {
161 Emsg1(M_ABORT, 0, "BUG! Watchdog %p has zero interval\n", wd);
165 wd->next_fire = watchdog_time + wd->interval;
166 wd_queue->append(wd);
167 Dmsg3(400, "Registered watchdog %p, interval %d%s\n",
168 wd, wd->interval, wd->one_shot ? " one shot" : "");
174 bool unregister_watchdog_unlocked(watchdog_t *wd)
179 Emsg0(M_ABORT, 0, "BUG! unregister_watchdog_unlocked called before start_watchdog\n");
182 foreach_dlist(p, wd_queue) {
184 wd_queue->remove(wd);
185 Dmsg1(400, "Unregistered watchdog %p\n", wd);
190 foreach_dlist(p, wd_inactive) {
192 wd_inactive->remove(wd);
193 Dmsg1(400, "Unregistered inactive watchdog %p\n", wd);
198 Dmsg1(400, "Failed to unregister watchdog %p\n", wd);
202 bool unregister_watchdog(watchdog_t *wd)
207 Emsg0(M_ABORT, 0, "BUG! unregister_watchdog called before start_watchdog\n");
211 ret = unregister_watchdog_unlocked(wd);
217 static void *watchdog_thread(void *arg)
219 Dmsg0(400, "NicB-reworked watchdog thread entered\n");
225 * We lock the jcr chain here because a good number of the
226 * callback routines lock the jcr chain. We need to lock
227 * it here *before* the watchdog lock because the SD message
228 * thread first locks the jcr chain, then when closing the
229 * job locks the watchdog chain. If the two threads do not
230 * lock in the same order, we get a deadlock -- each holds
231 * the other's needed lock.
235 watchdog_time = time(NULL);
237 foreach_dlist(p, wd_queue) {
238 if (p->next_fire < watchdog_time) {
239 /* Run the callback */
242 /* Reschedule (or move to inactive list if it's a one-shot timer) */
245 * Note, when removing an item while walking the list
246 * we must get the previous pointer (q) and set the
247 * current pointer (p) to this previous pointer after
248 * removing the current pointer, otherwise, we won't
249 * walk the rest of the list.
251 q = (watchdog_t *)wd_queue->prev(p);
253 wd_inactive->append(p);
256 p->next_fire = watchdog_time + p->interval;
262 bmicrosleep(SLEEP_TIME, 0);
265 Dmsg0(400, "NicB-reworked watchdog thread exited\n");
270 * Watchdog lock, this can be called multiple times by the same
271 * thread without blocking, but must be unlocked the number of
272 * times it was locked.
274 static void wd_lock()
277 if ((errstat=rwl_writelock(&lock)) != 0) {
278 Emsg1(M_ABORT, 0, "rwl_writelock failure. ERR=%s\n",
284 * Unlock the watchdog. This can be called multiple times by the
285 * same thread up to the number of times that thread called
288 static void wd_unlock()
291 if ((errstat=rwl_writeunlock(&lock)) != 0) {
292 Emsg1(M_ABORT, 0, "rwl_writeunlock failure. ERR=%s\n",