2 * Bacula thread watchdog routine. General routine that monitors
3 * the daemon and signals a thread if it is blocked on a BSOCK
4 * too long. This prevents catastrophic long waits -- generally
5 * due to Windows "hanging" the app.
7 * Kern Sibbald, January MMII
11 Copyright (C) 2000-2004 Kern Sibbald and John Walker
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of
16 the License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public
24 License along with this program; if not, write to the Free
25 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
33 /* Exported globals */
34 time_t watchdog_time = 0; /* time last sampled by the watchdog; this has granularity of SLEEP_TIME */
36 #define SLEEP_TIME 1 /* examine things every second */
38 /* Forward referenced functions */
39 extern "C" void *watchdog_thread(void *arg);
41 static void wd_lock();
42 static void wd_unlock();
45 static bool quit = false; /* set by stop_watchdog() to notify the watchdog thread to stop */
46 static bool wd_is_init = false;
47 static brwlock_t lock; /* watchdog lock */
49 static pthread_t wd_tid; /* watchdog thread, created in start_watchdog() and joined in stop_watchdog() */
50 static dlist *wd_queue; /* registered watchdogs waiting to fire */
51 static dlist *wd_inactive; /* one-shot watchdogs moved here after firing */
54 * Start watchdog thread
56 * Returns: 0 on success
59 int start_watchdog(void)
62 watchdog_t *dummy = NULL;
68 Dmsg0(400, "Initialising NicB-hacked watchdog thread\n");
69 watchdog_time = time(NULL);
71 if ((errstat=rwl_init(&lock)) != 0) {
72 Emsg1(M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"),
75 wd_queue = new dlist(wd_queue, &dummy->link);
76 wd_inactive = new dlist(wd_inactive, &dummy->link);
78 if ((stat = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) {
86 * Terminate the watchdog thread
88 * Returns: 0 on success
91 int stop_watchdog(void)
100 quit = true; /* notify watchdog thread to stop */
103 stat = pthread_join(wd_tid, NULL);
105 while (!wd_queue->empty()) {
106 void *item = wd_queue->first();
107 wd_queue->remove(item);
108 p = (watchdog_t *)item;
109 if (p->destructor != NULL) {
117 while (!wd_inactive->empty()) {
118 void *item = wd_inactive->first();
119 wd_inactive->remove(item);
120 p = (watchdog_t *)item;
121 if (p->destructor != NULL) {
133 watchdog_t *new_watchdog(void)
135 watchdog_t *wd = (watchdog_t *)malloc(sizeof(watchdog_t));
147 wd->destructor = NULL;
153 bool register_watchdog(watchdog_t *wd)
156 Emsg0(M_ABORT, 0, "BUG! register_watchdog called before start_watchdog\n");
158 if (wd->callback == NULL) {
159 Emsg1(M_ABORT, 0, "BUG! Watchdog %p has NULL callback\n", wd);
161 if (wd->interval == 0) {
162 Emsg1(M_ABORT, 0, "BUG! Watchdog %p has zero interval\n", wd);
166 wd->next_fire = watchdog_time + wd->interval;
167 wd_queue->append(wd);
168 Dmsg3(400, "Registered watchdog %p, interval %d%s\n",
169 wd, wd->interval, wd->one_shot ? " one shot" : "");
175 bool unregister_watchdog_unlocked(watchdog_t *wd)
180 Emsg0(M_ABORT, 0, "BUG! unregister_watchdog_unlocked called before start_watchdog\n");
183 foreach_dlist(p, wd_queue) {
185 wd_queue->remove(wd);
186 Dmsg1(400, "Unregistered watchdog %p\n", wd);
191 foreach_dlist(p, wd_inactive) {
193 wd_inactive->remove(wd);
194 Dmsg1(400, "Unregistered inactive watchdog %p\n", wd);
199 Dmsg1(400, "Failed to unregister watchdog %p\n", wd);
203 bool unregister_watchdog(watchdog_t *wd)
208 Emsg0(M_ABORT, 0, "BUG! unregister_watchdog called before start_watchdog\n");
212 ret = unregister_watchdog_unlocked(wd);
218 extern "C" void *watchdog_thread(void *arg)
220 Dmsg0(400, "NicB-reworked watchdog thread entered\n");
226 * We lock the jcr chain here because a good number of the
227 * callback routines lock the jcr chain. We need to lock
228 * it here *before* the watchdog lock because the SD message
229 * thread first locks the jcr chain, then when closing the
230 * job locks the watchdog chain. If the two threads do not
231 * lock in the same order, we get a deadlock -- each holds
232 * the other's needed lock.
236 watchdog_time = time(NULL);
238 foreach_dlist(p, wd_queue) {
239 if (p->next_fire < watchdog_time) {
240 /* Run the callback */
243 /* Reschedule (or move to inactive list if it's a one-shot timer) */
246 * Note, when removing an item while walking the list
247 * we must get the previous pointer (q) and set the
248 * current pointer (p) to this previous pointer after
249 * removing the current pointer, otherwise, we won't
250 * walk the rest of the list.
252 q = (watchdog_t *)wd_queue->prev(p);
254 wd_inactive->append(p);
257 p->next_fire = watchdog_time + p->interval;
263 bmicrosleep(SLEEP_TIME, 0);
266 Dmsg0(400, "NicB-reworked watchdog thread exited\n");
271 * Watchdog lock, this can be called multiple times by the same
272 * thread without blocking, but must be unlocked the number of
273 * times it was locked.
275 static void wd_lock()
278 if ((errstat=rwl_writelock(&lock)) != 0) {
279 Emsg1(M_ABORT, 0, "rwl_writelock failure. ERR=%s\n",
285 * Unlock the watchdog. This can be called multiple times by the
286 * same thread up to the number of times that thread called
289 static void wd_unlock()
292 if ((errstat=rwl_writeunlock(&lock)) != 0) {
293 Emsg1(M_ABORT, 0, "rwl_writeunlock failure. ERR=%s\n",