2 Bacula® - The Network Backup Solution
4 Copyright (C) 2000-2007 Free Software Foundation Europe e.V.
6 The main author of Bacula is Kern Sibbald, with contributions from
7 many others, a complete list can be found in the file AUTHORS.
8 This program is Free Software; you can redistribute it and/or
9 modify it under the terms of version two of the GNU General Public
10 License as published by the Free Software Foundation plus additions
11 that are listed in the file LICENSE.
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Bacula® is a registered trademark of John Walker.
24 The licensor of Bacula is the Free Software Foundation Europe
25 (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
26 Switzerland, email:ftf@fsfeurope.org.
29 * Manipulation routines for Job Control Records and
30 * handling of last_jobs_list.
32 * Kern E. Sibbald, December 2000
36 * These routines are thread safe.
38 * The job list routines were re-written in May 2005 to
39 * eliminate the global lock while traversing the list, and
40 * to use the dlist subroutines. The locking is now done
41 * on the list each time the list is modified or traversed.
42 * That is, it is "micro-locked" rather than globally locked.
43 * The result is that there is one lock/unlock for each entry
44 * in the list while traversing it rather than a single lock
45 * at the beginning of a traversal and one at the end. This
46 * incurs slightly more overhead, but effectively eliminates
47 * the possibility of race conditions. In addition, with the
48 * exception of the global locking of the list during the
49 * re-reading of the config file, no recursion is needed.
56 /* External variables we reference */
57 extern time_t watchdog_time;
59 /* External referenced functions */
60 void free_bregexps(alist *bregexps);
62 /* Forward referenced functions */
63 extern "C" void timeout_handler(int sig);
64 static void jcr_timeout_check(watchdog_t *self);
65 #ifdef TRACE_JCR_CHAIN
66 static void b_lock_jcr_chain(const char *filen, int line);
67 static void b_unlock_jcr_chain(const char *filen, int line);
68 #define lock_jcr_chain() b_lock_jcr_chain(__FILE__, __LINE__);
69 #define unlock_jcr_chain() b_unlock_jcr_chain(__FILE__, __LINE__);
71 static void lock_jcr_chain();
72 static void unlock_jcr_chain();
77 dlist *last_jobs = NULL;
78 const int max_last_jobs = 10;
80 static dlist *jcrs = NULL; /* JCR chain */
81 static pthread_mutex_t jcr_lock = PTHREAD_MUTEX_INITIALIZER;
83 static pthread_mutex_t job_start_mutex = PTHREAD_MUTEX_INITIALIZER;
85 static pthread_mutex_t last_jobs_mutex = PTHREAD_MUTEX_INITIALIZER;
98 void init_last_jobs_list()
101 struct s_last_job *job_entry = NULL;
103 last_jobs = New(dlist(job_entry, &job_entry->link));
106 jcrs = New(dlist(jcr, &jcr->link));
110 void term_last_jobs_list()
113 while (!last_jobs->empty()) {
114 void *je = last_jobs->first();
115 last_jobs->remove(je);
127 bool read_last_jobs_list(int fd, uint64_t addr)
129 struct s_last_job *je, job;
132 Dmsg1(100, "read_last_jobs seek to %d\n", (int)addr);
133 if (addr == 0 || lseek(fd, (off_t)addr, SEEK_SET) < 0) {
136 if (read(fd, &num, sizeof(num)) != sizeof(num)) {
139 Dmsg1(100, "Read num_items=%d\n", num);
140 if (num > 4 * max_last_jobs) { /* sanity check */
143 for ( ; num; num--) {
144 if (read(fd, &job, sizeof(job)) != sizeof(job)) {
145 Dmsg1(000, "Read job entry. ERR=%s\n", strerror(errno));
149 je = (struct s_last_job *)malloc(sizeof(struct s_last_job));
150 memcpy((char *)je, (char *)&job, sizeof(job));
152 init_last_jobs_list();
154 last_jobs->append(je);
155 if (last_jobs->size() > max_last_jobs) {
156 je = (struct s_last_job *)last_jobs->first();
157 last_jobs->remove(je);
165 uint64_t write_last_jobs_list(int fd, uint64_t addr)
167 struct s_last_job *je;
170 Dmsg1(100, "write_last_jobs seek to %d\n", (int)addr);
171 if (lseek(fd, (off_t)addr, SEEK_SET) < 0) {
175 /* First record is number of entries */
176 num = last_jobs->size();
177 if (write(fd, &num, sizeof(num)) != sizeof(num)) {
178 Dmsg1(000, "Error writing num_items: ERR=%s\n", strerror(errno));
181 foreach_dlist(je, last_jobs) {
182 if (write(fd, je, sizeof(struct s_last_job)) != sizeof(struct s_last_job)) {
183 Dmsg1(000, "Error writing job: ERR=%s\n", strerror(errno));
188 /* Return current address */
189 ssize_t stat = lseek(fd, 0, SEEK_CUR);
197 void lock_last_jobs_list()
202 void unlock_last_jobs_list()
208 * Push a subroutine address into the job end callback stack
210 void job_end_push(JCR *jcr, void job_end_cb(JCR *jcr,void *), void *ctx)
212 jcr->job_end_push.append((void *)job_end_cb);
213 jcr->job_end_push.append(ctx);
216 /* Pop each job_end subroutine and call it */
217 static void job_end_pop(JCR *jcr)
219 void (*job_end_cb)(JCR *jcr, void *ctx);
221 for (int i=jcr->job_end_push.size()-1; i > 0; ) {
222 ctx = jcr->job_end_push.get(i--);
223 job_end_cb = (void (*)(JCR *,void *))jcr->job_end_push.get(i--);
224 job_end_cb(jcr, ctx);
229 * Create a Job Control Record and link it into JCR chain
230 * Returns newly allocated JCR
231 * Note, since each daemon has a different JCR, he passes
234 JCR *new_jcr(int size, JCR_free_HANDLER *daemon_free_jcr)
237 MQUEUE_ITEM *item = NULL;
238 struct sigaction sigtimer;
240 Dmsg0(3400, "Enter new_jcr\n");
241 jcr = (JCR *)malloc(size);
242 memset(jcr, 0, size);
243 jcr->my_thread_id = pthread_self();
244 jcr->msg_queue = New(dlist(item, &item->link));
245 jcr->job_end_push.init(1, false);
246 jcr->sched_time = time(NULL);
247 jcr->daemon_free_jcr = daemon_free_jcr; /* plug daemon free routine */
249 jcr->inc_use_count();
250 jcr->VolumeName = get_pool_memory(PM_FNAME);
251 jcr->VolumeName[0] = 0;
252 jcr->errmsg = get_pool_memory(PM_MESSAGE);
254 /* Setup some dummy values */
255 bstrncpy(jcr->Job, "*System*", sizeof(jcr->Job));
257 jcr->JobType = JT_SYSTEM; /* internal job until defined */
258 jcr->JobLevel = L_NONE;
259 set_jcr_job_status(jcr, JS_Created); /* ready to run */
261 sigtimer.sa_flags = 0;
262 sigtimer.sa_handler = timeout_handler;
263 sigfillset(&sigtimer.sa_mask);
264 sigaction(TIMEOUT_SIGNAL, &sigtimer, NULL);
267 * Locking jobs is a global lock that is needed
268 * so that the Director can stop new jobs from being
269 * added to the jcr chain while it processes a new
270 * conf file and does the job_end_push().
275 jcrs = New(dlist(jcr, &jcr->link));
286 * Remove a JCR from the chain
287 * NOTE! The chain must be locked prior to calling
290 static void remove_jcr(JCR *jcr)
292 Dmsg0(3400, "Enter remove_jcr\n");
294 Emsg0(M_ABORT, 0, _("NULL jcr.\n"));
297 Dmsg0(3400, "Leave remove_jcr\n");
301 * Free stuff common to all JCRs. N.B. Be careful to include only
302 * generic stuff in the common part of the jcr.
304 static void free_common_jcr(JCR *jcr)
306 struct s_last_job *je, last_job;
308 /* Keep some statistics */
309 switch (jcr->JobType) {
317 last_job.Errors = jcr->Errors;
318 last_job.JobType = jcr->JobType;
319 last_job.JobId = jcr->JobId;
320 last_job.VolSessionId = jcr->VolSessionId;
321 last_job.VolSessionTime = jcr->VolSessionTime;
322 bstrncpy(last_job.Job, jcr->Job, sizeof(last_job.Job));
323 last_job.JobFiles = jcr->JobFiles;
324 last_job.JobBytes = jcr->JobBytes;
325 last_job.JobStatus = jcr->JobStatus;
326 last_job.JobLevel = jcr->JobLevel;
327 last_job.start_time = jcr->start_time;
328 last_job.end_time = time(NULL);
329 /* Keep list of last jobs, but not Console where JobId==0 */
330 if (last_job.JobId > 0) {
331 je = (struct s_last_job *)malloc(sizeof(struct s_last_job));
332 memcpy((char *)je, (char *)&last_job, sizeof(last_job));
334 init_last_jobs_list();
336 last_jobs->append(je);
337 if (last_jobs->size() > max_last_jobs) {
338 je = (struct s_last_job *)last_jobs->first();
339 last_jobs->remove(je);
347 jcr->destroy_mutex();
349 if (jcr->msg_queue) {
350 delete jcr->msg_queue;
351 jcr->msg_queue = NULL;
353 close_msg(jcr); /* close messages for this job */
355 /* do this after closing messages */
356 if (jcr->client_name) {
357 free_pool_memory(jcr->client_name);
358 jcr->client_name = NULL;
362 free_pool_memory(jcr->attr);
366 if (jcr->sd_auth_key) {
367 free(jcr->sd_auth_key);
368 jcr->sd_auth_key = NULL;
370 if (jcr->VolumeName) {
371 free_pool_memory(jcr->VolumeName);
372 jcr->VolumeName = NULL;
375 if (jcr->dir_bsock) {
376 bnet_close(jcr->dir_bsock);
377 jcr->dir_bsock = NULL;
380 free_pool_memory(jcr->errmsg);
387 if (jcr->RegexWhere) {
388 free(jcr->RegexWhere);
389 jcr->RegexWhere = NULL;
391 if (jcr->where_bregexp) {
392 free_bregexps(jcr->where_bregexp);
393 delete jcr->where_bregexp;
394 jcr->where_bregexp = NULL;
396 if (jcr->cached_path) {
397 free_pool_memory(jcr->cached_path);
398 jcr->cached_path = NULL;
401 free_getuser_cache();
402 free_getgroup_cache();
407 * Global routine to free a jcr
410 void b_free_jcr(const char *file, int line, JCR *jcr)
412 Dmsg3(3400, "Enter free_jcr 0x%x from %s:%d\n", jcr, file, line);
416 void free_jcr(JCR *jcr)
419 Dmsg2(3400, "Enter free_jcr 0x%x job=%d\n", jcr, jcr->JobId);
423 dequeue_messages(jcr);
425 jcr->dec_use_count(); /* decrement use count */
426 if (jcr->use_count() < 0) {
427 Emsg2(M_ERROR, 0, _("JCR use_count=%d JobId=%d\n"),
428 jcr->use_count(), jcr->JobId);
430 Dmsg3(3400, "Dec free_jcr 0x%x use_count=%d jobid=%d\n", jcr, jcr->use_count(), jcr->JobId);
431 if (jcr->use_count() > 0) { /* if in use */
433 Dmsg3(3400, "free_jcr 0x%x job=%d use_count=%d\n", jcr, jcr->JobId, jcr->use_count());
437 remove_jcr(jcr); /* remove Jcr from chain */
440 job_end_pop(jcr); /* pop and call hooked routines */
442 Dmsg1(3400, "End job=%d\n", jcr->JobId);
443 if (jcr->daemon_free_jcr) {
444 jcr->daemon_free_jcr(jcr); /* call daemon free routine */
446 free_common_jcr(jcr);
447 close_msg(NULL); /* flush any daemon messages */
448 garbage_collect_memory_pool();
449 Dmsg0(3400, "Exit free_jcr\n");
454 * Given a JobId, find the JCR
455 * Returns: jcr on success
458 JCR *get_jcr_by_id(uint32_t JobId)
463 if (jcr->JobId == JobId) {
464 jcr->inc_use_count();
465 Dmsg2(3400, "Inc get_jcr 0x%x use_count=%d\n", jcr, jcr->use_count());
474 * Given a SessionId and SessionTime, find the JCR
475 * Returns: jcr on success
478 JCR *get_jcr_by_session(uint32_t SessionId, uint32_t SessionTime)
483 if (jcr->VolSessionId == SessionId &&
484 jcr->VolSessionTime == SessionTime) {
485 jcr->inc_use_count();
486 Dmsg2(3400, "Inc get_jcr 0x%x use_count=%d\n", jcr, jcr->use_count());
496 * Given a Job, find the JCR
497 * compares on the number of characters in Job
498 * thus allowing partial matches.
499 * Returns: jcr on success
502 JCR *get_jcr_by_partial_name(char *Job)
512 if (strncmp(Job, jcr->Job, len) == 0) {
513 jcr->inc_use_count();
514 Dmsg2(3400, "Inc get_jcr 0x%x use_count=%d\n", jcr, jcr->use_count());
524 * Given a Job, find the JCR
525 * requires an exact match of names.
526 * Returns: jcr on success
529 JCR *get_jcr_by_full_name(char *Job)
537 if (strcmp(jcr->Job, Job) == 0) {
538 jcr->inc_use_count();
539 Dmsg2(3400, "Inc get_jcr 0x%x use_count=%d\n", jcr, jcr->use_count());
547 void set_jcr_job_status(JCR *jcr, int JobStatus)
550 * For a set of errors, ... keep the current status
551 * so it isn't lost. For all others, set it.
553 switch (jcr->JobStatus) {
554 case JS_ErrorTerminated:
561 jcr->JobStatus = JobStatus;
565 #ifdef TRACE_JCR_CHAIN
566 static int lock_count = 0;
572 #ifdef TRACE_JCR_CHAIN
573 static void b_lock_jcr_chain(const char *fname, int line)
575 static void lock_jcr_chain()
578 #ifdef TRACE_JCR_CHAIN
579 Dmsg3(3400, "Lock jcr chain %d from %s:%d\n", ++lock_count,
588 #ifdef TRACE_JCR_CHAIN
589 static void b_unlock_jcr_chain(const char *fname, int line)
591 static void unlock_jcr_chain()
594 #ifdef TRACE_JCR_CHAIN
595 Dmsg3(3400, "Unlock jcr chain %d from %s:%d\n", lock_count--,
603 * Start walk of jcr chain
604 * The proper way to walk the jcr chain is:
611 * It is possible to leave out the endeach_jcr(jcr), but
612 * in that case, the last jcr referenced must be explicitly
618 JCR *jcr_walk_start()
622 jcr = (JCR *)jcrs->first();
624 jcr->inc_use_count();
625 Dmsg3(3400, "Inc jcr_walk_start 0x%x job=%d use_count=%d\n", jcr,
626 jcr->JobId, jcr->use_count());
633 * Get next jcr from chain, and release current one
635 JCR *jcr_walk_next(JCR *prev_jcr)
640 jcr = (JCR *)jcrs->next(prev_jcr);
642 jcr->inc_use_count();
643 Dmsg3(3400, "Inc jcr_walk_next 0x%x job=%d use_count=%d\n", jcr,
644 jcr->JobId, jcr->use_count());
654 * Release last jcr referenced
656 void jcr_walk_end(JCR *jcr)
665 * Set up to call the timeout check routine every 30 seconds
666 * This routine will check any timers that have been enabled.
668 bool init_jcr_subsystem(void)
670 watchdog_t *wd = new_watchdog();
672 wd->one_shot = false;
673 wd->interval = 30; /* FIXME: should be configurable somewhere, even
674 if only with a #define */
675 wd->callback = jcr_timeout_check;
677 register_watchdog(wd);
682 static void jcr_timeout_check(watchdog_t *self)
688 Dmsg0(3400, "Start JCR timeout checks\n");
690 /* Walk through all JCRs checking if any one is
691 * blocked for more than specified max time.
694 Dmsg2(3400, "jcr_timeout_check JobId=%u jcr=0x%x\n", jcr->JobId, jcr);
695 if (jcr->JobId == 0) {
698 fd = jcr->store_bsock;
700 timer_start = fd->timer_start;
701 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
702 fd->timer_start = 0; /* turn off timer */
703 fd->timed_out = true;
704 Jmsg(jcr, M_ERROR, 0, _(
705 "Watchdog sending kill after %d secs to thread stalled reading Storage daemon.\n"),
706 watchdog_time - timer_start);
707 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
710 fd = jcr->file_bsock;
712 timer_start = fd->timer_start;
713 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
714 fd->timer_start = 0; /* turn off timer */
715 fd->timed_out = true;
716 Jmsg(jcr, M_ERROR, 0, _(
717 "Watchdog sending kill after %d secs to thread stalled reading File daemon.\n"),
718 watchdog_time - timer_start);
719 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
724 timer_start = fd->timer_start;
725 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
726 fd->timer_start = 0; /* turn off timer */
727 fd->timed_out = true;
728 Jmsg(jcr, M_ERROR, 0, _(
729 "Watchdog sending kill after %d secs to thread stalled reading Director.\n"),
730 watchdog_time - timer_start);
731 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
737 Dmsg0(3400, "Finished JCR timeout checks\n");
741 * Timeout signal comes here
743 extern "C" void timeout_handler(int sig)
745 return; /* thus interrupting the function */