2 Bacula® - The Network Backup Solution
4 Copyright (C) 2000-2007 Free Software Foundation Europe e.V.
6 The main author of Bacula is Kern Sibbald, with contributions from
7 many others, a complete list can be found in the file AUTHORS.
8 This program is Free Software; you can redistribute it and/or
9 modify it under the terms of version two of the GNU General Public
10 License as published by the Free Software Foundation and included
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Bacula® is a registered trademark of John Walker.
24 The licensor of Bacula is the Free Software Foundation Europe
25 (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
26 Switzerland, email:ftf@fsfeurope.org.
29 * Manipulation routines for Job Control Records and
30 * handling of last_jobs_list.
32 * Kern E. Sibbald, December 2000
36 * These routines are thread safe.
38 * The job list routines were re-written in May 2005 to
39 * eliminate the global lock while traversing the list, and
40 * to use the dlist subroutines. The locking is now done
41 * on the list each time the list is modified or traversed.
42 * That is, it is "micro-locked" rather than globally locked.
43 * The result is that there is one lock/unlock for each entry
44 * in the list while traversing it rather than a single lock
45 * at the beginning of a traversal and one at the end. This
46 * incurs slightly more overhead, but effectively eliminates
47 * the possibility of race conditions. In addition, with the
48 * exception of the global locking of the list during the
49 * re-reading of the config file, no recursion is needed.
56 /* External variables we reference */
57 extern time_t watchdog_time;
59 /* External referenced functions */
60 void free_bregexps(alist *bregexps);
62 /* Forward referenced functions */
63 extern "C" void timeout_handler(int sig);
64 static void jcr_timeout_check(watchdog_t *self);
65 #ifdef TRACE_JCR_CHAIN
66 static void b_lock_jcr_chain(const char *filen, int line);
67 static void b_unlock_jcr_chain(const char *filen, int line);
68 #define lock_jcr_chain() b_lock_jcr_chain(__FILE__, __LINE__);
69 #define unlock_jcr_chain() b_unlock_jcr_chain(__FILE__, __LINE__);
71 static void lock_jcr_chain();
72 static void unlock_jcr_chain();
77 dlist *last_jobs = NULL;
78 const int max_last_jobs = 10;
80 static dlist *jcrs = NULL; /* JCR chain */
81 static pthread_mutex_t jcr_lock = PTHREAD_MUTEX_INITIALIZER;
83 static pthread_mutex_t job_start_mutex = PTHREAD_MUTEX_INITIALIZER;
85 static pthread_mutex_t last_jobs_mutex = PTHREAD_MUTEX_INITIALIZER;
98 void init_last_jobs_list()
101 struct s_last_job *job_entry = NULL;
103 last_jobs = New(dlist(job_entry, &job_entry->link));
106 jcrs = New(dlist(jcr, &jcr->link));
110 void term_last_jobs_list()
113 while (!last_jobs->empty()) {
114 void *je = last_jobs->first();
115 last_jobs->remove(je);
127 bool read_last_jobs_list(int fd, uint64_t addr)
129 struct s_last_job *je, job;
132 Dmsg1(100, "read_last_jobs seek to %d\n", (int)addr);
133 if (addr == 0 || lseek(fd, (off_t)addr, SEEK_SET) < 0) {
136 if (read(fd, &num, sizeof(num)) != sizeof(num)) {
139 Dmsg1(100, "Read num_items=%d\n", num);
140 if (num > 4 * max_last_jobs) { /* sanity check */
143 for ( ; num; num--) {
144 if (read(fd, &job, sizeof(job)) != sizeof(job)) {
146 Pmsg1(000, "Read job entry. ERR=%s\n", be.bstrerror());
150 je = (struct s_last_job *)malloc(sizeof(struct s_last_job));
151 memcpy((char *)je, (char *)&job, sizeof(job));
153 init_last_jobs_list();
155 last_jobs->append(je);
156 if (last_jobs->size() > max_last_jobs) {
157 je = (struct s_last_job *)last_jobs->first();
158 last_jobs->remove(je);
166 uint64_t write_last_jobs_list(int fd, uint64_t addr)
168 struct s_last_job *je;
171 Dmsg1(100, "write_last_jobs seek to %d\n", (int)addr);
172 if (lseek(fd, (off_t)addr, SEEK_SET) < 0) {
176 /* First record is number of entries */
177 num = last_jobs->size();
178 if (write(fd, &num, sizeof(num)) != sizeof(num)) {
180 Pmsg1(000, "Error writing num_items: ERR=%s\n", be.bstrerror());
183 foreach_dlist(je, last_jobs) {
184 if (write(fd, je, sizeof(struct s_last_job)) != sizeof(struct s_last_job)) {
186 Pmsg1(000, "Error writing job: ERR=%s\n", be.bstrerror());
191 /* Return current address */
192 ssize_t stat = lseek(fd, 0, SEEK_CUR);
200 void lock_last_jobs_list()
205 void unlock_last_jobs_list()
211 * Push a subroutine address into the job end callback stack
213 void job_end_push(JCR *jcr, void job_end_cb(JCR *jcr,void *), void *ctx)
215 jcr->job_end_push.append((void *)job_end_cb);
216 jcr->job_end_push.append(ctx);
219 /* Pop each job_end subroutine and call it */
220 static void job_end_pop(JCR *jcr)
222 void (*job_end_cb)(JCR *jcr, void *ctx);
224 for (int i=jcr->job_end_push.size()-1; i > 0; ) {
225 ctx = jcr->job_end_push.get(i--);
226 job_end_cb = (void (*)(JCR *,void *))jcr->job_end_push.get(i--);
227 job_end_cb(jcr, ctx);
232 * Create a Job Control Record and link it into JCR chain
233 * Returns newly allocated JCR
234 * Note, since each daemon has a different JCR, he passes
237 JCR *new_jcr(int size, JCR_free_HANDLER *daemon_free_jcr)
240 MQUEUE_ITEM *item = NULL;
241 struct sigaction sigtimer;
243 Dmsg0(3400, "Enter new_jcr\n");
244 jcr = (JCR *)malloc(size);
245 memset(jcr, 0, size);
246 jcr->my_thread_id = pthread_self();
247 jcr->msg_queue = New(dlist(item, &item->link));
248 jcr->job_end_push.init(1, false);
249 jcr->sched_time = time(NULL);
250 jcr->daemon_free_jcr = daemon_free_jcr; /* plug daemon free routine */
252 jcr->inc_use_count();
253 jcr->VolumeName = get_pool_memory(PM_FNAME);
254 jcr->VolumeName[0] = 0;
255 jcr->errmsg = get_pool_memory(PM_MESSAGE);
257 /* Setup some dummy values */
258 bstrncpy(jcr->Job, "*System*", sizeof(jcr->Job));
260 jcr->JobType = JT_SYSTEM; /* internal job until defined */
261 jcr->JobLevel = L_NONE;
262 set_jcr_job_status(jcr, JS_Created); /* ready to run */
264 sigtimer.sa_flags = 0;
265 sigtimer.sa_handler = timeout_handler;
266 sigfillset(&sigtimer.sa_mask);
267 sigaction(TIMEOUT_SIGNAL, &sigtimer, NULL);
270 * Locking jobs is a global lock that is needed
271 * so that the Director can stop new jobs from being
272 * added to the jcr chain while it processes a new
273 * conf file and does the job_end_push().
278 jcrs = New(dlist(jcr, &jcr->link));
289 * Remove a JCR from the chain
290 * NOTE! The chain must be locked prior to calling
293 static void remove_jcr(JCR *jcr)
295 Dmsg0(3400, "Enter remove_jcr\n");
297 Emsg0(M_ABORT, 0, _("NULL jcr.\n"));
300 Dmsg0(3400, "Leave remove_jcr\n");
304 * Free stuff common to all JCRs. N.B. Be careful to include only
305 * generic stuff in the common part of the jcr.
307 static void free_common_jcr(JCR *jcr)
309 struct s_last_job *je, last_job;
311 /* Keep some statistics */
312 switch (jcr->JobType) {
320 last_job.Errors = jcr->Errors;
321 last_job.JobType = jcr->JobType;
322 last_job.JobId = jcr->JobId;
323 last_job.VolSessionId = jcr->VolSessionId;
324 last_job.VolSessionTime = jcr->VolSessionTime;
325 bstrncpy(last_job.Job, jcr->Job, sizeof(last_job.Job));
326 last_job.JobFiles = jcr->JobFiles;
327 last_job.JobBytes = jcr->JobBytes;
328 last_job.JobStatus = jcr->JobStatus;
329 last_job.JobLevel = jcr->JobLevel;
330 last_job.start_time = jcr->start_time;
331 last_job.end_time = time(NULL);
332 /* Keep list of last jobs, but not Console where JobId==0 */
333 if (last_job.JobId > 0) {
334 je = (struct s_last_job *)malloc(sizeof(struct s_last_job));
335 memcpy((char *)je, (char *)&last_job, sizeof(last_job));
337 init_last_jobs_list();
339 last_jobs->append(je);
340 if (last_jobs->size() > max_last_jobs) {
341 je = (struct s_last_job *)last_jobs->first();
342 last_jobs->remove(je);
350 jcr->destroy_mutex();
352 if (jcr->msg_queue) {
353 delete jcr->msg_queue;
354 jcr->msg_queue = NULL;
356 close_msg(jcr); /* close messages for this job */
358 /* do this after closing messages */
359 if (jcr->client_name) {
360 free_pool_memory(jcr->client_name);
361 jcr->client_name = NULL;
365 free_pool_memory(jcr->attr);
369 if (jcr->sd_auth_key) {
370 free(jcr->sd_auth_key);
371 jcr->sd_auth_key = NULL;
373 if (jcr->VolumeName) {
374 free_pool_memory(jcr->VolumeName);
375 jcr->VolumeName = NULL;
378 if (jcr->dir_bsock) {
379 bnet_close(jcr->dir_bsock);
380 jcr->dir_bsock = NULL;
383 free_pool_memory(jcr->errmsg);
390 if (jcr->RegexWhere) {
391 free(jcr->RegexWhere);
392 jcr->RegexWhere = NULL;
394 if (jcr->where_bregexp) {
395 free_bregexps(jcr->where_bregexp);
396 delete jcr->where_bregexp;
397 jcr->where_bregexp = NULL;
399 if (jcr->cached_path) {
400 free_pool_memory(jcr->cached_path);
401 jcr->cached_path = NULL;
405 free_guid_list(jcr->id_list);
412 * Global routine to free a jcr
415 void b_free_jcr(const char *file, int line, JCR *jcr)
417 Dmsg3(3400, "Enter free_jcr 0x%x from %s:%d\n", jcr, file, line);
421 void free_jcr(JCR *jcr)
424 Dmsg2(3400, "Enter free_jcr 0x%x job=%d\n", jcr, jcr->JobId);
428 dequeue_messages(jcr);
430 jcr->dec_use_count(); /* decrement use count */
431 if (jcr->use_count() < 0) {
432 Emsg2(M_ERROR, 0, _("JCR use_count=%d JobId=%d\n"),
433 jcr->use_count(), jcr->JobId);
435 Dmsg3(3400, "Dec free_jcr 0x%x use_count=%d jobid=%d\n", jcr, jcr->use_count(), jcr->JobId);
436 if (jcr->use_count() > 0) { /* if in use */
438 Dmsg3(3400, "free_jcr 0x%x job=%d use_count=%d\n", jcr, jcr->JobId, jcr->use_count());
442 remove_jcr(jcr); /* remove Jcr from chain */
445 job_end_pop(jcr); /* pop and call hooked routines */
447 Dmsg1(3400, "End job=%d\n", jcr->JobId);
448 if (jcr->daemon_free_jcr) {
449 jcr->daemon_free_jcr(jcr); /* call daemon free routine */
451 free_common_jcr(jcr);
452 close_msg(NULL); /* flush any daemon messages */
453 garbage_collect_memory_pool();
454 Dmsg0(3400, "Exit free_jcr\n");
458 * Find which JobId corresponds to the current thread
460 uint32_t get_jobid_from_tid()
462 return get_jobid_from_tid(pthread_self());
465 uint32_t get_jobid_from_tid(pthread_t tid)
470 if (pthread_equal(jcr->my_thread_id, tid)) {
471 JobId = (uint32_t)jcr->JobId;
480 * Find the jcr that corresponds to the current thread
482 JCR *get_jcr_from_tid()
484 return get_jcr_from_tid(pthread_self());
487 JCR *get_jcr_from_tid(pthread_t tid)
493 if (pthread_equal(jcr->my_thread_id, tid)) {
505 * Given a JobId, find the JCR
506 * Returns: jcr on success
509 JCR *get_jcr_by_id(uint32_t JobId)
514 if (jcr->JobId == JobId) {
515 jcr->inc_use_count();
516 Dmsg2(3400, "Inc get_jcr 0x%x use_count=%d\n", jcr, jcr->use_count());
525 * Given a SessionId and SessionTime, find the JCR
526 * Returns: jcr on success
529 JCR *get_jcr_by_session(uint32_t SessionId, uint32_t SessionTime)
534 if (jcr->VolSessionId == SessionId &&
535 jcr->VolSessionTime == SessionTime) {
536 jcr->inc_use_count();
537 Dmsg2(3400, "Inc get_jcr 0x%x use_count=%d\n", jcr, jcr->use_count());
547 * Given a Job, find the JCR
548 * compares on the number of characters in Job
549 * thus allowing partial matches.
550 * Returns: jcr on success
553 JCR *get_jcr_by_partial_name(char *Job)
563 if (strncmp(Job, jcr->Job, len) == 0) {
564 jcr->inc_use_count();
565 Dmsg2(3400, "Inc get_jcr 0x%x use_count=%d\n", jcr, jcr->use_count());
575 * Given a Job, find the JCR
576 * requires an exact match of names.
577 * Returns: jcr on success
580 JCR *get_jcr_by_full_name(char *Job)
588 if (strcmp(jcr->Job, Job) == 0) {
589 jcr->inc_use_count();
590 Dmsg2(3400, "Inc get_jcr 0x%x use_count=%d\n", jcr, jcr->use_count());
598 void set_jcr_job_status(JCR *jcr, int JobStatus)
601 * For a set of errors, ... keep the current status
602 * so it isn't lost. For all others, set it.
604 Dmsg2(100, "OnEntry JobStatus=%c set=%c\n", jcr->JobStatus, JobStatus);
605 switch (jcr->JobStatus) {
606 case JS_ErrorTerminated:
613 case JS_ErrorTerminated:
616 /* Override more minor status */
617 jcr->JobStatus = JobStatus;
623 jcr->JobStatus = JobStatus;
625 Dmsg2(100, "OnExit JobStatus=%c set=%c\n", jcr->JobStatus, JobStatus);
628 #ifdef TRACE_JCR_CHAIN
629 static int lock_count = 0;
635 #ifdef TRACE_JCR_CHAIN
636 static void b_lock_jcr_chain(const char *fname, int line)
638 static void lock_jcr_chain()
641 #ifdef TRACE_JCR_CHAIN
642 Dmsg3(3400, "Lock jcr chain %d from %s:%d\n", ++lock_count, fname, line);
650 #ifdef TRACE_JCR_CHAIN
651 static void b_unlock_jcr_chain(const char *fname, int line)
653 static void unlock_jcr_chain()
656 #ifdef TRACE_JCR_CHAIN
657 Dmsg3(3400, "Unlock jcr chain %d from %s:%d\n", lock_count--, fname, line);
664 * Start walk of jcr chain
665 * The proper way to walk the jcr chain is:
672 * It is possible to leave out the endeach_jcr(jcr), but
673 * in that case, the last jcr referenced must be explicitly
679 JCR *jcr_walk_start()
683 jcr = (JCR *)jcrs->first();
685 jcr->inc_use_count();
686 Dmsg3(3400, "Inc jcr_walk_start 0x%x job=%d use_count=%d\n", jcr, jcr->JobId, jcr->use_count());
693 * Get next jcr from chain, and release current one
695 JCR *jcr_walk_next(JCR *prev_jcr)
700 jcr = (JCR *)jcrs->next(prev_jcr);
702 jcr->inc_use_count();
703 Dmsg3(3400, "Inc jcr_walk_next 0x%x job=%d use_count=%d\n", jcr, jcr->JobId, jcr->use_count());
713 * Release last jcr referenced
715 void jcr_walk_end(JCR *jcr)
724 * Setup to call the timeout check routine every 30 seconds
725 * This routine will check any timers that have been enabled.
727 bool init_jcr_subsystem(void)
729 watchdog_t *wd = new_watchdog();
731 wd->one_shot = false;
732 wd->interval = 30; /* FIXME: should be configurable somewhere, even
733 if only with a #define */
734 wd->callback = jcr_timeout_check;
736 register_watchdog(wd);
741 static void jcr_timeout_check(watchdog_t *self)
747 Dmsg0(3400, "Start JCR timeout checks\n");
749 /* Walk through all JCRs checking if any one is
750 * blocked for more than specified max time.
753 Dmsg2(3400, "jcr_timeout_check JobId=%u jcr=0x%x\n", jcr->JobId, jcr);
754 if (jcr->JobId == 0) {
757 fd = jcr->store_bsock;
759 timer_start = fd->timer_start;
760 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
761 fd->timer_start = 0; /* turn off timer */
763 Jmsg(jcr, M_ERROR, 0, _(
764 "Watchdog sending kill after %d secs to thread stalled reading Storage daemon.\n"),
765 watchdog_time - timer_start);
766 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
769 fd = jcr->file_bsock;
771 timer_start = fd->timer_start;
772 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
773 fd->timer_start = 0; /* turn off timer */
775 Jmsg(jcr, M_ERROR, 0, _(
776 "Watchdog sending kill after %d secs to thread stalled reading File daemon.\n"),
777 watchdog_time - timer_start);
778 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
783 timer_start = fd->timer_start;
784 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
785 fd->timer_start = 0; /* turn off timer */
787 Jmsg(jcr, M_ERROR, 0, _(
788 "Watchdog sending kill after %d secs to thread stalled reading Director.\n"),
789 watchdog_time - timer_start);
790 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
796 Dmsg0(3400, "Finished JCR timeout checks\n");
800 * Timeout signal comes here
802 extern "C" void timeout_handler(int sig)
804 return; /* thus interrupting the function */