2 Bacula® - The Network Backup Solution
4 Copyright (C) 2000-2008 Free Software Foundation Europe e.V.
6 The main author of Bacula is Kern Sibbald, with contributions from
7 many others, a complete list can be found in the file AUTHORS.
8 This program is Free Software; you can redistribute it and/or
9 modify it under the terms of version two of the GNU General Public
10 License as published by the Free Software Foundation and included
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Bacula® is a registered trademark of Kern Sibbald.
24 The licensor of Bacula is the Free Software Foundation Europe
25 (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
26 Switzerland, email:ftf@fsfeurope.org.
29 * Manipulation routines for Job Control Records and
30 * handling of last_jobs_list.
32 * Kern E. Sibbald, December 2000
36 * These routines are thread safe.
38 * The job list routines were re-written in May 2005 to
39 * eliminate the global lock while traversing the list, and
40 * to use the dlist subroutines. The locking is now done
41 * on the list each time the list is modified or traversed.
42 * That is, it is "micro-locked" rather than globally locked.
43 * The result is that there is one lock/unlock for each entry
44 * in the list while traversing it rather than a single lock
45 * at the beginning of a traversal and one at the end. This
46 * incurs slightly more overhead, but effectively eliminates
47 * the possibility of race conditions. In addition, with the
48 * exception of the global locking of the list during the
49 * re-reading of the config file, no recursion is needed.
/* Debug level passed to most of the Dmsg trace calls in this file. */
56 const int dbglvl = 3400;
59 * Setting a NULL in tsd doesn't clear the tsd but instead tells
60 * pthreads not to call the tsd destructor. Consequently, we
61 * define this *invalid* jcr address and stuff it in the tsd
62 * when the jcr is no longer valid.
/* Sentinel stored in the per-thread slot; get_jcr_from_tsd() compares against it. */
64 #define INVALID_JCR ((JCR *)(-1))
66 /* External variables we reference */
67 extern time_t watchdog_time;
69 /* External referenced functions */
70 void free_bregexps(alist *bregexps);
72 /* Forward referenced functions */
73 extern "C" void timeout_handler(int sig);
74 static void jcr_timeout_check(watchdog_t *self);
75 #ifdef TRACE_JCR_CHAIN
76 static void b_lock_jcr_chain(const char *filen, int line);
77 static void b_unlock_jcr_chain(const char *filen, int line);
/*
 * NOTE(review): both macros expand with a trailing semicolon, so a call
 * written as lock_jcr_chain(); produces an extra empty statement. That
 * breaks an unbraced if/else around the call -- consider removing it.
 */
78 #define lock_jcr_chain() b_lock_jcr_chain(__FILE__, __LINE__);
79 #define unlock_jcr_chain() b_unlock_jcr_chain(__FILE__, __LINE__);
/* Non-tracing variants (presumably the #else branch; the directive is not shown). */
81 static void lock_jcr_chain();
82 static void unlock_jcr_chain();
/* Finished-job history: a dlist of struct s_last_job, appended to by free_jcr(). */
87 dlist *last_jobs = NULL;
/* Once the history holds more than this many entries, its first entry is removed. */
88 const int max_last_jobs = 10;
90 static dlist *jcrs = NULL; /* JCR chain */
91 static pthread_mutex_t jcr_lock = PTHREAD_MUTEX_INITIALIZER;
93 static pthread_mutex_t job_start_mutex = PTHREAD_MUTEX_INITIALIZER;
95 static pthread_mutex_t last_jobs_mutex = PTHREAD_MUTEX_INITIALIZER;
97 static pthread_key_t jcr_key; /* Pointer to jcr for each thread */
/* Makes create_jcr_key() run only once (see the pthread_once call in new_jcr). */
99 pthread_once_t key_once = PTHREAD_ONCE_INIT;
/*
 * Create the dlist objects behind last_jobs and the jcr chain (jcrs).
 * job_entry is NULL and is only used to hand the address of its link
 * member to the dlist constructor (presumably so dlist can derive the
 * link offset -- TODO confirm in dlist). Any guards around the two
 * allocations are not visible in this listing.
 */
112 void init_last_jobs_list()
115 struct s_last_job *job_entry = NULL;
117 last_jobs = New(dlist(job_entry, &job_entry->link));
120 jcrs = New(dlist(jcr, &jcr->link));
/*
 * Drain last_jobs: under the last-jobs lock, repeatedly take the first
 * entry and unlink it until the list is empty. What is done with each
 * unlinked entry and with the list object afterwards is not visible in
 * this listing.
 */
124 void term_last_jobs_list()
127 lock_last_jobs_list();
128 while (!last_jobs->empty()) {
129 void *je = last_jobs->first();
130 last_jobs->remove(je);
135 unlock_last_jobs_list();
/*
 * Load saved job-history records from a file into last_jobs.
 *
 *  fd    open file descriptor to read from
 *  addr  file offset of the saved list; 0 is handled like a failed seek
 *
 * Layout as read here: an item count (num) followed by that many raw
 * struct s_last_job images. A count above 4 * max_last_jobs is rejected
 * by the sanity check. The bool result comes from return statements
 * that are not visible in this listing.
 */
143 bool read_last_jobs_list(int fd, uint64_t addr)
145 struct s_last_job *je, job;
/* NOTE(review): the (int) cast truncates a 64-bit addr in this trace message */
149 Dmsg1(100, "read_last_jobs seek to %d\n", (int)addr);
150 if (addr == 0 || lseek(fd, (boffset_t)addr, SEEK_SET) < 0) {
153 if (read(fd, &num, sizeof(num)) != sizeof(num)) {
156 Dmsg1(100, "Read num_items=%d\n", num);
157 if (num > 4 * max_last_jobs) { /* sanity check */
160 lock_last_jobs_list();
161 for ( ; num; num--) {
162 if (read(fd, &job, sizeof(job)) != sizeof(job)) {
164 Pmsg1(000, "Read job entry. ERR=%s\n", be.bstrerror());
/* NOTE(review): the malloc result is handed to memcpy without a NULL check */
169 je = (struct s_last_job *)malloc(sizeof(struct s_last_job));
170 memcpy((char *)je, (char *)&job, sizeof(job));
172 init_last_jobs_list();
/* Append, then unlink the first entry once the cap is exceeded */
174 last_jobs->append(je);
175 if (last_jobs->size() > max_last_jobs) {
176 je = (struct s_last_job *)last_jobs->first();
177 last_jobs->remove(je);
182 unlock_last_jobs_list();
/*
 * Save last_jobs to a file: seek to addr, write the entry count, then
 * write every entry as a raw struct s_last_job image.
 *
 *  fd    open file descriptor to write to
 *  addr  file offset at which the list is written
 *
 * After writing, the current file position is fetched with
 * lseek(fd, 0, SEEK_CUR); the return statements themselves are not
 * visible in this listing.
 */
186 uint64_t write_last_jobs_list(int fd, uint64_t addr)
188 struct s_last_job *je;
/* NOTE(review): the (int) cast truncates a 64-bit addr in this trace message */
192 Dmsg1(100, "write_last_jobs seek to %d\n", (int)addr);
193 if (lseek(fd, (boffset_t)addr, SEEK_SET) < 0) {
197 lock_last_jobs_list();
198 /* First record is number of entries */
199 num = last_jobs->size();
200 if (write(fd, &num, sizeof(num)) != sizeof(num)) {
202 Pmsg1(000, "Error writing num_items: ERR=%s\n", be.bstrerror());
205 foreach_dlist(je, last_jobs) {
206 if (write(fd, je, sizeof(struct s_last_job)) != sizeof(struct s_last_job)) {
208 Pmsg1(000, "Error writing job: ERR=%s\n", be.bstrerror());
212 unlock_last_jobs_list();
214 /* Return current address */
215 stat = lseek(fd, 0, SEEK_CUR);
/* Second unlock: presumably the error exit taken after a failed write -- TODO confirm */
222 unlock_last_jobs_list();
/*
 * Take the lock that callers in this file hold around their accesses to
 * last_jobs. Body not shown; presumably it locks last_jobs_mutex.
 */
226 void lock_last_jobs_list()
/*
 * Release the lock taken by lock_last_jobs_list(). Body not shown;
 * presumably it unlocks last_jobs_mutex.
 */
231 void unlock_last_jobs_list()
236 /* Set Job type in JCR and also set appropriate read flag */
/*
 *  JobType  job type code to record in this JCR
 *
 * The assignment and the read-flag handling are not visible in this listing.
 */
237 void JCR::set_JobType(int32_t JobType)
242 /* Set Job level in JCR and also set appropriate read flag */
/*
 *  JobLevel  job level code to record in this JCR
 *
 * Stores the value in m_JobLevel; the L_VIRTUAL_FULL level gets dedicated
 * handling whose details are elided in this listing.
 */
243 void JCR::set_JobLevel(int32_t JobLevel)
245 m_JobLevel = JobLevel;
257 if (m_JobLevel == L_VIRTUAL_FULL) {
268 * Push a subroutine address into the job end callback stack
/*
 * Register a job-end callback. The function pointer and its context are
 * appended as two consecutive entries of jcr->job_end_push, which is the
 * layout job_end_pop() relies on when it unwinds the stack.
 *
 *  jcr         job the callback is attached to
 *  job_end_cb  function later invoked as job_end_cb(jcr, ctx)
 *  ctx         opaque pointer handed back to the callback
 */
270 void job_end_push(JCR *jcr, void job_end_cb(JCR *jcr,void *), void *ctx)
272 jcr->job_end_push.append((void *)job_end_cb);
273 jcr->job_end_push.append(ctx);
276 /* Pop each job_end subroutine and call it */
/*
 * Invoke the callbacks registered with job_end_push(), last pushed first.
 * Entries are stored as (callback, ctx) pairs, so each iteration reads
 * ctx at index i and the function pointer at index i-1, consuming two
 * slots; the loop stops once fewer than two slots remain.
 */
277 static void job_end_pop(JCR *jcr)
279 void (*job_end_cb)(JCR *jcr, void *ctx);
281 for (int i=jcr->job_end_push.size()-1; i > 0; ) {
282 ctx = jcr->job_end_push.get(i--);
283 job_end_cb = (void (*)(JCR *,void *))jcr->job_end_push.get(i--);
284 job_end_cb(jcr, ctx);
/*
 * Create the thread-specific-data key (jcr_key) that holds each thread's
 * JCR pointer. No destructor is registered (second argument is NULL).
 * Run through pthread_once from new_jcr(). A failure is reported via
 * Jmsg1 with M_ABORT (the condition line is elided in this listing).
 */
288 void create_jcr_key()
290 int status = pthread_key_create(&jcr_key, NULL);
293 Jmsg1(NULL, M_ABORT, 0, _("pthread key create failed: ERR=%s\n"),
294 be.bstrerror(status));
299 * Create a Job Control Record and link it into JCR chain
300 * Returns newly allocated JCR
301 * Note, since each daemon has a different JCR, he passes
/*
 *  size             number of bytes to allocate for the JCR
 *  daemon_free_jcr  daemon cleanup hook stored in the JCR; free_jcr()
 *                   calls it before the common cleanup
 *
 * The new JCR is zero-filled, tagged with the creating thread id, given
 * one use-count reference, and set up as a *System* job of type JT_SYSTEM,
 * level L_NONE and status JS_Created. Linking into the chain and the
 * return statement are elided in this listing.
 *
 * NOTE(review): the malloc result is passed straight to memset with no
 * NULL check, and the sigaction return value is not checked.
 */
304 JCR *new_jcr(int size, JCR_free_HANDLER *daemon_free_jcr)
307 MQUEUE_ITEM *item = NULL;
308 struct sigaction sigtimer;
311 Dmsg0(dbglvl, "Enter new_jcr\n");
/* Create the per-thread key on first use only */
312 status = pthread_once(&key_once, create_jcr_key);
315 Jmsg1(NULL, M_ABORT, 0, _("pthread_once failed. ERR=%s\n"), be.bstrerror(status));
317 jcr = (JCR *)malloc(size);
318 memset(jcr, 0, size);
319 jcr->my_thread_id = pthread_self();
320 jcr->msg_queue = New(dlist(item, &item->link));
321 jcr->job_end_push.init(1, false);
322 jcr->sched_time = time(NULL);
323 jcr->daemon_free_jcr = daemon_free_jcr; /* plug daemon free routine */
/* The creator holds the first reference */
325 jcr->inc_use_count();
326 jcr->VolumeName = get_pool_memory(PM_FNAME);
327 jcr->VolumeName[0] = 0;
328 jcr->errmsg = get_pool_memory(PM_MESSAGE);
330 /* Setup some dummy values */
331 bstrncpy(jcr->Job, "*System*", sizeof(jcr->Job));
333 jcr->set_JobType(JT_SYSTEM); /* internal job until defined */
334 jcr->set_JobLevel(L_NONE);
335 set_jcr_job_status(jcr, JS_Created); /* ready to run */
/*
 * Install timeout_handler for TIMEOUT_SIGNAL. sa_flags is 0 (no
 * SA_RESTART) and the full signal mask is blocked while the handler runs.
 */
337 sigtimer.sa_flags = 0;
338 sigtimer.sa_handler = timeout_handler;
339 sigfillset(&sigtimer.sa_mask);
340 sigaction(TIMEOUT_SIGNAL, &sigtimer, NULL);
343 * Locking jobs is a global lock that is needed
344 * so that the Director can stop new jobs from being
345 * added to the jcr chain while it processes a new
346 * conf file and does the job_end_push().
351 jcrs = New(dlist(jcr, &jcr->link));
362 * Remove a JCR from the chain
363 * NOTE! The chain must be locked prior to calling
/*
 *  jcr  entry to take off the chain
 *
 * Emits an M_ABORT message reading NULL jcr (the guarding condition is
 * elided, presumably a NULL test on jcr). The actual unlink from jcrs is
 * not visible in this listing.
 */
366 static void remove_jcr(JCR *jcr)
368 Dmsg0(dbglvl, "Enter remove_jcr\n");
370 Emsg0(M_ABORT, 0, _("NULL jcr.\n"));
373 Dmsg0(dbglvl, "Leave remove_jcr\n");
377 * Free stuff common to all JCRs. N.B. Be careful to include only
378 * generic stuff in the common part of the jcr.
/*
 * Release the resources held in the daemon-independent part of a JCR:
 * its mutex, message queue, message channel, pool-memory strings
 * (client_name, attr, VolumeName, errmsg, cached_path), malloc-ed strings
 * (sd_auth_key, RegexWhere), the Director socket, the where_bregexp list
 * and the id_list. Pointers that are tested first are reset to NULL after
 * release. Finally the calling thread's TSD slot is set to INVALID_JCR.
 * Whether the JCR memory itself is freed here is not visible in this
 * listing.
 */
380 static void free_common_jcr(JCR *jcr)
382 jcr->destroy_mutex();
384 if (jcr->msg_queue) {
385 delete jcr->msg_queue;
386 jcr->msg_queue = NULL;
388 close_msg(jcr); /* close messages for this job */
390 /* do this after closing messages */
391 if (jcr->client_name) {
392 free_pool_memory(jcr->client_name);
393 jcr->client_name = NULL;
397 free_pool_memory(jcr->attr);
/* sd_auth_key is released with free(), not with the pool allocator */
401 if (jcr->sd_auth_key) {
402 free(jcr->sd_auth_key);
403 jcr->sd_auth_key = NULL;
405 if (jcr->VolumeName) {
406 free_pool_memory(jcr->VolumeName);
407 jcr->VolumeName = NULL;
410 if (jcr->dir_bsock) {
411 bnet_close(jcr->dir_bsock);
412 jcr->dir_bsock = NULL;
415 free_pool_memory(jcr->errmsg);
422 if (jcr->RegexWhere) {
423 free(jcr->RegexWhere);
424 jcr->RegexWhere = NULL;
/* free_bregexps() is called on the list first, then the list object is deleted */
426 if (jcr->where_bregexp) {
427 free_bregexps(jcr->where_bregexp);
428 delete jcr->where_bregexp;
429 jcr->where_bregexp = NULL;
431 if (jcr->cached_path) {
432 free_pool_memory(jcr->cached_path);
433 jcr->cached_path = NULL;
437 free_guid_list(jcr->id_list);
440 /* Invalidate the tsd jcr data */
/* pthread_setspecific only changes the slot of the thread running this cleanup */
441 set_jcr_in_tsd(INVALID_JCR);
446 * Global routine to free a jcr
/*
 * Drop one reference to a JCR and tear it down when the last one goes.
 * Two alternative signatures share this body (presumably selected by a
 * preprocessor conditional that is elided here): b_free_jcr() also takes
 * the call site for the trace message.
 *
 * Visible sequence: dequeue pending messages, decrement the use count,
 * report a negative count as M_ERROR, and stop early while the count is
 * still positive (the return is elided). Otherwise remove the JCR from
 * the chain, run the job-end callbacks, record a struct s_last_job
 * summary in last_jobs for JobId > 0, call the daemon hook if set, run
 * the common cleanup, flush daemon messages and collect pool memory.
 *
 * NOTE(review): the malloc result for the history entry is passed to
 * memset without a NULL check.
 * NOTE(review): JobId is printed with %d in the Jmsg2 and End job
 * messages but with %u elsewhere -- confirm the intended specifier.
 */
449 void b_free_jcr(const char *file, int line, JCR *jcr)
451 struct s_last_job *je;
453 Dmsg3(dbglvl, "Enter free_jcr jid=%u from %s:%d\n", jcr->JobId, file, line);
457 void free_jcr(JCR *jcr)
459 struct s_last_job *je;
461 Dmsg3(dbglvl, "Enter free_jcr jid=%u use_count=%d Job=%s\n",
462 jcr->JobId, jcr->use_count(), jcr->Job);
466 dequeue_messages(jcr);
468 jcr->dec_use_count(); /* decrement use count */
469 if (jcr->use_count() < 0) {
470 Jmsg2(jcr, M_ERROR, 0, _("JCR use_count=%d JobId=%d\n"),
471 jcr->use_count(), jcr->JobId);
473 if (jcr->JobId > 0) {
474 Dmsg3(dbglvl, "Dec free_jcr jid=%u use_count=%d Job=%s\n",
475 jcr->JobId, jcr->use_count(), jcr->Job);
477 if (jcr->use_count() > 0) { /* if in use */
481 if (jcr->JobId > 0) {
482 Dmsg3(dbglvl, "remove jcr jid=%u use_count=%d Job=%s\n",
483 jcr->JobId, jcr->use_count(), jcr->Job);
485 remove_jcr(jcr); /* remove Jcr from chain */
487 job_end_pop(jcr); /* pop and call hooked routines */
489 Dmsg1(dbglvl, "End job=%d\n", jcr->JobId);
491 /* Keep some statistics */
492 switch (jcr->get_JobType()) {
499 /* Keep list of last jobs, but not Console where JobId==0 */
500 if (jcr->JobId > 0) {
501 lock_last_jobs_list();
503 je = (struct s_last_job *)malloc(sizeof(struct s_last_job));
504 memset(je, 0, sizeof(struct s_last_job)); /* zero in case unset fields */
/* Snapshot the job summary fields into the history entry */
505 je->Errors = jcr->Errors;
506 je->JobType = jcr->get_JobType();
507 je->JobId = jcr->JobId;
508 je->VolSessionId = jcr->VolSessionId;
509 je->VolSessionTime = jcr->VolSessionTime;
510 bstrncpy(je->Job, jcr->Job, sizeof(je->Job));
511 je->JobFiles = jcr->JobFiles;
512 je->JobBytes = jcr->JobBytes;
513 je->JobStatus = jcr->JobStatus;
514 je->JobLevel = jcr->get_JobLevel();
515 je->start_time = jcr->start_time;
516 je->end_time = time(NULL);
519 init_last_jobs_list();
/* Append, then unlink the first entry once the cap is exceeded */
521 last_jobs->append(je);
522 if (last_jobs->size() > max_last_jobs) {
523 je = (struct s_last_job *)last_jobs->first();
524 last_jobs->remove(je);
527 unlock_last_jobs_list();
534 if (jcr->daemon_free_jcr) {
535 jcr->daemon_free_jcr(jcr); /* call daemon free routine */
539 free_common_jcr(jcr);
540 close_msg(NULL); /* flush any daemon messages */
541 garbage_collect_memory_pool();
542 Dmsg0(dbglvl, "Exit free_jcr\n");
/*
 * Store jcr in the calling thread's jcr_key slot.
 *
 *  jcr  pointer to record; free_common_jcr() passes INVALID_JCR
 *
 * A pthread_setspecific failure is reported via Jmsg1 with M_ABORT (the
 * condition line is elided in this listing).
 * NOTE(review): on that failure path the same jcr value is forwarded to
 * Jmsg1, which would receive the INVALID_JCR sentinel when called from
 * free_common_jcr() -- confirm Jmsg1 tolerates that.
 */
545 void set_jcr_in_tsd(JCR *jcr)
547 int status = pthread_setspecific(jcr_key, (void *)jcr);
550 Jmsg1(jcr, M_ABORT, 0, _("pthread_setspecific failed: ERR=%s\n"), be.bstrerror(status));
/*
 * Fetch the JCR pointer stored in the calling thread's jcr_key slot.
 * The INVALID_JCR sentinel is detected here and, per the comment below,
 * turned into NULL (the assignment and the return are elided in this
 * listing).
 */
554 JCR *get_jcr_from_tsd()
556 JCR *jcr = (JCR *)pthread_getspecific(jcr_key);
557 // printf("get_jcr_from_tsd: jcr=%p\n", jcr);
558 /* set any INVALID_JCR to NULL which the rest of Bacula understands */
559 if (jcr == INVALID_JCR) {
567 * Find which JobId corresponds to the current thread
/*
 * Look up the calling thread's JCR through get_jcr_from_tsd() and read
 * its JobId. The NULL check and the value returned when no JCR is set
 * are not visible in this listing.
 */
569 uint32_t get_jobid_from_tsd()
573 jcr = get_jcr_from_tsd();
574 // printf("get_jobid_from_tsr: jcr=%p\n", jcr);
576 JobId = (uint32_t)jcr->JobId;
582 * Given a JobId, find the JCR
583 * Returns: jcr on success
/*
 *  JobId  job id to look for
 *
 * The use count of the matching JCR is incremented before it is handed
 * back, so the caller holds a reference (presumably released with
 * free_jcr(), which decrements it). The chain traversal and the return
 * are elided in this listing.
 */
586 JCR *get_jcr_by_id(uint32_t JobId)
591 if (jcr->JobId == JobId) {
592 jcr->inc_use_count();
593 Dmsg3(dbglvl, "Inc get_jcr jid=%u use_count=%d Job=%s\n",
594 jcr->JobId, jcr->use_count(), jcr->Job);
603 * Given a SessionId and SessionTime, find the JCR
604 * Returns: jcr on success
/*
 *  SessionId    compared against jcr->VolSessionId
 *  SessionTime  compared against jcr->VolSessionTime
 *
 * Both values must match. The use count of the matching JCR is
 * incremented before it is handed back, so the caller holds a reference.
 * The chain traversal and the return are elided in this listing.
 */
607 JCR *get_jcr_by_session(uint32_t SessionId, uint32_t SessionTime)
612 if (jcr->VolSessionId == SessionId &&
613 jcr->VolSessionTime == SessionTime) {
614 jcr->inc_use_count();
615 Dmsg3(dbglvl, "Inc get_jcr jid=%u use_count=%d Job=%s\n",
616 jcr->JobId, jcr->use_count(), jcr->Job);
626 * Given a Job, find the JCR
627 * compares on the number of characters in Job
628 * thus allowing partial matches.
629 * Returns: jcr on success
/*
 *  Job  job name prefix to look for
 *
 * Only the first len characters are compared with strncmp; how len is
 * computed is elided here (presumably the length of Job -- TODO confirm).
 * The use count of the matching JCR is incremented before it is handed
 * back, so the caller holds a reference.
 */
632 JCR *get_jcr_by_partial_name(char *Job)
642 if (strncmp(Job, jcr->Job, len) == 0) {
643 jcr->inc_use_count();
644 Dmsg3(dbglvl, "Inc get_jcr jid=%u use_count=%d Job=%s\n",
645 jcr->JobId, jcr->use_count(), jcr->Job);
655 * Given a Job, find the JCR
656 * requires an exact match of names.
657 * Returns: jcr on success
/*
 *  Job  complete job name to look for (compared with strcmp)
 *
 * The use count of the matching JCR is incremented before it is handed
 * back, so the caller holds a reference. The chain traversal and the
 * return are elided in this listing.
 */
660 JCR *get_jcr_by_full_name(char *Job)
668 if (strcmp(jcr->Job, Job) == 0) {
669 jcr->inc_use_count();
670 Dmsg3(dbglvl, "Inc get_jcr jid=%u use_count=%d Job=%s\n",
671 jcr->JobId, jcr->use_count(), jcr->Job);
/*
 * Update jcr->JobStatus and, for a newly entered wait state, stamp
 * jcr->wait_time with the current time.
 *
 *  jcr        job whose status changes
 *  JobStatus  requested status code (logged with %c, so a character code)
 *
 * Visible structure: a first group of JS_Wait* cases on the requested
 * status, then a switch on the current status that keeps certain error
 * states, then a second group of JS_Wait* cases that clears set_waittime
 * to keep the old time. Most case labels and the conditions selecting
 * each path are elided in this listing.
 */
679 void set_jcr_job_status(JCR *jcr, int JobStatus)
681 bool set_waittime=false;
682 Dmsg2(800, "set_jcr_job_status(%s, %c)\n", jcr->Job, JobStatus);
683 /* if wait state is new, we keep current time for watchdog MaxWaitTime */
689 case JS_WaitStoreRes:
691 case JS_WaitClientRes:
693 case JS_WaitPriority:
700 * For a set of errors, ... keep the current status
701 * so it isn't lost. For all others, set it.
703 Dmsg3(300, "jid=%u OnEntry JobStatus=%c set=%c\n", (uint32_t)jcr->JobId,
704 jcr->JobStatus, JobStatus);
705 switch (jcr->JobStatus) {
706 case JS_ErrorTerminated:
/* Second JS_ErrorTerminated label: presumably belongs to a nested switch on the new status */
713 case JS_ErrorTerminated:
716 /* Override more minor status */
717 jcr->JobStatus = JobStatus;
723 * For a set of Wait situation, keep old time.
729 case JS_WaitStoreRes:
731 case JS_WaitClientRes:
733 case JS_WaitPriority:
734 set_waittime = false; /* keep old time */
736 jcr->JobStatus = JobStatus;
738 /* set it before JobStatus */
739 Dmsg0(800, "Setting wait_time\n");
740 jcr->wait_time = time(NULL);
743 Dmsg3(200, "jid=%u leave set_jcr_job_status=%c set=%c\n", (uint32_t)jcr->JobId,
744 jcr->JobStatus, JobStatus);
/*
 * Lock the jcr chain. With TRACE_JCR_CHAIN defined the function is
 * b_lock_jcr_chain(fname, line), reached through the lock_jcr_chain()
 * macro, and it logs the call site together with the pre-incremented
 * lock_count. Otherwise it is the plain lock_jcr_chain(). The mutex
 * operation itself is elided in this listing.
 */
747 #ifdef TRACE_JCR_CHAIN
748 static int lock_count = 0;
754 #ifdef TRACE_JCR_CHAIN
755 static void b_lock_jcr_chain(const char *fname, int line)
757 static void lock_jcr_chain()
760 #ifdef TRACE_JCR_CHAIN
761 Dmsg3(dbglvl, "Lock jcr chain %d from %s:%d\n", ++lock_count, fname, line);
/*
 * Unlock the jcr chain; tracing and plain variants mirror the lock side.
 * The trace message prints lock_count before the post-decrement, so the
 * lock and unlock messages of one acquisition show the same number. The
 * mutex operation itself is elided in this listing.
 */
769 #ifdef TRACE_JCR_CHAIN
770 static void b_unlock_jcr_chain(const char *fname, int line)
772 static void unlock_jcr_chain()
775 #ifdef TRACE_JCR_CHAIN
776 Dmsg3(dbglvl, "Unlock jcr chain %d from %s:%d\n", lock_count--, fname, line);
783 * Start walk of jcr chain
784 * The proper way to walk the jcr chain is:
791 * It is possible to leave out the endeach_jcr(jcr), but
792 * in that case, the last jcr referenced must be explicitly
/*
 * Fetch the first JCR on the chain and increment its use count so the
 * caller holds a reference while examining it. The trace message is only
 * emitted for JobId > 0. Chain locking, the empty-chain check and the
 * return are elided in this listing.
 */
798 JCR *jcr_walk_start()
802 jcr = (JCR *)jcrs->first();
804 jcr->inc_use_count();
805 if (jcr->JobId > 0) {
806 Dmsg3(dbglvl, "Inc walk_start jid=%u use_count=%d Job=%s\n",
807 jcr->JobId, jcr->use_count(), jcr->Job);
815 * Get next jcr from chain, and release current one
/*
 *  prev_jcr  entry returned by the previous walk call
 *
 * Fetches the entry following prev_jcr and increments its use count; the
 * trace message is only emitted for JobId > 0. The release of prev_jcr,
 * the end-of-chain check and the return are elided in this listing.
 */
817 JCR *jcr_walk_next(JCR *prev_jcr)
822 jcr = (JCR *)jcrs->next(prev_jcr);
824 jcr->inc_use_count();
825 if (jcr->JobId > 0) {
826 Dmsg3(dbglvl, "Inc walk_next jid=%u use_count=%d Job=%s\n",
827 jcr->JobId, jcr->use_count(), jcr->Job);
838 * Release last jcr referenced
/*
 *  jcr  last entry obtained from jcr_walk_start() or jcr_walk_next()
 *
 * Logs the release for JobId > 0. The NULL check and the call that
 * actually drops the reference are elided in this listing.
 */
840 void jcr_walk_end(JCR *jcr)
843 if (jcr->JobId > 0) {
844 Dmsg3(dbglvl, "Free walk_end jid=%u use_count=%d Job=%s\n",
845 jcr->JobId, jcr->use_count(), jcr->Job);
853 * Setup to call the timeout check routine every 30 seconds
854 * This routine will check any timers that have been enabled.
/*
 * Create a repeating watchdog (one_shot is false) with an interval of 30
 * that calls jcr_timeout_check, and register it. The bool return
 * statement is elided in this listing.
 * NOTE(review): the result of new_watchdog() is used without a visible
 * NULL check.
 */
856 bool init_jcr_subsystem(void)
858 watchdog_t *wd = new_watchdog();
860 wd->one_shot = false;
861 wd->interval = 30; /* FIXME: should be configurable somewhere, even
862 if only with a #define */
863 wd->callback = jcr_timeout_check;
865 register_watchdog(wd);
/*
 * Watchdog callback (installed by init_jcr_subsystem) that looks for
 * threads stalled on a socket read. For store_bsock, file_bsock and a
 * third socket whose assignment is elided here (its message names the
 * Director), a running timer (timer_start != 0) that lies more than
 * bs->timeout behind watchdog_time is switched off, an M_ERROR message is
 * queued, and TIMEOUT_SIGNAL is sent to the thread recorded in
 * jcr->my_thread_id.
 *
 *  self  the watchdog entry that fired; not referenced in the visible lines
 *
 * NOTE(review): the first Dmsg2 prints the jcr pointer with %x, and the
 * Qmsg calls print a difference involving the time_t watchdog_time with
 * %d; both mismatch the argument type on LP64 targets -- consider %p and
 * an explicit cast.
 * NOTE(review): what happens for JobId == 0 entries is not visible here.
 */
870 static void jcr_timeout_check(watchdog_t *self)
876 Dmsg0(dbglvl, "Start JCR timeout checks\n");
878 /* Walk through all JCRs checking if any one is
879 * blocked for more than specified max time.
882 Dmsg2(dbglvl, "jcr_timeout_check JobId=%u jcr=0x%x\n", jcr->JobId, jcr);
883 if (jcr->JobId == 0) {
886 bs = jcr->store_bsock;
888 timer_start = bs->timer_start;
889 if (timer_start && (watchdog_time - timer_start) > bs->timeout) {
890 bs->timer_start = 0; /* turn off timer */
892 Qmsg(jcr, M_ERROR, 0, _(
893 "Watchdog sending kill after %d secs to thread stalled reading Storage daemon.\n"),
894 watchdog_time - timer_start);
895 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
898 bs = jcr->file_bsock;
900 timer_start = bs->timer_start;
901 if (timer_start && (watchdog_time - timer_start) > bs->timeout) {
902 bs->timer_start = 0; /* turn off timer */
904 Qmsg(jcr, M_ERROR, 0, _(
905 "Watchdog sending kill after %d secs to thread stalled reading File daemon.\n"),
906 watchdog_time - timer_start);
907 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
912 timer_start = bs->timer_start;
913 if (timer_start && (watchdog_time - timer_start) > bs->timeout) {
914 bs->timer_start = 0; /* turn off timer */
916 Qmsg(jcr, M_ERROR, 0, _(
917 "Watchdog sending kill after %d secs to thread stalled reading Director.\n"),
918 watchdog_time - timer_start);
919 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
925 Dmsg0(dbglvl, "Finished JCR timeout checks\n");
929 * Timeout signal comes here
/*
 *  sig  signal number delivered; not used in the visible lines
 *
 * Does no work of its own. new_jcr() installs it for TIMEOUT_SIGNAL with
 * sa_flags = 0, and jcr_timeout_check() sends that signal with
 * pthread_kill to a stalled thread.
 */
931 extern "C" void timeout_handler(int sig)
933 return; /* thus interrupting the function */