2 Bacula® - The Network Backup Solution
4 Copyright (C) 2000-2007 Free Software Foundation Europe e.V.
6 The main author of Bacula is Kern Sibbald, with contributions from
7 many others, a complete list can be found in the file AUTHORS.
8 This program is Free Software; you can redistribute it and/or
9 modify it under the terms of version two of the GNU General Public
10 License as published by the Free Software Foundation and included
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Bacula® is a registered trademark of John Walker.
24 The licensor of Bacula is the Free Software Foundation Europe
25 (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
26 Switzerland, email:ftf@fsfeurope.org.
29 * Manipulation routines for Job Control Records and
30 * handling of last_jobs_list.
32 * Kern E. Sibbald, December 2000
36 * These routines are thread safe.
38 * The job list routines were re-written in May 2005 to
39 * eliminate the global lock while traversing the list, and
40 * to use the dlist subroutines. The locking is now done
41 * on the list each time the list is modified or traversed.
42 * That is, it is "micro-locked" rather than globally locked.
43 * The result is that there is one lock/unlock for each entry
44 * in the list while traversing it rather than a single lock
45 * at the beginning of a traversal and one at the end. This
46 * incurs slightly more overhead, but effectively eliminates
47 * the possibility of race conditions. In addition, with the
48 * exception of the global locking of the list during the
49 * re-reading of the config file, no recursion is needed.
56 const int dbglvl = 3400;
58 /* External variables we reference */
59 extern time_t watchdog_time;
61 /* External referenced functions */
62 void free_bregexps(alist *bregexps);
64 /* Forward referenced functions */
65 extern "C" void timeout_handler(int sig);
66 static void jcr_timeout_check(watchdog_t *self);
67 #ifdef TRACE_JCR_CHAIN
68 static void b_lock_jcr_chain(const char *filen, int line);
69 static void b_unlock_jcr_chain(const char *filen, int line);
70 #define lock_jcr_chain() b_lock_jcr_chain(__FILE__, __LINE__);
71 #define unlock_jcr_chain() b_unlock_jcr_chain(__FILE__, __LINE__);
73 static void lock_jcr_chain();
74 static void unlock_jcr_chain();
79 dlist *last_jobs = NULL;
80 const int max_last_jobs = 10;
82 static dlist *jcrs = NULL; /* JCR chain */
83 static pthread_mutex_t jcr_lock = PTHREAD_MUTEX_INITIALIZER;
85 static pthread_mutex_t job_start_mutex = PTHREAD_MUTEX_INITIALIZER;
87 static pthread_mutex_t last_jobs_mutex = PTHREAD_MUTEX_INITIALIZER;
100 void init_last_jobs_list()
103 struct s_last_job *job_entry = NULL;
105 last_jobs = New(dlist(job_entry, &job_entry->link));
108 jcrs = New(dlist(jcr, &jcr->link));
112 void term_last_jobs_list()
115 while (!last_jobs->empty()) {
116 void *je = last_jobs->first();
117 last_jobs->remove(je);
129 bool read_last_jobs_list(int fd, uint64_t addr)
131 struct s_last_job *je, job;
134 Dmsg1(100, "read_last_jobs seek to %d\n", (int)addr);
135 if (addr == 0 || lseek(fd, (off_t)addr, SEEK_SET) < 0) {
138 if (read(fd, &num, sizeof(num)) != sizeof(num)) {
141 Dmsg1(100, "Read num_items=%d\n", num);
142 if (num > 4 * max_last_jobs) { /* sanity check */
145 for ( ; num; num--) {
146 if (read(fd, &job, sizeof(job)) != sizeof(job)) {
148 Pmsg1(000, "Read job entry. ERR=%s\n", be.bstrerror());
152 je = (struct s_last_job *)malloc(sizeof(struct s_last_job));
153 memcpy((char *)je, (char *)&job, sizeof(job));
155 init_last_jobs_list();
157 last_jobs->append(je);
158 if (last_jobs->size() > max_last_jobs) {
159 je = (struct s_last_job *)last_jobs->first();
160 last_jobs->remove(je);
168 uint64_t write_last_jobs_list(int fd, uint64_t addr)
170 struct s_last_job *je;
173 Dmsg1(100, "write_last_jobs seek to %d\n", (int)addr);
174 if (lseek(fd, (off_t)addr, SEEK_SET) < 0) {
178 /* First record is number of entries */
179 num = last_jobs->size();
180 if (write(fd, &num, sizeof(num)) != sizeof(num)) {
182 Pmsg1(000, "Error writing num_items: ERR=%s\n", be.bstrerror());
185 foreach_dlist(je, last_jobs) {
186 if (write(fd, je, sizeof(struct s_last_job)) != sizeof(struct s_last_job)) {
188 Pmsg1(000, "Error writing job: ERR=%s\n", be.bstrerror());
193 /* Return current address */
194 ssize_t stat = lseek(fd, 0, SEEK_CUR);
202 void lock_last_jobs_list()
207 void unlock_last_jobs_list()
213 * Push a subroutine address into the job end callback stack
215 void job_end_push(JCR *jcr, void job_end_cb(JCR *jcr,void *), void *ctx)
217 jcr->job_end_push.append((void *)job_end_cb);
218 jcr->job_end_push.append(ctx);
221 /* Pop each job_end subroutine and call it */
222 static void job_end_pop(JCR *jcr)
224 void (*job_end_cb)(JCR *jcr, void *ctx);
226 for (int i=jcr->job_end_push.size()-1; i > 0; ) {
227 ctx = jcr->job_end_push.get(i--);
228 job_end_cb = (void (*)(JCR *,void *))jcr->job_end_push.get(i--);
229 job_end_cb(jcr, ctx);
234 * Create a Job Control Record and link it into JCR chain
235 * Returns newly allocated JCR
236 * Note, since each daemon has a different JCR, he passes
239 JCR *new_jcr(int size, JCR_free_HANDLER *daemon_free_jcr)
242 MQUEUE_ITEM *item = NULL;
243 struct sigaction sigtimer;
245 Dmsg0(dbglvl, "Enter new_jcr\n");
246 jcr = (JCR *)malloc(size);
247 memset(jcr, 0, size);
248 jcr->my_thread_id = pthread_self();
249 jcr->msg_queue = New(dlist(item, &item->link));
250 jcr->job_end_push.init(1, false);
251 jcr->sched_time = time(NULL);
252 jcr->daemon_free_jcr = daemon_free_jcr; /* plug daemon free routine */
254 jcr->inc_use_count();
255 jcr->VolumeName = get_pool_memory(PM_FNAME);
256 jcr->VolumeName[0] = 0;
257 jcr->errmsg = get_pool_memory(PM_MESSAGE);
259 /* Setup some dummy values */
260 bstrncpy(jcr->Job, "*System*", sizeof(jcr->Job));
262 jcr->JobType = JT_SYSTEM; /* internal job until defined */
263 jcr->JobLevel = L_NONE;
264 set_jcr_job_status(jcr, JS_Created); /* ready to run */
266 sigtimer.sa_flags = 0;
267 sigtimer.sa_handler = timeout_handler;
268 sigfillset(&sigtimer.sa_mask);
269 sigaction(TIMEOUT_SIGNAL, &sigtimer, NULL);
272 * Locking jobs is a global lock that is needed
273 * so that the Director can stop new jobs from being
274 * added to the jcr chain while it processes a new
275 * conf file and does the job_end_push().
280 jcrs = New(dlist(jcr, &jcr->link));
291 * Remove a JCR from the chain
292 * NOTE! The chain must be locked prior to calling
295 static void remove_jcr(JCR *jcr)
297 Dmsg0(dbglvl, "Enter remove_jcr\n");
299 Emsg0(M_ABORT, 0, _("NULL jcr.\n"));
302 Dmsg0(dbglvl, "Leave remove_jcr\n");
306 * Free stuff common to all JCRs. N.B. Be careful to include only
307 * generic stuff in the common part of the jcr.
309 static void free_common_jcr(JCR *jcr)
311 struct s_last_job *je, last_job;
313 /* Keep some statistics */
314 switch (jcr->JobType) {
322 last_job.Errors = jcr->Errors;
323 last_job.JobType = jcr->JobType;
324 last_job.JobId = jcr->JobId;
325 last_job.VolSessionId = jcr->VolSessionId;
326 last_job.VolSessionTime = jcr->VolSessionTime;
327 bstrncpy(last_job.Job, jcr->Job, sizeof(last_job.Job));
328 last_job.JobFiles = jcr->JobFiles;
329 last_job.JobBytes = jcr->JobBytes;
330 last_job.JobStatus = jcr->JobStatus;
331 last_job.JobLevel = jcr->JobLevel;
332 last_job.start_time = jcr->start_time;
333 last_job.end_time = time(NULL);
334 /* Keep list of last jobs, but not Console where JobId==0 */
335 if (last_job.JobId > 0) {
336 je = (struct s_last_job *)malloc(sizeof(struct s_last_job));
337 memcpy((char *)je, (char *)&last_job, sizeof(last_job));
339 init_last_jobs_list();
341 last_jobs->append(je);
342 if (last_jobs->size() > max_last_jobs) {
343 je = (struct s_last_job *)last_jobs->first();
344 last_jobs->remove(je);
352 jcr->destroy_mutex();
354 if (jcr->msg_queue) {
355 delete jcr->msg_queue;
356 jcr->msg_queue = NULL;
358 close_msg(jcr); /* close messages for this job */
360 /* do this after closing messages */
361 if (jcr->client_name) {
362 free_pool_memory(jcr->client_name);
363 jcr->client_name = NULL;
367 free_pool_memory(jcr->attr);
371 if (jcr->sd_auth_key) {
372 free(jcr->sd_auth_key);
373 jcr->sd_auth_key = NULL;
375 if (jcr->VolumeName) {
376 free_pool_memory(jcr->VolumeName);
377 jcr->VolumeName = NULL;
380 if (jcr->dir_bsock) {
381 bnet_close(jcr->dir_bsock);
382 jcr->dir_bsock = NULL;
385 free_pool_memory(jcr->errmsg);
392 if (jcr->RegexWhere) {
393 free(jcr->RegexWhere);
394 jcr->RegexWhere = NULL;
396 if (jcr->where_bregexp) {
397 free_bregexps(jcr->where_bregexp);
398 delete jcr->where_bregexp;
399 jcr->where_bregexp = NULL;
401 if (jcr->cached_path) {
402 free_pool_memory(jcr->cached_path);
403 jcr->cached_path = NULL;
407 free_guid_list(jcr->id_list);
414 * Global routine to free a jcr
417 void b_free_jcr(const char *file, int line, JCR *jcr)
419 Dmsg3(dbglvl, "Enter free_jcr jid=%u from %s:%d\n", jcr->JobId, file, line);
423 void free_jcr(JCR *jcr)
426 Dmsg3(dbglvl, "Enter free_jcr jid=%u use_count=%d Job=%s\n",
427 jcr->JobId, jcr->use_count(), jcr->Job);
431 dequeue_messages(jcr);
433 jcr->dec_use_count(); /* decrement use count */
434 if (jcr->use_count() < 0) {
435 Emsg2(M_ERROR, 0, _("JCR use_count=%d JobId=%d\n"),
436 jcr->use_count(), jcr->JobId);
438 if (jcr->JobId > 0) {
439 Dmsg3(dbglvl, "Dec free_jcr jid=%u use_count=%d Job=%s\n",
440 jcr->JobId, jcr->use_count(), jcr->Job);
442 if (jcr->use_count() > 0) { /* if in use */
446 if (jcr->JobId > 0) {
447 Dmsg3(dbglvl, "remove jcr jid=%u use_count=%d Job=%s\n",
448 jcr->JobId, jcr->use_count(), jcr->Job);
450 remove_jcr(jcr); /* remove Jcr from chain */
453 job_end_pop(jcr); /* pop and call hooked routines */
455 Dmsg1(dbglvl, "End job=%d\n", jcr->JobId);
456 if (jcr->daemon_free_jcr) {
457 jcr->daemon_free_jcr(jcr); /* call daemon free routine */
459 free_common_jcr(jcr);
460 close_msg(NULL); /* flush any daemon messages */
461 garbage_collect_memory_pool();
462 Dmsg0(dbglvl, "Exit free_jcr\n");
466 * Find which JobId corresponds to the current thread
468 uint32_t get_jobid_from_tid()
470 return get_jobid_from_tid(pthread_self());
473 uint32_t get_jobid_from_tid(pthread_t tid)
478 if (pthread_equal(jcr->my_thread_id, tid)) {
479 JobId = (uint32_t)jcr->JobId;
488 * Find the jcr that corresponds to the current thread
490 JCR *get_jcr_from_tid()
492 return get_jcr_from_tid(pthread_self());
495 JCR *get_jcr_from_tid(pthread_t tid)
501 if (pthread_equal(jcr->my_thread_id, tid)) {
513 * Given a JobId, find the JCR
514 * Returns: jcr on success
517 JCR *get_jcr_by_id(uint32_t JobId)
522 if (jcr->JobId == JobId) {
523 jcr->inc_use_count();
524 Dmsg3(dbglvl, "Inc get_jcr jid=%u use_count=%d Job=%s\n",
525 jcr->JobId, jcr->use_count(), jcr->Job);
534 * Given a SessionId and SessionTime, find the JCR
535 * Returns: jcr on success
538 JCR *get_jcr_by_session(uint32_t SessionId, uint32_t SessionTime)
543 if (jcr->VolSessionId == SessionId &&
544 jcr->VolSessionTime == SessionTime) {
545 jcr->inc_use_count();
546 Dmsg3(dbglvl, "Inc get_jcr jid=%u use_count=%d Job=%s\n",
547 jcr->JobId, jcr->use_count(), jcr->Job);
557 * Given a Job, find the JCR
558 * compares on the number of characters in Job
559 * thus allowing partial matches.
560 * Returns: jcr on success
563 JCR *get_jcr_by_partial_name(char *Job)
573 if (strncmp(Job, jcr->Job, len) == 0) {
574 jcr->inc_use_count();
575 Dmsg3(dbglvl, "Inc get_jcr jid=%u use_count=%d Job=%s\n",
576 jcr->JobId, jcr->use_count(), jcr->Job);
586 * Given a Job, find the JCR
587 * requires an exact match of names.
588 * Returns: jcr on success
591 JCR *get_jcr_by_full_name(char *Job)
599 if (strcmp(jcr->Job, Job) == 0) {
600 jcr->inc_use_count();
601 Dmsg3(dbglvl, "Inc get_jcr jid=%u use_count=%d Job=%s\n",
602 jcr->JobId, jcr->use_count(), jcr->Job);
610 void set_jcr_job_status(JCR *jcr, int JobStatus)
613 * For a set of errors, ... keep the current status
614 * so it isn't lost. For all others, set it.
616 Dmsg3(300, "jid=%u OnEntry JobStatus=%c set=%c\n", (uint32_t)jcr->JobId,
617 jcr->JobStatus, JobStatus);
618 switch (jcr->JobStatus) {
619 case JS_ErrorTerminated:
626 case JS_ErrorTerminated:
629 /* Override more minor status */
630 jcr->JobStatus = JobStatus;
635 jcr->JobStatus = JobStatus;
637 Dmsg3(100, "jid=%u OnExit JobStatus=%c set=%c\n", (uint32_t)jcr->JobId,
638 jcr->JobStatus, JobStatus);
641 #ifdef TRACE_JCR_CHAIN
642 static int lock_count = 0;
648 #ifdef TRACE_JCR_CHAIN
649 static void b_lock_jcr_chain(const char *fname, int line)
651 static void lock_jcr_chain()
654 #ifdef TRACE_JCR_CHAIN
655 Dmsg3(dbglvl, "Lock jcr chain %d from %s:%d\n", ++lock_count, fname, line);
663 #ifdef TRACE_JCR_CHAIN
664 static void b_unlock_jcr_chain(const char *fname, int line)
666 static void unlock_jcr_chain()
669 #ifdef TRACE_JCR_CHAIN
670 Dmsg3(dbglvl, "Unlock jcr chain %d from %s:%d\n", lock_count--, fname, line);
677 * Start walk of jcr chain
678 * The proper way to walk the jcr chain is:
685 * It is possible to leave out the endeach_jcr(jcr), but
686 * in that case, the last jcr referenced must be explicitly
692 JCR *jcr_walk_start()
696 jcr = (JCR *)jcrs->first();
698 jcr->inc_use_count();
699 if (jcr->JobId > 0) {
700 Dmsg3(dbglvl, "Inc walk_start jid=%u use_count=%d Job=%s\n",
701 jcr->JobId, jcr->use_count(), jcr->Job);
709 * Get next jcr from chain, and release current one
711 JCR *jcr_walk_next(JCR *prev_jcr)
716 jcr = (JCR *)jcrs->next(prev_jcr);
718 jcr->inc_use_count();
719 if (jcr->JobId > 0) {
720 Dmsg3(dbglvl, "Inc walk_next jid=%u use_count=%d Job=%s\n",
721 jcr->JobId, jcr->use_count(), jcr->Job);
732 * Release last jcr referenced
734 void jcr_walk_end(JCR *jcr)
737 if (jcr->JobId > 0) {
738 Dmsg3(dbglvl, "Free walk_end jid=%u use_count=%d Job=%s\n",
739 jcr->JobId, jcr->use_count(), jcr->Job);
747 * Setup to call the timeout check routine every 30 seconds
748 * This routine will check any timers that have been enabled.
750 bool init_jcr_subsystem(void)
752 watchdog_t *wd = new_watchdog();
754 wd->one_shot = false;
755 wd->interval = 30; /* FIXME: should be configurable somewhere, even
756 if only with a #define */
757 wd->callback = jcr_timeout_check;
759 register_watchdog(wd);
764 static void jcr_timeout_check(watchdog_t *self)
770 Dmsg0(dbglvl, "Start JCR timeout checks\n");
772 /* Walk through all JCRs checking if any one is
773 * blocked for more than specified max time.
776 Dmsg2(dbglvl, "jcr_timeout_check JobId=%u jcr=0x%x\n", jcr->JobId, jcr);
777 if (jcr->JobId == 0) {
780 fd = jcr->store_bsock;
782 timer_start = fd->timer_start;
783 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
784 fd->timer_start = 0; /* turn off timer */
786 Jmsg(jcr, M_ERROR, 0, _(
787 "Watchdog sending kill after %d secs to thread stalled reading Storage daemon.\n"),
788 watchdog_time - timer_start);
789 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
792 fd = jcr->file_bsock;
794 timer_start = fd->timer_start;
795 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
796 fd->timer_start = 0; /* turn off timer */
798 Jmsg(jcr, M_ERROR, 0, _(
799 "Watchdog sending kill after %d secs to thread stalled reading File daemon.\n"),
800 watchdog_time - timer_start);
801 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
806 timer_start = fd->timer_start;
807 if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
808 fd->timer_start = 0; /* turn off timer */
810 Jmsg(jcr, M_ERROR, 0, _(
811 "Watchdog sending kill after %d secs to thread stalled reading Director.\n"),
812 watchdog_time - timer_start);
813 pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
819 Dmsg0(dbglvl, "Finished JCR timeout checks\n");
823 * Timeout signal comes here
825 extern "C" void timeout_handler(int sig)
827 return; /* thus interrupting the function */