3 * Bacula Director Job processing routines
5 * Kern Sibbald, October MM
10 Copyright (C) 2000-2005 Kern Sibbald
12 This program is free software; you can redistribute it and/or
13 modify it under the terms of the GNU General Public License as
14 published by the Free Software Foundation; either version 2 of
15 the License, or (at your option) any later version.
17 This program is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public
23 License along with this program; if not, write to the Free
24 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
32 /* Forward referenced subroutines */
/* Worker routine handed to jobq_init() by init_job_server(). */
33 static void *job_thread(void *arg);
/* Callback and destructor installed on the job monitor watchdog. */
34 static void job_monitor_watchdog(watchdog_t *self);
35 static void job_monitor_destructor(watchdog_t *self);
/* Limit checks that job_monitor_watchdog() runs for a job it examines. */
36 static bool job_check_maxwaittime(JCR *control_jcr, JCR *jcr);
37 static bool job_check_maxruntime(JCR *control_jcr, JCR *jcr);
39 /* Exported subroutines */
41 /* Imported subroutines */
42 extern void term_scheduler();
43 extern void term_ua_server();
/*
 * Presumably the per-job-type routines dispatched from job_thread();
 * only the do_mac() call is visible in this listing -- TODO confirm.
 */
44 extern int do_backup(JCR *jcr);
45 extern bool do_mac(JCR *jcr);
46 extern int do_admin(JCR *jcr);
47 extern int do_restore(JCR *jcr);
48 extern bool do_verify(JCR *jcr);
50 /* Imported variables */
/* Time value the MaxWaitTime/MaxRunTime checks compare with jcr->start_time. */
51 extern time_t watchdog_time;
/*
 * Set up the Director's job machinery.
 *
 * Initializes the job queue with job_thread() as its worker routine,
 * passing max_workers through to jobq_init(), then creates the job
 * monitor watchdog, gives it a control JCR and registers it.
 * A failure of either step is reported with M_ABORT.
 *
 * NOTE(review): the declarations of stat, wd and be are not among the
 * lines shown here.
 */
55 void init_job_server(int max_workers)
60 if ((stat = jobq_init(&job_queue, max_workers, job_thread)) != 0) {
62 Emsg1(M_ABORT, 0, _("Could not init job queue: ERR=%s\n"), be.strerror(stat));
64 if ((wd = new_watchdog()) == NULL) {
65 Emsg0(M_ABORT, 0, _("Could not init job monitor watchdogs\n"));
67 wd->callback = job_monitor_watchdog;
68 wd->destructor = job_monitor_destructor;
/* The watchdog owns this control JCR; job_monitor_destructor() frees it. */
71 wd->data = new_control_jcr("*JobMonitor*", JT_SYSTEM);
72 register_watchdog(wd);
75 void term_job_server()
77 jobq_destroy(&job_queue); /* ignore any errors */
81 * Run a job -- typically called by the scheduler, but may also
82 * be called by the UA (Console program).
84 * Returns: 0 on failure
/*
 * Prepare a job and queue it for execution.
 *
 * Visible steps, in order: set up message handling, initialize the
 * term_wait condition variable, open the catalog database described by
 * jcr->catalog, build a unique job name, create the catalog Job record
 * (which supplies the JobId), get or create the Client record, allocate
 * jcr->fname, and add the JCR to the job queue.  Failures are reported
 * with M_FATAL.
 *
 * NOTE(review): local declarations, the return statements and the control
 * flow that reaches the final JS_ErrorTerminated status are not among the
 * lines shown here -- confirm against the full file.
 */
88 JobId_t run_job(JCR *jcr)
94 sm_check(__FILE__, __LINE__, true);
95 init_msg(jcr, jcr->messages);
97 /* Initialize termination condition variable */
98 if ((errstat = pthread_cond_init(&jcr->term_wait, NULL)) != 0) {
100 Jmsg1(jcr, M_FATAL, 0, _("Unable to init job cond variable: ERR=%s\n"), be.strerror(errstat));
/* Remember that term_wait exists so dird_free_jcr() destroys it. */
103 jcr->term_wait_inited = true;
/* Open the catalog using the connection settings of the job's catalog resource. */
108 Dmsg0(50, "Open database\n");
109 jcr->db=db_init_database(jcr, jcr->catalog->db_name, jcr->catalog->db_user,
110 jcr->catalog->db_password, jcr->catalog->db_address,
111 jcr->catalog->db_port, jcr->catalog->db_socket,
112 jcr->catalog->mult_db_connections);
113 if (!jcr->db || !db_open_database(jcr, jcr->db)) {
114 Jmsg(jcr, M_FATAL, 0, _("Could not open database \"%s\".\n"),
115 jcr->catalog->db_name);
117 Jmsg(jcr, M_FATAL, 0, "%s", db_strerror(jcr->db));
121 Dmsg0(50, "DB opened\n");
/* Name the job, then create its catalog record in state JS_Created. */
126 create_unique_job_name(jcr, jcr->job->hdr.name);
127 set_jcr_job_status(jcr, JS_Created);
128 init_jcr_job_record(jcr);
129 if (!db_create_job_record(jcr, jcr->db, &jcr->jr)) {
130 Jmsg(jcr, M_FATAL, 0, "%s", db_strerror(jcr->db));
/* The JobId comes from the catalog record that was just created. */
133 JobId = jcr->JobId = jcr->jr.JobId;
135 Dmsg4(100, "Created job record JobId=%d Name=%s Type=%c Level=%c\n",
136 jcr->JobId, jcr->Job, jcr->jr.JobType, jcr->jr.JobLevel);
138 if (!get_or_create_client_record(jcr)) {
143 jcr->fname = get_pool_memory(PM_FNAME);
146 Dmsg0(200, "Add jrc to work queue\n");
148 /* Queue the job to be run */
149 if ((stat = jobq_add(&job_queue, jcr)) != 0) {
151 Jmsg(jcr, M_FATAL, 0, _("Could not add job queue: ERR=%s\n"), be.strerror(stat));
155 Dmsg0(100, "Done run_job()\n");
/* Presumably the common failure exit -- TODO confirm which paths reach it. */
161 set_jcr_job_status(jcr, JS_ErrorTerminated);
169 * This is the engine called by jobq.c:jobq_add() when we were pulled
170 * from the work queue.
171 * At this point, we are running in our own thread and all
172 * necessary resources are allocated -- see jobq.c
/*
 * Worker routine run for a job taken from the work queue.
 *
 * Records and detaches the running thread, stamps the real start time,
 * marks the job JS_Running and updates its start record.  A job that is
 * already canceled, or whose MaxStartDelay is smaller than the gap between
 * scheduled and real start time, is finished off immediately through
 * update_job_end_record().  Otherwise the visible code generates the
 * "StartJob" event, runs the optional RunBeforeJob command, dispatches on
 * jcr->JobType, runs RunAfterJob or RunAfterFailedJob depending on the
 * final status, generates the "EndJob" event and sends any queued messages.
 *
 * NOTE(review): case labels, most per-type calls and the function's
 * return are not among the lines shown here.
 */
174 static void *job_thread(void *arg)
176 JCR *jcr = (JCR *)arg;
178 jcr->my_thread_id = pthread_self();
179 pthread_detach(jcr->my_thread_id);
180 sm_check(__FILE__, __LINE__, true);
183 Dmsg0(200, "=====Start Job=========\n");
184 jcr->start_time = time(NULL); /* set the real start time */
185 jcr->jr.StartTime = jcr->start_time;
186 set_jcr_job_status(jcr, JS_Running);
187 if (!db_update_job_start_record(jcr, jcr->db, &jcr->jr)) {
188 Jmsg(jcr, M_FATAL, 0, "%s", db_strerror(jcr->db));
191 if (job_canceled(jcr)) {
192 update_job_end_record(jcr);
193 } else if (jcr->job->MaxStartDelay != 0 && jcr->job->MaxStartDelay <
194 (utime_t)(jcr->start_time - jcr->sched_time)) {
195 Jmsg(jcr, M_FATAL, 0, _("Job canceled because max start delay time exceeded.\n"));
196 set_jcr_job_status(jcr, JS_Canceled);
197 update_job_end_record(jcr);
201 generate_event(jcr, "StartJob");
202 if (jcr->job->RunBeforeJob) {
203 POOLMEM *before = get_pool_memory(PM_FNAME);
206 char line[MAXSTRING];
/* Expand job codes in the configured command, run it and log its output. */
208 before = edit_job_codes(jcr, before, jcr->job->RunBeforeJob, "");
209 bpipe = open_bpipe(before, 0, "r");
210 free_pool_memory(before);
/*
 * NOTE(review): bpipe is dereferenced right after open_bpipe() with no
 * NULL check in between -- confirm open_bpipe() cannot fail here.
 */
211 while (fgets(line, sizeof(line), bpipe->rfd)) {
212 Jmsg(jcr, M_INFO, 0, _("RunBefore: %s"), line);
214 status = close_bpipe(bpipe);
/* A RunBeforeJob error is fatal: the job is marked JS_FatalError and closed. */
217 Jmsg(jcr, M_FATAL, 0, _("RunBeforeJob error: ERR=%s\n"), be.strerror(status));
218 set_jcr_job_status(jcr, JS_FatalError);
219 update_job_end_record(jcr);
/* Dispatch on the job type; each visible branch then tests for JS_Terminated. */
223 switch (jcr->JobType) {
226 if (jcr->JobStatus == JS_Terminated) {
232 if (jcr->JobStatus == JS_Terminated) {
238 if (jcr->JobStatus == JS_Terminated) {
244 if (jcr->JobStatus == JS_Terminated) {
251 do_mac(jcr); /* migration, archive, copy */
252 if (jcr->JobStatus == JS_Terminated) {
257 Pmsg1(0, "Unimplemented job type: %d\n", jcr->JobType);
/* RunAfterJob applies to a JS_Terminated job, RunAfterFailedJob to any other status. */
260 if ((jcr->job->RunAfterJob && jcr->JobStatus == JS_Terminated) ||
261 (jcr->job->RunAfterFailedJob && jcr->JobStatus != JS_Terminated)) {
262 POOLMEM *after = get_pool_memory(PM_FNAME);
265 char line[MAXSTRING];
267 if (jcr->JobStatus == JS_Terminated) {
268 after = edit_job_codes(jcr, after, jcr->job->RunAfterJob, "");
270 after = edit_job_codes(jcr, after, jcr->job->RunAfterFailedJob, "");
272 bpipe = open_bpipe(after, 0, "r");
273 free_pool_memory(after);
/*
 * NOTE(review): same pattern as for RunBeforeJob -- bpipe is used
 * without a NULL check after open_bpipe().
 */
274 while (fgets(line, sizeof(line), bpipe->rfd)) {
275 Jmsg(jcr, M_INFO, 0, _("RunAfter: %s"), line);
277 status = close_bpipe(bpipe);
279 * Note, if we get an error here, do not mark the
280 * job in error, simply report the error condition.
284 if (jcr->JobStatus == JS_Terminated) {
285 Jmsg(jcr, M_WARNING, 0, _("RunAfterJob error: ERR=%s\n"), be.strerror(status));
287 Jmsg(jcr, M_FATAL, 0, _("RunAfterFailedJob error: ERR=%s\n"), be.strerror(status));
291 generate_event(jcr, "EndJob");
292 /* Send off any queued messages */
293 if (jcr->msg_queue->size() > 0) {
294 dequeue_messages(jcr);
301 Dmsg1(50, "======== End Job stat=%c ==========\n", jcr->JobStatus);
302 sm_check(__FILE__, __LINE__, true);
308 * Cancel a job -- typically called by the UA (Console program), but may also
309 * be called by the job watchdog.
311 * Returns: 1 if cancel appears to be successful
312 * 0 on failure. Message sent to ua->jcr.
/*
 * Cancel the job described by jcr on behalf of the user agent ua.
 *
 * Jobs in one of the listed waiting states are marked JS_Canceled, the
 * console is told, and removal from the job queue is attempted.  The
 * remaining visible path marks the job JS_Canceled and, for each daemon
 * socket the job holds (File daemon, Storage daemon), opens a separate
 * connection through ua->jcr, sends "cancel Job=<name>", relays the
 * replies to the console and sends BNET_TERMINATE on that connection.
 *
 * NOTE(review): several case labels and the return statements are not
 * among the lines shown here.
 */
314 int cancel_job(UAContext *ua, JCR *jcr)
318 switch (jcr->JobStatus) {
321 case JS_WaitClientRes:
322 case JS_WaitStoreRes:
323 case JS_WaitPriority:
325 case JS_WaitStartTime:
326 set_jcr_job_status(jcr, JS_Canceled);
327 bsendmsg(ua, _("JobId %d, Job %s marked to be canceled.\n"),
328 jcr->JobId, jcr->Job);
329 jobq_remove(&job_queue, jcr); /* attempt to remove it from queue */
333 set_jcr_job_status(jcr, JS_Canceled);
335 /* Cancel File daemon */
336 if (jcr->file_bsock) {
/* Borrow the job's client so ua->jcr connects to the same File daemon. */
337 ua->jcr->client = jcr->client;
338 if (!connect_to_file_daemon(ua->jcr, 10, FDConnectTimeout, 1)) {
339 bsendmsg(ua, _("Failed to connect to File daemon.\n"));
342 Dmsg0(200, "Connected to file daemon\n");
343 fd = ua->jcr->file_bsock;
344 bnet_fsend(fd, "cancel Job=%s\n", jcr->Job);
/* Forward every reply line from the daemon to the console. */
345 while (bnet_recv(fd) >= 0) {
346 bsendmsg(ua, "%s", fd->msg);
348 bnet_sig(fd, BNET_TERMINATE);
/* Drop the temporary connection pointer from ua->jcr. */
350 ua->jcr->file_bsock = NULL;
353 /* Cancel Storage daemon */
354 if (jcr->store_bsock) {
/*
 * Give ua->jcr a storage definition: copy the job's list when it has
 * none; set_storage() presumably runs in an else branch whose line is
 * not shown -- TODO confirm.
 */
355 if (!ua->jcr->storage) {
356 copy_storage(ua->jcr, jcr);
358 set_storage(ua->jcr, jcr->store);
360 if (!connect_to_storage_daemon(ua->jcr, 10, SDConnectTimeout, 1)) {
361 bsendmsg(ua, _("Failed to connect to Storage daemon.\n"));
364 Dmsg0(200, "Connected to storage daemon\n");
365 sd = ua->jcr->store_bsock;
366 bnet_fsend(sd, "cancel Job=%s\n", jcr->Job);
367 while (bnet_recv(sd) >= 0) {
368 bsendmsg(ua, "%s", sd->msg);
370 bnet_sig(sd, BNET_TERMINATE);
/* Drop the temporary connection pointer from ua->jcr. */
372 ua->jcr->store_bsock = NULL;
/*
 * Watchdog destructor: release the control JCR that init_job_server()
 * stored in the watchdog's data pointer.
 */
380 static void job_monitor_destructor(watchdog_t *self)
382 JCR *control_jcr = (JCR *) self->data;
384 free_jcr(control_jcr);
/*
 * Watchdog callback: examine jobs and cancel those whose MaxWaitTime or
 * MaxRunTime check asks for it.
 *
 * JCRs with JobId 0 are skipped.  For the others both checks are run and
 * their results OR-ed together.  The cancel path builds a UAContext whose
 * jcr is the monitor's control JCR (taken from self->data).
 *
 * NOTE(review): the loop over the JCRs, the declaration of cancel and the
 * cancel call itself are not among the lines shown here.
 */
387 static void job_monitor_watchdog(watchdog_t *self)
389 JCR *control_jcr, *jcr;
391 control_jcr = (JCR *)self->data;
393 Dmsg1(800, "job_monitor_watchdog %p called\n", self);
400 if (jcr->JobId == 0) {
401 Dmsg2(800, "Skipping JCR %p (%s) with JobId 0\n",
403 /* Keep reference counts correct */
404 free_locked_jcr(jcr);
408 /* check MaxWaitTime */
409 cancel = job_check_maxwaittime(control_jcr, jcr);
411 /* check MaxRunTime */
412 cancel |= job_check_maxruntime(control_jcr, jcr);
415 Dmsg3(800, "Cancelling JCR %p jobid %d (%s)\n",
416 jcr, jcr->JobId, jcr->Job);
/* The context is created from the job's JCR, then pointed at the control JCR. */
418 UAContext *ua = new_ua_context(jcr);
419 ua->jcr = control_jcr;
423 Dmsg1(800, "Have cancelled JCR %p\n", jcr);
426 /* Keep reference counts correct */
427 free_locked_jcr(jcr);
433 * Check if the maxwaittime has expired and it is possible
/*
 * Decide whether jcr should be canceled because its MaxWaitTime expired.
 *
 * A MaxWaitTime of 0 is tested first, and so is the case where
 * watchdog_time - jcr->start_time is still below the limit.  Past the
 * limit the decision depends on jcr->JobStatus and, for a running job, on
 * jcr->SDJobStatus (see the debug messages for the three groups).
 *
 * NOTE(review): the declaration of cancel, the return statements and
 * several case labels are not among the lines shown here.
 * NOTE(review): the final Dmsg3 passes jcr->job (the job resource
 * pointer) for its "(%s)" slot, where the earlier messages pass the
 * string jcr->Job -- looks like a typo, confirm and fix.
 */
436 static bool job_check_maxwaittime(JCR *control_jcr, JCR *jcr)
440 if (jcr->job->MaxWaitTime == 0) {
443 if ((watchdog_time - jcr->start_time) < jcr->job->MaxWaitTime) {
444 Dmsg3(800, "Job %p (%s) with MaxWaitTime %d not expired\n",
445 jcr, jcr->Job, jcr->job->MaxWaitTime);
448 Dmsg3(800, "Job %d (%s): MaxWaitTime of %d seconds exceeded, "
450 jcr->JobId, jcr->Job, jcr->job->MaxWaitTime);
451 switch (jcr->JobStatus) {
456 case JS_WaitStoreRes:
457 case JS_WaitClientRes:
459 case JS_WaitPriority:
461 case JS_WaitStartTime:
463 Dmsg0(200, "JCR blocked in #1\n");
466 Dmsg0(800, "JCR running, checking SD status\n");
467 switch (jcr->SDJobStatus) {
472 Dmsg0(800, "JCR blocked in #2\n");
475 Dmsg0(800, "JCR not blocked in #2\n");
480 case JS_ErrorTerminated:
483 Dmsg0(800, "JCR already dead in #3\n");
486 Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"),
489 Dmsg3(800, "MaxWaitTime result: %scancel JCR %p (%s)\n",
490 cancel ? "" : "do not ", jcr, jcr->job);
496 * Check if maxruntime has expired and if the job can be
/*
 * Decide whether jcr should be canceled because its MaxRunTime expired.
 *
 * A MaxRunTime of 0 is tested first, and so is the case where
 * watchdog_time - jcr->start_time is still below the limit.  Past the
 * limit the decision depends on jcr->JobStatus.
 *
 * NOTE(review): the declaration of cancel, the return statements, several
 * case labels and the statements under them are not among the lines shown
 * here.
 * NOTE(review): the final Dmsg3 passes jcr->job (the job resource
 * pointer) for its "(%s)" slot, where the earlier message passes the
 * string jcr->Job -- looks like a typo, confirm and fix.
 */
499 static bool job_check_maxruntime(JCR *control_jcr, JCR *jcr)
503 if (jcr->job->MaxRunTime == 0) {
506 if ((watchdog_time - jcr->start_time) < jcr->job->MaxRunTime) {
507 Dmsg3(200, "Job %p (%s) with MaxRunTime %d not expired\n",
508 jcr, jcr->Job, jcr->job->MaxRunTime);
512 switch (jcr->JobStatus) {
518 case JS_WaitStoreRes:
519 case JS_WaitClientRes:
521 case JS_WaitPriority:
523 case JS_WaitStartTime:
528 case JS_ErrorTerminated:
534 Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"),
538 Dmsg3(200, "MaxRunTime result: %scancel JCR %p (%s)\n",
539 cancel ? "" : "do not ", jcr, jcr->job);
546 * Get or create a Client record for this Job
/*
 * Get or create the catalog Client record for this job.
 *
 * A zeroed client record is filled from the job's client resource (name,
 * AutoPrune, FileRetention, JobRetention) and handed to
 * db_create_client_record().  On the way jcr->client_name is (allocated
 * and) set to the resource name; afterwards jcr->jr.ClientId and
 * jcr->client_uname are taken from the record.  A database failure is
 * reported with M_FATAL.
 *
 * NOTE(review): the declaration of cr, the return statements and at least
 * one conditional around the client_uname handling are not among the
 * lines shown here.
 */
548 bool get_or_create_client_record(JCR *jcr)
552 memset(&cr, 0, sizeof(cr));
553 bstrncpy(cr.Name, jcr->client->hdr.name, sizeof(cr.Name));
554 cr.AutoPrune = jcr->client->AutoPrune;
555 cr.FileRetention = jcr->client->FileRetention;
556 cr.JobRetention = jcr->client->JobRetention;
/* Allocate the name buffer only once; it is overwritten on every call. */
557 if (!jcr->client_name) {
558 jcr->client_name = get_pool_memory(PM_NAME);
560 pm_strcpy(jcr->client_name, jcr->client->hdr.name);
561 if (!db_create_client_record(jcr, jcr->db, &cr)) {
562 Jmsg(jcr, M_FATAL, 0, _("Could not create Client record. ERR=%s\n"),
563 db_strerror(jcr->db));
/* Link the Job record to the client row. */
566 jcr->jr.ClientId = cr.ClientId;
568 if (!jcr->client_uname) {
569 jcr->client_uname = get_pool_memory(PM_NAME);
571 pm_strcpy(jcr->client_uname, cr.Uname);
573 Dmsg2(100, "Created Client %s record %d\n", jcr->client->hdr.name,
/*
 * Fill *fsr for the job's FileSet and get or create the matching catalog
 * record.
 *
 * fsr is zeroed and given the FileSet name.  When the FileSet resource
 * has an MD5, the digest is finalized on a local copy of the resource's
 * MD5 context, base64-encoded into fsr->MD5 and copied back to
 * jcr->fileset->MD5; a warning message exists for the case without one.
 * A record is created unless ignore_fs_changes is set and
 * db_get_fileset_record() finds an existing one.  jcr->jr.FileSetId is
 * set from fsr.
 *
 * NOTE(review): the else/closing lines, the return statements and the
 * condition guarding the "Created new FileSet record" message are not
 * among the lines shown here.
 */
578 bool get_or_create_fileset_record(JCR *jcr, FILESET_DBR *fsr)
581 * Get or Create FileSet record
583 memset(fsr, 0, sizeof(FILESET_DBR));
584 bstrncpy(fsr->FileSet, jcr->fileset->hdr.name, sizeof(fsr->FileSet));
585 if (jcr->fileset->have_MD5) {
586 struct MD5Context md5c;
587 unsigned char signature[16];
/* Finalize a copy so the context stored in the resource is not modified. */
588 memcpy(&md5c, &jcr->fileset->md5c, sizeof(md5c));
589 MD5Final(signature, &md5c);
590 bin_to_base64(fsr->MD5, (char *)signature, 16); /* encode 16 bytes */
591 bstrncpy(jcr->fileset->MD5, fsr->MD5, sizeof(jcr->fileset->MD5));
593 Jmsg(jcr, M_WARNING, 0, _("FileSet MD5 signature not found.\n"));
/* Look up an existing record only when FileSet changes are to be ignored. */
595 if (!jcr->fileset->ignore_fs_changes ||
596 !db_get_fileset_record(jcr, jcr->db, fsr)) {
597 if (!db_create_fileset_record(jcr, jcr->db, fsr)) {
598 Jmsg(jcr, M_ERROR, 0, _("Could not create FileSet \"%s\" record. ERR=%s\n"),
599 fsr->FileSet, db_strerror(jcr->db));
603 jcr->jr.FileSetId = fsr->FileSetId;
605 Jmsg(jcr, M_INFO, 0, _("Created new FileSet record \"%s\" %s\n"),
606 fsr->FileSet, fsr->cCreateTime);
608 Dmsg2(119, "Created FileSet %s record %u\n", jcr->fileset->hdr.name,
613 void init_jcr_job_record(JCR *jcr)
615 jcr->jr.SchedTime = jcr->sched_time;
616 jcr->jr.StartTime = jcr->start_time;
617 jcr->jr.EndTime = 0; /* perhaps rescheduled, clear it */
618 jcr->jr.JobType = jcr->JobType;
619 jcr->jr.JobLevel = jcr->JobLevel;
620 jcr->jr.JobStatus = jcr->JobStatus;
621 jcr->jr.JobId = jcr->JobId;
622 bstrncpy(jcr->jr.Name, jcr->job->hdr.name, sizeof(jcr->jr.Name));
623 bstrncpy(jcr->jr.Job, jcr->Job, sizeof(jcr->jr.Job));
627 * Write status and such in DB
629 void update_job_end_record(JCR *jcr)
631 jcr->jr.EndTime = time(NULL);
632 jcr->end_time = jcr->jr.EndTime;
633 jcr->jr.JobId = jcr->JobId;
634 jcr->jr.JobStatus = jcr->JobStatus;
635 jcr->jr.JobFiles = jcr->JobFiles;
636 jcr->jr.JobBytes = jcr->JobBytes;
637 jcr->jr.VolSessionId = jcr->VolSessionId;
638 jcr->jr.VolSessionTime = jcr->VolSessionTime;
639 if (!db_update_job_end_record(jcr, jcr->db, &jcr->jr)) {
640 Jmsg(jcr, M_WARNING, 0, _("Error updating job record. %s"),
641 db_strerror(jcr->db));
646 * Takes base_name and appends (unique) current
647 * date and time to form unique job name.
649 * Returns: unique job name in jcr->Job
650 * date/time in jcr->start_time
/*
 * Build the unique job name "<base_name>.<date_time>" in jcr->Job and
 * store the time stamp used for it in jcr->start_time.
 *
 * A static mutex and the static last_start_time serialize callers: while
 * the current time equals the last one handed out, the function sleeps
 * in half-second steps.  The base name is cut short before the suffix is
 * appended, and the result is then scanned character by character.
 *
 * NOTE(review): the declarations of now, tm and p, the statement that
 * refreshes now and the body of the final loop are not among the lines
 * shown here.
 */
652 void create_unique_job_name(JCR *jcr, const char *base_name)
654 /* Job start mutex */
655 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
656 static time_t last_start_time = 0;
659 char dt[MAX_TIME_LENGTH];
660 char name[MAX_NAME_LENGTH];
663 /* Guarantee unique start time -- maximum one per second, and
664 * thus unique Job Name
666 P(mutex); /* lock creation of jobs */
668 while (now == last_start_time) {
669 bmicrosleep(0, 500000);
/* Publish the time stamp that was handed out before releasing the lock. */
672 last_start_time = now;
673 V(mutex); /* allow creation of jobs */
674 jcr->start_time = now;
675 /* Form Unique JobName */
676 localtime_r(&now, &tm);
677 /* Use only characters that are permitted in Windows filenames */
678 strftime(dt, sizeof(dt), "%Y-%m-%d_%H.%M.%S", &tm);
679 bstrncpy(name, base_name, sizeof(name));
/* Presumably leaves room for the '.' and the date/time suffix -- TODO confirm the 22. */
680 name[sizeof(name)-22] = 0; /* truncate if too long */
681 bsnprintf(jcr->Job, sizeof(jcr->Job), "%s.%s", name, dt); /* add date & time */
682 /* Convert spaces into underscores */
683 for (p=jcr->Job; *p; p++) {
691 * Free the Job Control Record if no one is still using it.
692 * Called from main free_jcr() routine in src/lib/jcr.c so
693 * that we can do our Director specific cleanup of the jcr.
/*
 * Director-specific cleanup of a JCR.
 *
 * In the visible lines this releases the SD authorization key, closes the
 * File and Storage daemon sockets, returns the fname and stime pool
 * buffers, frees RestoreBootstrap and client_uname, destroys the
 * term_wait condition variable when it was initialized, and destroys the
 * job_end_push list.  Released pointers are reset to NULL where shown.
 *
 * NOTE(review): several lines of this function (including the guards
 * around the fname/stime frees and the storage-list deletion announced
 * by the comment near the end) are not among the lines shown here.
 */
695 void dird_free_jcr(JCR *jcr)
697 Dmsg0(200, "Start dird free_jcr\n");
699 if (jcr->sd_auth_key) {
700 free(jcr->sd_auth_key);
701 jcr->sd_auth_key = NULL;
707 if (jcr->file_bsock) {
708 Dmsg0(200, "Close File bsock\n");
709 bnet_close(jcr->file_bsock);
710 jcr->file_bsock = NULL;
712 if (jcr->store_bsock) {
713 Dmsg0(200, "Close Store bsock\n");
714 bnet_close(jcr->store_bsock);
715 jcr->store_bsock = NULL;
718 Dmsg0(200, "Free JCR fname\n");
719 free_pool_memory(jcr->fname);
723 Dmsg0(200, "Free JCR stime\n");
724 free_pool_memory(jcr->stime);
727 if (jcr->RestoreBootstrap) {
728 free(jcr->RestoreBootstrap);
729 jcr->RestoreBootstrap = NULL;
731 if (jcr->client_uname) {
732 free_pool_memory(jcr->client_uname);
733 jcr->client_uname = NULL;
/* Only destroy the condition variable if run_job() managed to create it. */
735 if (jcr->term_wait_inited) {
736 pthread_cond_destroy(&jcr->term_wait);
738 /* Delete lists setup to hold storage pointers */
742 jcr->job_end_push.destroy();
743 Dmsg0(200, "End dird free_jcr\n");
747 * Set some defaults in the JCR necessary to
748 * run. These items are pulled from the job
749 * definition as defaults, but can be overridden
750 * later either by the Run record in the Schedule resource,
751 * or by the Console program.
/*
 * Load the defaults a JCR needs to run from the job resource.
 *
 * Copies type, level and priority, builds jcr->storage as a non-owning
 * list of the job's storage entries (with jcr->store set to its first
 * entry), and copies client, client name, pools, catalog (taken from the
 * job's client), fileset, messages, spooling flags, RestoreBootstrap and
 * verify_job.  If JobLevel is still 0 afterwards, a level is chosen by
 * job type.
 *
 * NOTE(review): the case labels of both switches, the declaration of st
 * and the conditions around the storage copy are not among the lines
 * shown here.
 */
753 void set_jcr_defaults(JCR *jcr, JOB *job)
757 jcr->JobType = job->JobType;
758 switch (jcr->JobType) {
761 jcr->JobLevel = L_NONE;
764 jcr->JobLevel = job->JobLevel;
767 jcr->JobPriority = job->Priority;
768 /* Copy storage definitions -- deleted in dir_free_jcr above */
773 jcr->storage = New(alist(10, not_owned_by_alist));
774 foreach_alist(st, job->storage) {
775 jcr->storage->append(st);
779 jcr->store = (STORE *)jcr->storage->first();
781 jcr->client = job->client;
/* Allocate the name buffer only once; it is overwritten on every call. */
782 if (!jcr->client_name) {
783 jcr->client_name = get_pool_memory(PM_NAME);
785 pm_strcpy(jcr->client_name, jcr->client->hdr.name);
786 jcr->pool = job->pool;
787 jcr->full_pool = job->full_pool;
788 jcr->inc_pool = job->inc_pool;
789 jcr->dif_pool = job->dif_pool;
/* The catalog is the one attached to the job's client. */
790 jcr->catalog = job->client->catalog;
791 jcr->fileset = job->fileset;
792 jcr->messages = job->messages;
793 jcr->spool_data = job->spool_data;
794 jcr->write_part_after_job = job->write_part_after_job;
/* Drop any bootstrap left from an earlier use before taking the job's copy. */
795 if (jcr->RestoreBootstrap) {
796 free(jcr->RestoreBootstrap);
797 jcr->RestoreBootstrap = NULL;
799 /* This can be overridden by Console program */
800 if (job->RestoreBootstrap) {
801 jcr->RestoreBootstrap = bstrdup(job->RestoreBootstrap);
803 /* This can be overridden by Console program */
804 jcr->verify_job = job->verify_job;
805 /* If no default level given, set one */
806 if (jcr->JobLevel == 0) {
807 switch (jcr->JobType) {
809 jcr->JobLevel = L_VERIFY_CATALOG;
812 jcr->JobLevel = L_INCREMENTAL;
816 jcr->JobLevel = L_NONE;
825 * copy the storage definitions from an old JCR to a new one
827 void copy_storage(JCR *new_jcr, JCR *old_jcr)
829 if (old_jcr->storage) {
831 if (old_jcr->storage) {
832 delete old_jcr->storage;
834 new_jcr->storage = New(alist(10, not_owned_by_alist));
835 foreach_alist(st, old_jcr->storage) {
836 new_jcr->storage->append(st);
839 if (old_jcr->store) {
840 new_jcr->store = old_jcr->store;
841 } else if (new_jcr->storage) {
842 new_jcr->store = (STORE *)new_jcr->storage->first();
846 /* Set storage override */
/*
 * Apply a storage override to jcr.
 *
 * jcr->storage is scanned for store; the visible tail adds store at the
 * front of the list for the case where it was not found.
 *
 * NOTE(review): the declaration of storage, what happens when a match is
 * found, and any assignment to jcr->store are not among the lines shown
 * here.
 */
847 void set_storage(JCR *jcr, STORE *store)
852 foreach_alist(storage, jcr->storage) {
853 if (store == storage) {
857 /* Store not in list, so add it */
858 jcr->storage->prepend(store);