3 * Bacula Director Job processing routines
5 * Kern Sibbald, October MM
10 Copyright (C) 2000-2004 Kern Sibbald and John Walker
12 This program is free software; you can redistribute it and/or
13 modify it under the terms of the GNU General Public License as
14 published by the Free Software Foundation; either version 2 of
15 the License, or (at your option) any later version.
17 This program is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public
23 License along with this program; if not, write to the Free
24 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
32 /* Forward referenced subroutines */
/* Worker routine handed to jobq_init() by init_job_server(). */
33 static void *job_thread(void *arg);
/* Callback and destructor installed on the job monitor watchdog. */
34 static void job_monitor_watchdog(watchdog_t *self);
35 static void job_monitor_destructor(watchdog_t *self);
/* Per-job limit checks called from job_monitor_watchdog(). */
36 static bool job_check_maxwaittime(JCR *control_jcr, JCR *jcr);
37 static bool job_check_maxruntime(JCR *control_jcr, JCR *jcr);
39 /* Exported subroutines */
41 /* Imported subroutines */
42 extern void term_scheduler();
43 extern void term_ua_server();
/*
 * Job handlers defined outside this file.
 * NOTE(review): presumably dispatched from job_thread()'s switch on
 * jcr->JobType -- the call sites are not visible in this listing; confirm.
 */
44 extern int do_backup(JCR *jcr);
45 extern int do_admin(JCR *jcr);
46 extern int do_restore(JCR *jcr);
47 extern int do_verify(JCR *jcr);
49 /* Imported variables */
/* Time value the MaxWaitTime/MaxRunTime checks compare against jcr->start_time. */
50 extern time_t watchdog_time;
/*
 * Set up the Director's job machinery: initialize the job queue with
 * job_thread as its worker routine, then create and register a watchdog
 * whose callback is job_monitor_watchdog.
 *
 *  max_workers   passed straight through to jobq_init()
 *
 * Both failure cases are reported with M_ABORT.
 */
54 void init_job_server(int max_workers)
59 if ((stat = jobq_init(&job_queue, max_workers, job_thread)) != 0) {
/* the non-zero status is rendered with strerror() */
60 Emsg1(M_ABORT, 0, _("Could not init job queue: ERR=%s\n"), strerror(stat));
62 if ((wd = watchdog_new()) == NULL) {
63 Emsg0(M_ABORT, 0, _("Could not init job monitor watchdogs\n"));
65 wd->callback = job_monitor_watchdog;
66 wd->destructor = job_monitor_destructor;
/*
 * The watchdog's data slot carries a JT_SYSTEM control JCR named
 * "*JobMonitor*"; job_monitor_destructor() frees it again.
 */
69 wd->data = create_control_jcr("*JobMonitor*", JT_SYSTEM);
70 register_watchdog(wd);
/*
 * Watchdog destructor: release the control JCR stored in self->data
 * (the one init_job_server() created with create_control_jcr()).
 */
75 static void job_monitor_destructor(watchdog_t *self)
77 JCR *control_jcr = (JCR *) self->data;
79 free_jcr(control_jcr);
/*
 * Watchdog callback: walk every JCR returned by get_next_jcr() and apply
 * the MaxWaitTime and MaxRunTime checks to it.
 *
 *  self   the watchdog; self->data holds the job monitor's control JCR
 *
 * NOTE(review): the statements that act on `cancel` (the test and the
 * actual cancel call) are not visible in this listing; only the debug
 * messages around them and the UAContext setup are.
 */
82 static void job_monitor_watchdog(watchdog_t *self)
84 JCR *control_jcr, *jcr;
86 control_jcr = (JCR *) self->data;
88 Dmsg1(200, "job_monitor_watchdog %p called\n", self);
92 for (jcr = NULL; (jcr = get_next_jcr(jcr)); /* nothing */) {
/* JCRs with JobId 0 are skipped */
95 if (jcr->JobId == 0) {
96 Dmsg2(200, "Skipping JCR %p (%s) with JobId 0\n",
98 /* Keep reference counts correct */
103 /* check MaxWaitTime */
104 cancel = job_check_maxwaittime(control_jcr, jcr);
106 /* check MaxRunTime */
/* |= rather than || so that the MaxRunTime check always runs */
107 cancel |= job_check_maxruntime(control_jcr, jcr);
110 Dmsg3(200, "Cancelling JCR %p jobid %d (%s)\n",
111 jcr, jcr->JobId, jcr->Job);
/*
 * The UA context is created for the job but then pointed at the control
 * JCR, so ua->jcr is the monitor's JCR rather than the job's own.
 */
113 UAContext *ua = new_ua_context(jcr);
114 ua->jcr = control_jcr;
118 Dmsg1(200, "Have cancelled JCR %p\n", jcr);
121 /* Keep reference counts correct */
122 free_locked_jcr(jcr);
128 * Check if the maxwaittime has expired and it is possible
/*
 * Decide whether a job has exceeded its MaxWaitTime and may be canceled.
 *
 *  control_jcr   the job monitor's control JCR
 *  jcr           the job being examined
 *
 * A MaxWaitTime of 0 disables the check. Otherwise the elapsed time is
 * watchdog_time - jcr->start_time; while that is below the limit only a
 * debug message is emitted. Once the limit is exceeded the outcome depends
 * on jcr->JobStatus and, for a running job, on jcr->SDJobStatus.
 *
 * The result is the `cancel` flag reported by the final debug message.
 * NOTE(review): the return statements and some case labels are not visible
 * in this listing.
 */
131 static bool job_check_maxwaittime(JCR *control_jcr, JCR *jcr)
135 if (jcr->job->MaxWaitTime == 0) {
138 if ((watchdog_time - jcr->start_time) < jcr->job->MaxWaitTime) {
/* NOTE(review): MaxWaitTime is printed with %d -- confirm it is int-sized */
139 Dmsg3(200, "Job %p (%s) with MaxWaitTime %d not expired\n",
140 jcr, jcr->Job, jcr->job->MaxWaitTime);
143 Dmsg3(200, "Job %d (%s): MaxWaitTime of %d seconds exceeded, "
145 jcr->JobId, jcr->Job, jcr->job->MaxWaitTime);
146 switch (jcr->JobStatus) {
/* waiting states: reported as "blocked in #1" */
151 case JS_WaitStoreRes:
152 case JS_WaitClientRes:
154 case JS_WaitPriority:
156 case JS_WaitStartTime:
158 Dmsg0(200, "JCR blocked in #1\n");
/* a running job is judged by the Storage daemon's status instead */
161 Dmsg0(200, "JCR running, checking SD status\n");
162 switch (jcr->SDJobStatus) {
167 Dmsg0(200, "JCR blocked in #2\n");
170 Dmsg0(200, "JCR not blocked in #2\n");
/* jobs that have already ended */
175 case JS_ErrorTerminated:
178 Dmsg0(200, "JCR already dead in #3\n");
181 Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"),
/*
 * The trailing %s takes the job name string jcr->Job; jcr->job is the job
 * resource pointer (dereferenced above for MaxWaitTime) and must not be
 * passed to %s.
 */
184 Dmsg3(200, "MaxWaitTime result: %scancel JCR %p (%s)\n",
185 cancel ? "" : "do not ", jcr, jcr->Job);
191 * Check if maxruntime has expired and if the job can be
/*
 * Decide whether a job has exceeded its MaxRunTime and may be canceled.
 *
 *  control_jcr   the job monitor's control JCR
 *  jcr           the job being examined
 *
 * A MaxRunTime of 0 disables the check. Otherwise the elapsed time is
 * watchdog_time - jcr->start_time; while that is below the limit only a
 * debug message is emitted. Once the limit is exceeded the outcome depends
 * on jcr->JobStatus.
 *
 * The result is the `cancel` flag reported by the final debug message.
 * NOTE(review): the return statements, some case labels and the
 * assignments to `cancel` are not visible in this listing.
 */
194 static bool job_check_maxruntime(JCR *control_jcr, JCR *jcr)
198 if (jcr->job->MaxRunTime == 0) {
201 if ((watchdog_time - jcr->start_time) < jcr->job->MaxRunTime) {
/* NOTE(review): MaxRunTime is printed with %d -- confirm it is int-sized */
202 Dmsg3(200, "Job %p (%s) with MaxRunTime %d not expired\n",
203 jcr, jcr->Job, jcr->job->MaxRunTime);
207 switch (jcr->JobStatus) {
213 case JS_WaitStoreRes:
214 case JS_WaitClientRes:
216 case JS_WaitPriority:
218 case JS_WaitStartTime:
223 case JS_ErrorTerminated:
229 Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"),
/*
 * The trailing %s takes the job name string jcr->Job; jcr->job is the job
 * resource pointer (dereferenced above for MaxRunTime) and must not be
 * passed to %s.
 */
233 Dmsg3(200, "MaxRunTime result: %scancel JCR %p (%s)\n",
234 cancel ? "" : "do not ", jcr, jcr->Job);
240 * Run a job -- typically called by the scheduler, but may also
241 * be called by the UA (Console program).
/*
 * Prepare a job and hand it to the job queue.
 *
 *  jcr   the job's control record; jcr->job, jcr->messages and
 *        jcr->catalog are read here
 *
 * Steps visible below: initialize messaging, create the unique job name,
 * fill in the catalog job record (jcr->jr), initialize the termination
 * condition variable, open the catalog database, create the job record,
 * and add the JCR to job_queue.
 *
 * NOTE(review): the error exits are not visible in this listing; the last
 * statement marks the job JS_ErrorTerminated, presumably as the common
 * failure path -- confirm.
 */
244 void run_job(JCR *jcr)
249 sm_check(__FILE__, __LINE__, True);
250 init_msg(jcr, jcr->messages);
/* sets jcr->Job and jcr->start_time */
251 create_unique_job_name(jcr, jcr->job->hdr.name);
252 set_jcr_job_status(jcr, JS_Created);
/* copy the scheduling and identity fields into the catalog job record */
253 jcr->jr.SchedTime = jcr->sched_time;
254 jcr->jr.StartTime = jcr->start_time;
255 jcr->jr.EndTime = 0; /* perhaps rescheduled, clear it */
256 jcr->jr.Type = jcr->JobType;
257 jcr->jr.Level = jcr->JobLevel;
258 jcr->jr.JobStatus = jcr->JobStatus;
259 bstrncpy(jcr->jr.Name, jcr->job->hdr.name, sizeof(jcr->jr.Name));
260 bstrncpy(jcr->jr.Job, jcr->Job, sizeof(jcr->jr.Job));
262 /* Initialize termination condition variable */
263 if ((errstat = pthread_cond_init(&jcr->term_wait, NULL)) != 0) {
264 Jmsg1(jcr, M_FATAL, 0, _("Unable to init job cond variable: ERR=%s\n"), strerror(errstat));
/* connection parameters all come from the job's catalog resource */
271 Dmsg0(50, "Open database\n");
272 jcr->db=db_init_database(jcr, jcr->catalog->db_name, jcr->catalog->db_user,
273 jcr->catalog->db_password, jcr->catalog->db_address,
274 jcr->catalog->db_port, jcr->catalog->db_socket);
/* fails either when no handle was returned or when the open itself fails */
275 if (!jcr->db || !db_open_database(jcr, jcr->db)) {
276 Jmsg(jcr, M_FATAL, 0, _("Could not open database \"%s\".\n"),
277 jcr->catalog->db_name);
/*
 * NOTE(review): this branch is also entered when jcr->db is NULL; confirm
 * that the db_strerror() call is guarded (the line before it is not
 * visible in this listing).
 */
279 Jmsg(jcr, M_FATAL, 0, "%s", db_strerror(jcr->db));
283 Dmsg0(50, "DB opened\n");
/* refresh the status in the record before it is written */
288 jcr->jr.JobStatus = jcr->JobStatus;
289 if (!db_create_job_record(jcr, jcr->db, &jcr->jr)) {
290 Jmsg(jcr, M_FATAL, 0, "%s", db_strerror(jcr->db));
/* take the JobId from the job record just created */
293 jcr->JobId = jcr->jr.JobId;
294 ASSERT(jcr->jr.JobId > 0);
296 Dmsg4(50, "Created job record JobId=%d Name=%s Type=%c Level=%c\n",
297 jcr->JobId, jcr->Job, jcr->jr.Type, jcr->jr.Level);
298 Dmsg0(200, "Add jrc to work queue\n");
300 /* Queue the job to be run */
301 if ((stat = jobq_add(&job_queue, jcr)) != 0) {
302 Emsg1(M_ABORT, 0, _("Could not add job queue: ERR=%s\n"), strerror(stat));
304 Dmsg0(100, "Done run_job()\n");
310 set_jcr_job_status(jcr, JS_ErrorTerminated);
317 * Cancel a job -- typically called by the UA (Console program), but may also
318 * be called by the job watchdog.
320 * Returns: 1 if cancel appears to be successful
321 * 0 on failure. Message sent to ua->jcr.
/*
 * Cancel the job described by jcr on behalf of ua.
 *
 *  ua    context that receives the progress messages; ua->jcr is also
 *        the JCR used for the daemon connections made here
 *  jcr   the job to cancel
 *
 * For the waiting states listed in the first case group the job is marked
 * JS_Canceled, the user is told, and removal from job_queue is attempted.
 * In the remaining visible path the job is marked JS_Canceled and, for each
 * of the File and Storage daemons the job has a socket to, a new connection
 * is opened on ua->jcr, a "cancel Job=" command is sent, and the replies
 * are relayed to ua.
 *
 * NOTE(review): the return statements and some case labels are not visible
 * in this listing.
 */
323 int cancel_job(UAContext *ua, JCR *jcr)
327 switch (jcr->JobStatus) {
330 case JS_WaitClientRes:
331 case JS_WaitStoreRes:
332 case JS_WaitPriority:
334 case JS_WaitStartTime:
335 set_jcr_job_status(jcr, JS_Canceled);
336 bsendmsg(ua, _("JobId %d, Job %s marked to be canceled.\n"),
337 jcr->JobId, jcr->Job);
338 jobq_remove(&job_queue, jcr); /* attempt to remove it from queue */
342 set_jcr_job_status(jcr, JS_Canceled);
344 /* Cancel File daemon */
345 if (jcr->file_bsock) {
/* the connection is made on ua->jcr, so it borrows the job's client */
346 ua->jcr->client = jcr->client;
347 if (!connect_to_file_daemon(ua->jcr, 10, FDConnectTimeout, 1)) {
348 bsendmsg(ua, _("Failed to connect to File daemon.\n"));
351 Dmsg0(200, "Connected to file daemon\n");
352 fd = ua->jcr->file_bsock;
353 bnet_fsend(fd, "cancel Job=%s\n", jcr->Job);
/* relay each reply to the user until bnet_recv() returns a negative value */
354 while (bnet_recv(fd) >= 0) {
355 bsendmsg(ua, "%s", fd->msg);
357 bnet_sig(fd, BNET_TERMINATE);
/* clear the socket pointer on ua->jcr */
359 ua->jcr->file_bsock = NULL;
362 /* Cancel Storage daemon */
363 if (jcr->store_bsock) {
/* same scheme as for the File daemon, using the job's storage resource */
364 ua->jcr->store = jcr->store;
365 if (!connect_to_storage_daemon(ua->jcr, 10, SDConnectTimeout, 1)) {
366 bsendmsg(ua, _("Failed to connect to Storage daemon.\n"));
369 Dmsg0(200, "Connected to storage daemon\n");
370 sd = ua->jcr->store_bsock;
371 bnet_fsend(sd, "cancel Job=%s\n", jcr->Job);
372 while (bnet_recv(sd) >= 0) {
373 bsendmsg(ua, "%s", sd->msg);
375 bnet_sig(sd, BNET_TERMINATE);
377 ua->jcr->store_bsock = NULL;
385 * This is the engine called by jobq.c:jobq_add() when we were pulled
386 * from the work queue.
387 * At this point, we are running in our own thread and all
388 * necessary resources are allocated -- see jobq.c
/*
 * Worker routine for one job pulled from the job queue (registered with
 * jobq_init() in init_job_server()).
 *
 *  arg   the job's JCR
 *
 * Visible flow: detach the thread, record the real start time, mark the
 * job JS_Running, handle a job that is already canceled or past its
 * MaxStartDelay, run the optional RunBeforeJob command, switch on
 * jcr->JobType, then run the optional RunAfterJob / RunAfterFailedJob
 * command.
 *
 * NOTE(review): the calls made inside the JobType switch and the return
 * statement are not visible in this listing.
 */
390 static void *job_thread(void *arg)
392 JCR *jcr = (JCR *)arg;
394 pthread_detach(pthread_self());
395 sm_check(__FILE__, __LINE__, True);
399 Dmsg0(200, "=====Start Job=========\n");
400 jcr->start_time = time(NULL); /* set the real start time */
401 set_jcr_job_status(jcr, JS_Running);
/* an already-canceled job only gets its end record written */
403 if (job_canceled(jcr)) {
404 update_job_end_record(jcr);
/* MaxStartDelay of 0 disables the check; otherwise compare it with the scheduled-to-actual start gap */
405 } else if (jcr->job->MaxStartDelay != 0 && jcr->job->MaxStartDelay <
406 (utime_t)(jcr->start_time - jcr->sched_time)) {
407 Jmsg(jcr, M_FATAL, 0, _("Job canceled because max start delay time exceeded.\n"));
408 set_jcr_job_status(jcr, JS_Canceled);
409 update_job_end_record(jcr);
/* optional RunBeforeJob command: expand job codes, run it, log its output */
413 if (jcr->job->RunBeforeJob) {
414 POOLMEM *before = get_pool_memory(PM_FNAME);
417 char line[MAXSTRING];
419 before = edit_job_codes(jcr, before, jcr->job->RunBeforeJob, "");
/*
 * NOTE(review): bpipe is dereferenced in the fgets() loop below without a
 * NULL check -- confirm open_bpipe() cannot return NULL here.
 */
420 bpipe = open_bpipe(before, 0, "r");
421 free_pool_memory(before);
422 while (fgets(line, sizeof(line), bpipe->rfd)) {
423 Jmsg(jcr, M_INFO, 0, _("RunBefore: %s"), line);
425 status = close_bpipe(bpipe);
/* a failing RunBeforeJob is fatal for the job */
427 Jmsg(jcr, M_FATAL, 0, _("RunBeforeJob returned non-zero status=%d\n"),
429 set_jcr_job_status(jcr, JS_FatalError);
430 update_job_end_record(jcr);
434 switch (jcr->JobType) {
/* each visible case tests for JS_Terminated */
437 if (jcr->JobStatus == JS_Terminated) {
443 if (jcr->JobStatus == JS_Terminated) {
449 if (jcr->JobStatus == JS_Terminated) {
455 if (jcr->JobStatus == JS_Terminated) {
460 Pmsg1(0, "Unimplemented job type: %d\n", jcr->JobType);
/* RunAfterJob applies to a JS_Terminated job, RunAfterFailedJob to any other status */
463 if ((jcr->job->RunAfterJob && jcr->JobStatus == JS_Terminated) ||
464 (jcr->job->RunAfterFailedJob && jcr->JobStatus != JS_Terminated)) {
465 POOLMEM *after = get_pool_memory(PM_FNAME);
468 char line[MAXSTRING];
470 if (jcr->JobStatus == JS_Terminated) {
471 after = edit_job_codes(jcr, after, jcr->job->RunAfterJob, "");
473 after = edit_job_codes(jcr, after, jcr->job->RunAfterFailedJob, "");
/* NOTE(review): same unchecked open_bpipe() result as in the RunBeforeJob path */
475 bpipe = open_bpipe(after, 0, "r");
476 free_pool_memory(after);
477 while (fgets(line, sizeof(line), bpipe->rfd)) {
478 Jmsg(jcr, M_INFO, 0, _("RunAfter: %s"), line);
480 status = close_bpipe(bpipe);
482 * Note, if we get an error here, do not mark the
483 * job in error, simply report the error condition.
/* the message severity differs: M_ERROR for RunAfterJob, M_FATAL for RunAfterFailedJob */
486 if (jcr->JobStatus == JS_Terminated) {
487 Jmsg(jcr, M_ERROR, 0, _("RunAfterJob returned non-zero status=%d\n"),
490 Jmsg(jcr, M_FATAL, 0, _("RunAfterFailedJob returned non-zero status=%d\n"),
500 Dmsg0(50, "======== End Job ==========\n");
501 sm_check(__FILE__, __LINE__, True);
507 * Get or create a Client record for this Job
/*
 * Build a client record from jcr->client, pass it to
 * db_create_client_record(), and cache the results in the JCR.
 *
 *  jcr   the job; jcr->client and jcr->db are read here
 *
 * On success jcr->jr.ClientId, jcr->client_name and jcr->client_uname are
 * filled in. A failure is reported as M_FATAL with the database error text.
 *
 * NOTE(review): the return statements are not visible in this listing.
 */
509 int get_or_create_client_record(JCR *jcr)
513 memset(&cr, 0, sizeof(cr));
/* seed the record from the client resource */
514 bstrncpy(cr.Name, jcr->client->hdr.name, sizeof(cr.Name));
515 cr.AutoPrune = jcr->client->AutoPrune;
516 cr.FileRetention = jcr->client->FileRetention;
517 cr.JobRetention = jcr->client->JobRetention;
/* allocate the pool buffer only if the JCR does not have one yet */
518 if (!jcr->client_name) {
519 jcr->client_name = get_pool_memory(PM_NAME);
521 pm_strcpy(&jcr->client_name, jcr->client->hdr.name);
522 if (!db_create_client_record(jcr, jcr->db, &cr)) {
523 Jmsg(jcr, M_FATAL, 0, _("Could not create Client record. ERR=%s\n"),
524 db_strerror(jcr->db));
527 jcr->jr.ClientId = cr.ClientId;
/* cr.Uname is copied from the record after the database call */
529 if (!jcr->client_uname) {
530 jcr->client_uname = get_pool_memory(PM_NAME);
532 pm_strcpy(&jcr->client_uname, cr.Uname);
534 Dmsg2(100, "Created Client %s record %d\n", jcr->client->hdr.name,
541 * Write status and such in DB
/*
 * Copy the job's final state from the JCR into its catalog job record
 * (jcr->jr) and write it with db_update_job_end_record().
 *
 *  jcr   the job whose record is updated
 *
 * A failed update is reported as M_WARNING with the database error text.
 */
543 void update_job_end_record(JCR *jcr)
/* stamp the end time only if none has been recorded yet */
545 if (jcr->jr.EndTime == 0) {
546 jcr->jr.EndTime = time(NULL);
548 jcr->end_time = jcr->jr.EndTime;
549 jcr->jr.JobId = jcr->JobId;
550 jcr->jr.JobStatus = jcr->JobStatus;
551 jcr->jr.JobFiles = jcr->JobFiles;
552 jcr->jr.JobBytes = jcr->JobBytes;
553 jcr->jr.VolSessionId = jcr->VolSessionId;
554 jcr->jr.VolSessionTime = jcr->VolSessionTime;
555 if (!db_update_job_end_record(jcr, jcr->db, &jcr->jr)) {
556 Jmsg(jcr, M_WARNING, 0, _("Error updating job record. %s"),
557 db_strerror(jcr->db));
562 * Takes base_name and appends (unique) current
563 * date and time to form unique job name.
565 * Returns: unique job name in jcr->Job
566 * date/time in jcr->start_time
/*
 * Build jcr->Job as "<base_name>.<date_time>" and store the chosen time in
 * jcr->start_time.
 *
 *  jcr         receives jcr->Job and jcr->start_time
 *  base_name   job name to start from; it is truncated if too long
 *
 * The candidate time is compared with the one issued by the previous call
 * while a static mutex is held.
 *
 * NOTE(review): the statements that assign `now` and the body of the final
 * loop are not visible in this listing.
 */
568 void create_unique_job_name(JCR *jcr, char *base_name)
570 /* Job start mutex */
571 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
572 static time_t last_start_time = 0;
575 char dt[MAX_TIME_LENGTH];
576 char name[MAX_NAME_LENGTH];
579 /* Guarantee unique start time -- maximum one per second, and
580 * thus unique Job Name
582 P(mutex); /* lock creation of jobs */
/*
 * Loop while the candidate time equals the last one issued.
 * bmicrosleep(0, 500000) is presumably (seconds, microseconds), i.e. half
 * a second per iteration -- confirm.
 */
584 while (now == last_start_time) {
585 bmicrosleep(0, 500000);
588 last_start_time = now;
589 V(mutex); /* allow creation of jobs */
590 jcr->start_time = now;
591 /* Form Unique JobName */
592 localtime_r(&now, &tm);
593 /* Use only characters that are permitted in Windows filenames */
594 strftime(dt, sizeof(dt), "%Y-%m-%d_%H.%M.%S", &tm);
595 bstrncpy(name, base_name, sizeof(name));
/* cut the base name 22 bytes short of the buffer; the suffix added below is '.' plus a 19-character timestamp */
596 name[sizeof(name)-22] = 0; /* truncate if too long */
597 bsnprintf(jcr->Job, sizeof(jcr->Job), "%s.%s", name, dt); /* add date & time */
598 /* Convert spaces into underscores */
599 for (p=jcr->Job; *p; p++) {
607 * Free the Job Control Record if no one is still using it.
608 * Called from main free_jcr() routine in src/lib/jcr.c so
609 * that we can do our Director specific cleanup of the jcr.
/*
 * Director-specific cleanup of a JCR.
 *
 *  jcr   the record being torn down
 *
 * Releases the SD authorization key, closes the File and Storage daemon
 * sockets, frees the fname and stime pool buffers, the restore bootstrap
 * string and the client uname buffer, and destroys the termination
 * condition variable. Each pointer visibly cleared below is set to NULL
 * after release.
 *
 * NOTE(review): the guards around the fname and stime frees, and any other
 * cleanup steps, are not visible in this listing.
 */
611 void dird_free_jcr(JCR *jcr)
613 Dmsg0(200, "Start dird free_jcr\n");
/* released with free(), not the pool allocator */
615 if (jcr->sd_auth_key) {
616 free(jcr->sd_auth_key);
617 jcr->sd_auth_key = NULL;
623 if (jcr->file_bsock) {
624 Dmsg0(200, "Close File bsock\n");
625 bnet_close(jcr->file_bsock);
626 jcr->file_bsock = NULL;
628 if (jcr->store_bsock) {
629 Dmsg0(200, "Close Store bsock\n");
630 bnet_close(jcr->store_bsock);
631 jcr->store_bsock = NULL;
634 Dmsg0(200, "Free JCR fname\n");
635 free_pool_memory(jcr->fname);
639 Dmsg0(200, "Free JCR stime\n");
640 free_pool_memory(jcr->stime);
/* released with free(), not the pool allocator */
643 if (jcr->RestoreBootstrap) {
644 free(jcr->RestoreBootstrap);
645 jcr->RestoreBootstrap = NULL;
647 if (jcr->client_uname) {
648 free_pool_memory(jcr->client_uname);
649 jcr->client_uname = NULL;
/* counterpart of the pthread_cond_init() call in run_job() */
651 pthread_cond_destroy(&jcr->term_wait);
652 Dmsg0(200, "End dird free_jcr\n");
656 * Set some defaults in the JCR necessary to
657 * run. These items are pulled from the job
658 * definition as defaults, but can be overridden
659 * later either by the Run record in the Schedule resource,
660 * or by the Console program.
662 void set_jcr_defaults(JCR *jcr, JOB *job)
665 jcr->JobType = job->JobType;
666 switch (jcr->JobType) {
669 jcr->JobLevel = L_NONE;
672 jcr->JobLevel = job->level;
675 jcr->JobPriority = job->Priority;
676 jcr->store = job->storage;
677 jcr->client = job->client;
678 if (!jcr->client_name) {
679 jcr->client_name = get_pool_memory(PM_NAME);
681 pm_strcpy(&jcr->client_name, jcr->client->hdr.name);
682 jcr->pool = job->pool;
683 jcr->catalog = job->client->catalog;
684 jcr->fileset = job->fileset;
685 jcr->messages = job->messages;
686 if (jcr->RestoreBootstrap) {
687 free(jcr->RestoreBootstrap);
688 jcr->RestoreBootstrap = NULL;
690 /* This can be overridden by Console program */
691 if (job->RestoreBootstrap) {
692 jcr->RestoreBootstrap = bstrdup(job->RestoreBootstrap);
694 /* If no default level given, set one */
695 if (jcr->JobLevel == 0) {
696 switch (jcr->JobType) {
698 jcr->JobLevel = L_VERIFY_CATALOG;
701 jcr->JobLevel = L_INCREMENTAL;
705 jcr->JobLevel = L_NONE;