X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=bacula%2Fsrc%2Fdird%2Fjob.c;h=f26f2b369ca3a7f6ab633bfb557467c9b2a6b152;hb=7ebf8f564b27ca8448a9a7365ba73b130ae69c21;hp=db7064e71619935cb33075ed8c6ab4670bd16b66;hpb=58fc064abf7cbcf9cae52e42ef28f72a90284efb;p=bacula%2Fbacula diff --git a/bacula/src/dird/job.c b/bacula/src/dird/job.c index db7064e716..f26f2b369c 100644 --- a/bacula/src/dird/job.c +++ b/bacula/src/dird/job.c @@ -7,7 +7,7 @@ * Version $Id$ */ /* - Copyright (C) 2000-2003 Kern Sibbald and John Walker + Copyright (C) 2000-2004 Kern Sibbald and John Walker This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as @@ -31,10 +31,13 @@ /* Forward referenced subroutines */ static void *job_thread(void *arg); +static void job_monitor_watchdog(watchdog_t *self); +static void job_monitor_destructor(watchdog_t *self); +static bool job_check_maxwaittime(JCR *control_jcr, JCR *jcr); +static bool job_check_maxruntime(JCR *control_jcr, JCR *jcr); /* Exported subroutines */ - /* Imported subroutines */ extern void term_scheduler(); extern void term_ua_server(); @@ -43,15 +46,28 @@ extern int do_admin(JCR *jcr); extern int do_restore(JCR *jcr); extern int do_verify(JCR *jcr); +/* Imported variables */ +extern time_t watchdog_time; + jobq_t job_queue; void init_job_server(int max_workers) { int stat; + watchdog_t *wd; + if ((stat = jobq_init(&job_queue, max_workers, job_thread)) != 0) { Emsg1(M_ABORT, 0, _("Could not init job queue: ERR=%s\n"), strerror(stat)); } - return; + if ((wd = new_watchdog()) == NULL) { + Emsg0(M_ABORT, 0, _("Could not init job monitor watchdogs\n")); + } + wd->callback = job_monitor_watchdog; + wd->destructor = job_monitor_destructor; + wd->one_shot = false; + wd->interval = 60; + wd->data = new_control_jcr("*JobMonitor*", JT_SYSTEM); + register_watchdog(wd); } /* @@ -63,7 +79,8 @@ void run_job(JCR *jcr) { int stat, errstat; - sm_check(__FILE__, __LINE__, True); + P(jcr->mutex); + sm_check(__FILE__, __LINE__, true); init_msg(jcr, jcr->messages); create_unique_job_name(jcr, jcr->job->hdr.name); set_jcr_job_status(jcr, JS_Created); @@ -79,9 +96,7 @@ void run_job(JCR *jcr) /* Initialize termination condition variable */ if ((errstat = pthread_cond_init(&jcr->term_wait, NULL)) != 0) { Jmsg1(jcr, M_FATAL, 0, _("Unable to init job cond variable: ERR=%s\n"), strerror(errstat)); - set_jcr_job_status(jcr, JS_ErrorTerminated); - free_jcr(jcr); - return; + goto bail_out; } /* @@ -97,9 +112,7 @@ void run_job(JCR *jcr) if (jcr->db) { Jmsg(jcr, M_FATAL, 0, "%s", db_strerror(jcr->db)); } - set_jcr_job_status(jcr, JS_ErrorTerminated); - free_jcr(jcr); - return; + goto bail_out; } Dmsg0(50, "DB opened\n"); @@ -109,12 +122,9 @@ void run_job(JCR *jcr) jcr->jr.JobStatus = jcr->JobStatus; if (!db_create_job_record(jcr, jcr->db, &jcr->jr)) { Jmsg(jcr, M_FATAL, 0, "%s", db_strerror(jcr->db)); - set_jcr_job_status(jcr, JS_ErrorTerminated); - free_jcr(jcr); - return; + goto bail_out; } jcr->JobId = jcr->jr.JobId; - ASSERT(jcr->jr.JobId > 0); Dmsg4(50, "Created job record JobId=%d Name=%s Type=%c Level=%c\n", jcr->JobId, jcr->Job, jcr->jr.Type, jcr->jr.Level); @@ -122,13 +132,24 @@ void run_job(JCR *jcr) /* Queue the job to be run */ if ((stat = jobq_add(&job_queue, jcr)) != 0) { - Emsg1(M_ABORT, 0, _("Could not add job queue: ERR=%s\n"), strerror(stat)); + Jmsg(jcr, M_FATAL, 0, _("Could not add job queue: ERR=%s\n"), strerror(stat)); + goto bail_out; } Dmsg0(100, "Done run_job()\n"); + + V(jcr->mutex); + return; + +bail_out: + set_jcr_job_status(jcr, JS_ErrorTerminated); + V(jcr->mutex); + return; + } + /* - * This is the engine called by job_add() when we were pulled + * This is the engine called by jobq.c:jobq_add() when we were pulled * from the work queue. * At this point, we are running in our own thread and all * necessary resources are allocated -- see jobq.c @@ -137,8 +158,9 @@ static void *job_thread(void *arg) { JCR *jcr = (JCR *)arg; - pthread_detach(pthread_self()); - sm_check(__FILE__, __LINE__, True); + jcr->my_thread_id = pthread_self(); + pthread_detach(jcr->my_thread_id); + sm_check(__FILE__, __LINE__, true); for ( ;; ) { @@ -224,29 +246,269 @@ static void *job_thread(void *arg) Jmsg(jcr, M_INFO, 0, _("RunAfter: %s"), line); } status = close_bpipe(bpipe); + /* + * Note, if we get an error here, do not mark the + * job in error, simply report the error condition. + */ if (status != 0) { if (jcr->JobStatus == JS_Terminated) { - Jmsg(jcr, M_FATAL, 0, _("RunAfterJob returned non-zero status=%d\n"), + Jmsg(jcr, M_WARNING, 0, _("RunAfterJob returned non-zero status=%d\n"), status); } else { Jmsg(jcr, M_FATAL, 0, _("RunAfterFailedJob returned non-zero status=%d\n"), status); } - set_jcr_job_status(jcr, JS_FatalError); - update_job_end_record(jcr); } } + /* Send off any queued messages */ + if (jcr->msg_queue->size() > 0) { + dequeue_messages(jcr); + } } bail_out: break; } Dmsg0(50, "======== End Job ==========\n"); - sm_check(__FILE__, __LINE__, True); + sm_check(__FILE__, __LINE__, true); return NULL; } +/* + * Cancel a job -- typically called by the UA (Console program), but may also + * be called by the job watchdog. + * + * Returns: 1 if cancel appears to be successful + * 0 on failure. Message sent to ua->jcr. + */ +int cancel_job(UAContext *ua, JCR *jcr) +{ + BSOCK *sd, *fd; + + switch (jcr->JobStatus) { + case JS_Created: + case JS_WaitJobRes: + case JS_WaitClientRes: + case JS_WaitStoreRes: + case JS_WaitPriority: + case JS_WaitMaxJobs: + case JS_WaitStartTime: + set_jcr_job_status(jcr, JS_Canceled); + bsendmsg(ua, _("JobId %d, Job %s marked to be canceled.\n"), + jcr->JobId, jcr->Job); + jobq_remove(&job_queue, jcr); /* attempt to remove it from queue */ + return 1; + + default: + set_jcr_job_status(jcr, JS_Canceled); + + /* Cancel File daemon */ + if (jcr->file_bsock) { + ua->jcr->client = jcr->client; + if (!connect_to_file_daemon(ua->jcr, 10, FDConnectTimeout, 1)) { + bsendmsg(ua, _("Failed to connect to File daemon.\n")); + return 0; + } + Dmsg0(200, "Connected to file daemon\n"); + fd = ua->jcr->file_bsock; + bnet_fsend(fd, "cancel Job=%s\n", jcr->Job); + while (bnet_recv(fd) >= 0) { + bsendmsg(ua, "%s", fd->msg); + } + bnet_sig(fd, BNET_TERMINATE); + bnet_close(fd); + ua->jcr->file_bsock = NULL; + } + + /* Cancel Storage daemon */ + if (jcr->store_bsock) { + ua->jcr->store = jcr->store; + if (!connect_to_storage_daemon(ua->jcr, 10, SDConnectTimeout, 1)) { + bsendmsg(ua, _("Failed to connect to Storage daemon.\n")); + return 0; + } + Dmsg0(200, "Connected to storage daemon\n"); + sd = ua->jcr->store_bsock; + bnet_fsend(sd, "cancel Job=%s\n", jcr->Job); + while (bnet_recv(sd) >= 0) { + bsendmsg(ua, "%s", sd->msg); + } + bnet_sig(sd, BNET_TERMINATE); + bnet_close(sd); + ua->jcr->store_bsock = NULL; + } + } + + return 1; +} + + +static void job_monitor_destructor(watchdog_t *self) +{ + JCR *control_jcr = (JCR *) self->data; + + free_jcr(control_jcr); +} + +static void job_monitor_watchdog(watchdog_t *self) +{ + JCR *control_jcr, *jcr; + + control_jcr = (JCR *)self->data; + + Dmsg1(400, "job_monitor_watchdog %p called\n", self); + + lock_jcr_chain(); + + foreach_jcr(jcr) { + bool cancel; + + if (jcr->JobId == 0) { + Dmsg2(400, "Skipping JCR %p (%s) with JobId 0\n", + jcr, jcr->Job); + /* Keep reference counts correct */ + free_locked_jcr(jcr); + continue; + } + + /* check MaxWaitTime */ + cancel = job_check_maxwaittime(control_jcr, jcr); + + /* check MaxRunTime */ + cancel |= job_check_maxruntime(control_jcr, jcr); + + if (cancel) { + Dmsg3(200, "Cancelling JCR %p jobid %d (%s)\n", + jcr, jcr->JobId, jcr->Job); + + UAContext *ua = new_ua_context(jcr); + ua->jcr = control_jcr; + cancel_job(ua, jcr); + free_ua_context(ua); + + Dmsg1(200, "Have cancelled JCR %p\n", jcr); + } + + /* Keep reference counts correct */ + free_locked_jcr(jcr); + } + unlock_jcr_chain(); +} + +/* + * Check if the maxwaittime has expired and it is possible + * to cancel the job. + */ +static bool job_check_maxwaittime(JCR *control_jcr, JCR *jcr) +{ + bool cancel = false; + + if (jcr->job->MaxWaitTime == 0) { + return false; + } + if ((watchdog_time - jcr->start_time) < jcr->job->MaxWaitTime) { + Dmsg3(200, "Job %p (%s) with MaxWaitTime %d not expired\n", + jcr, jcr->Job, jcr->job->MaxWaitTime); + return false; + } + Dmsg3(200, "Job %d (%s): MaxWaitTime of %d seconds exceeded, " + "checking status\n", + jcr->JobId, jcr->Job, jcr->job->MaxWaitTime); + switch (jcr->JobStatus) { + case JS_Created: + case JS_Blocked: + case JS_WaitFD: + case JS_WaitSD: + case JS_WaitStoreRes: + case JS_WaitClientRes: + case JS_WaitJobRes: + case JS_WaitPriority: + case JS_WaitMaxJobs: + case JS_WaitStartTime: + cancel = true; + Dmsg0(200, "JCR blocked in #1\n"); + break; + case JS_Running: + Dmsg0(200, "JCR running, checking SD status\n"); + switch (jcr->SDJobStatus) { + case JS_WaitMount: + case JS_WaitMedia: + case JS_WaitFD: + cancel = true; + Dmsg0(200, "JCR blocked in #2\n"); + break; + default: + Dmsg0(200, "JCR not blocked in #2\n"); + break; + } + break; + case JS_Terminated: + case JS_ErrorTerminated: + case JS_Canceled: + case JS_FatalError: + Dmsg0(200, "JCR already dead in #3\n"); + break; + default: + Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"), + jcr->JobStatus); + } + Dmsg3(200, "MaxWaitTime result: %scancel JCR %p (%s)\n", + cancel ? "" : "do not ", jcr, jcr->job); + + return cancel; +} + +/* + * Check if maxruntime has expired and if the job can be + * canceled. + */ +static bool job_check_maxruntime(JCR *control_jcr, JCR *jcr) +{ + bool cancel = false; + + if (jcr->job->MaxRunTime == 0) { + return false; + } + if ((watchdog_time - jcr->start_time) < jcr->job->MaxRunTime) { + Dmsg3(200, "Job %p (%s) with MaxRunTime %d not expired\n", + jcr, jcr->Job, jcr->job->MaxRunTime); + return false; + } + + switch (jcr->JobStatus) { + case JS_Created: + case JS_Running: + case JS_Blocked: + case JS_WaitFD: + case JS_WaitSD: + case JS_WaitStoreRes: + case JS_WaitClientRes: + case JS_WaitJobRes: + case JS_WaitPriority: + case JS_WaitMaxJobs: + case JS_WaitStartTime: + case JS_Differences: + cancel = true; + break; + case JS_Terminated: + case JS_ErrorTerminated: + case JS_Canceled: + case JS_FatalError: + cancel = false; + break; + default: + Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"), + jcr->JobStatus); + } + + Dmsg3(200, "MaxRunTime result: %scancel JCR %p (%s)\n", + cancel ? "" : "do not ", jcr, jcr->job); + + return cancel; +} + + /* * Get or create a Client record for this Job */ @@ -407,7 +669,15 @@ void set_jcr_defaults(JCR *jcr, JOB *job) { jcr->job = job; jcr->JobType = job->JobType; - jcr->JobLevel = job->level; + switch (jcr->JobType) { + case JT_ADMIN: + case JT_RESTORE: + jcr->JobLevel = L_NONE; + break; + default: + jcr->JobLevel = job->level; + break; + } jcr->JobPriority = job->Priority; jcr->store = job->storage; jcr->client = job->client; @@ -416,11 +686,16 @@ void set_jcr_defaults(JCR *jcr, JOB *job) } pm_strcpy(&jcr->client_name, jcr->client->hdr.name); jcr->pool = job->pool; + jcr->full_pool = job->full_pool; + jcr->inc_pool = job->inc_pool; + jcr->dif_pool = job->dif_pool; jcr->catalog = job->client->catalog; jcr->fileset = job->fileset; jcr->messages = job->messages; + jcr->spool_data = job->spool_data; if (jcr->RestoreBootstrap) { free(jcr->RestoreBootstrap); + jcr->RestoreBootstrap = NULL; } /* This can be overridden by Console program */ if (job->RestoreBootstrap) { @@ -437,7 +712,7 @@ void set_jcr_defaults(JCR *jcr, JOB *job) break; case JT_RESTORE: case JT_ADMIN: - jcr->JobLevel = L_FULL; + jcr->JobLevel = L_NONE; break; default: break;