X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;ds=sidebyside;f=bacula%2Fsrc%2Fdird%2Fjob.c;h=f26f2b369ca3a7f6ab633bfb557467c9b2a6b152;hb=7ebf8f564b27ca8448a9a7365ba73b130ae69c21;hp=2518daf0a37fee34b82a61504dead14ee265fccb;hpb=900f92991db89c97160e5bc6d812249a45b3290d;p=bacula%2Fbacula diff --git a/bacula/src/dird/job.c b/bacula/src/dird/job.c index 2518daf0a3..f26f2b369c 100644 --- a/bacula/src/dird/job.c +++ b/bacula/src/dird/job.c @@ -66,174 +66,8 @@ void init_job_server(int max_workers) wd->destructor = job_monitor_destructor; wd->one_shot = false; wd->interval = 60; - wd->data = create_control_jcr("*JobMonitor*", JT_SYSTEM); + wd->data = new_control_jcr("*JobMonitor*", JT_SYSTEM); register_watchdog(wd); - - return; -} - -static void job_monitor_destructor(watchdog_t *self) -{ - JCR *control_jcr = (JCR *) self->data; - - free_jcr(control_jcr); -} - -static void job_monitor_watchdog(watchdog_t *self) -{ - JCR *control_jcr, *jcr; - - control_jcr = (JCR *) self->data; - - Dmsg1(200, "job_monitor_watchdog %p called\n", self); - - lock_jcr_chain(); - - for (jcr = NULL; (jcr = get_next_jcr(jcr)); /* nothing */) { - bool cancel; - - if (jcr->JobId == 0) { - Dmsg2(200, "Skipping JCR %p (%s) with JobId 0\n", - jcr, jcr->Job); - /* Keep reference counts correct */ - free_locked_jcr(jcr); - continue; - } - - /* check MaxWaitTime */ - cancel = job_check_maxwaittime(control_jcr, jcr); - - /* check MaxRunTime */ - cancel |= job_check_maxruntime(control_jcr, jcr); - - if (cancel) { - Dmsg3(200, "Cancelling JCR %p jobid %d (%s)\n", - jcr, jcr->JobId, jcr->Job); - - UAContext *ua = new_ua_context(jcr); - ua->jcr = control_jcr; - cancel_job(ua, jcr); - free_ua_context(ua); - - Dmsg1(200, "Have cancelled JCR %p\n", jcr); - } - - /* Keep reference counts correct */ - free_locked_jcr(jcr); - } - unlock_jcr_chain(); -} - -/* - * Check if the maxwaittime has expired and it is possible - * to cancel the job. - */ -static bool job_check_maxwaittime(JCR *control_jcr, JCR *jcr) -{ - bool cancel = false; - - if (jcr->job->MaxWaitTime == 0) { - return false; - } - if ((watchdog_time - jcr->start_time) < jcr->job->MaxWaitTime) { - Dmsg3(200, "Job %p (%s) with MaxWaitTime %d not expired\n", - jcr, jcr->Job, jcr->job->MaxWaitTime); - return false; - } - Dmsg3(200, "Job %d (%s): MaxWaitTime of %d seconds exceeded, " - "checking status\n", - jcr->JobId, jcr->Job, jcr->job->MaxWaitTime); - switch (jcr->JobStatus) { - case JS_Created: - case JS_Blocked: - case JS_WaitFD: - case JS_WaitSD: - case JS_WaitStoreRes: - case JS_WaitClientRes: - case JS_WaitJobRes: - case JS_WaitPriority: - case JS_WaitMaxJobs: - case JS_WaitStartTime: - cancel = true; - Dmsg0(200, "JCR blocked in #1\n"); - break; - case JS_Running: - Dmsg0(200, "JCR running, checking SD status\n"); - switch (jcr->SDJobStatus) { - case JS_WaitMount: - case JS_WaitMedia: - case JS_WaitFD: - cancel = true; - Dmsg0(200, "JCR blocked in #2\n"); - break; - default: - Dmsg0(200, "JCR not blocked in #2\n"); - break; - } - break; - case JS_Terminated: - case JS_ErrorTerminated: - case JS_Canceled: - case JS_FatalError: - Dmsg0(200, "JCR already dead in #3\n"); - break; - default: - Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"), - jcr->JobStatus); - } - Dmsg3(200, "MaxWaitTime result: %scancel JCR %p (%s)\n", - cancel ? "" : "do not ", jcr, jcr->job); - - return cancel; -} - -/* - * Check if maxruntime has expired and if the job can be - * canceled. - */ -static bool job_check_maxruntime(JCR *control_jcr, JCR *jcr) -{ - bool cancel = false; - - if (jcr->job->MaxRunTime == 0) { - return false; - } - if ((watchdog_time - jcr->start_time) < jcr->job->MaxRunTime) { - Dmsg3(200, "Job %p (%s) with MaxRunTime %d not expired\n", - jcr, jcr->Job, jcr->job->MaxRunTime); - return false; - } - - switch (jcr->JobStatus) { - case JS_Created: - case JS_Running: - case JS_Blocked: - case JS_WaitFD: - case JS_WaitSD: - case JS_WaitStoreRes: - case JS_WaitClientRes: - case JS_WaitJobRes: - case JS_WaitPriority: - case JS_WaitMaxJobs: - case JS_WaitStartTime: - case JS_Differences: - cancel = true; - break; - case JS_Terminated: - case JS_ErrorTerminated: - case JS_Canceled: - case JS_FatalError: - cancel = false; - break; - default: - Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"), - jcr->JobStatus); - } - - Dmsg3(200, "MaxRunTime result: %scancel JCR %p (%s)\n", - cancel ? "" : "do not ", jcr, jcr->job); - - return cancel; } /* @@ -246,7 +80,7 @@ void run_job(JCR *jcr) int stat, errstat; P(jcr->mutex); - sm_check(__FILE__, __LINE__, True); + sm_check(__FILE__, __LINE__, true); init_msg(jcr, jcr->messages); create_unique_job_name(jcr, jcr->job->hdr.name); set_jcr_job_status(jcr, JS_Created); @@ -291,7 +125,6 @@ void run_job(JCR *jcr) goto bail_out; } jcr->JobId = jcr->jr.JobId; - ASSERT(jcr->jr.JobId > 0); Dmsg4(50, "Created job record JobId=%d Name=%s Type=%c Level=%c\n", jcr->JobId, jcr->Job, jcr->jr.Type, jcr->jr.Level); @@ -299,7 +132,8 @@ void run_job(JCR *jcr) /* Queue the job to be run */ if ((stat = jobq_add(&job_queue, jcr)) != 0) { - Emsg1(M_ABORT, 0, _("Could not add job queue: ERR=%s\n"), strerror(stat)); + Jmsg(jcr, M_FATAL, 0, _("Could not add job queue: ERR=%s\n"), strerror(stat)); + goto bail_out; } Dmsg0(100, "Done run_job()\n"); @@ -313,73 +147,6 @@ bail_out: } -/* - * Cancel a job -- typically called by the UA (Console program), but may also - * be called by the job watchdog. - * - * Returns: 1 if cancel appears to be successful - * 0 on failure. Message sent to ua->jcr. - */ -int cancel_job(UAContext *ua, JCR *jcr) -{ - BSOCK *sd, *fd; - - switch (jcr->JobStatus) { - case JS_Created: - case JS_WaitJobRes: - case JS_WaitClientRes: - case JS_WaitStoreRes: - case JS_WaitPriority: - case JS_WaitMaxJobs: - case JS_WaitStartTime: - set_jcr_job_status(jcr, JS_Canceled); - bsendmsg(ua, _("JobId %d, Job %s marked to be canceled.\n"), - jcr->JobId, jcr->Job); - jobq_remove(&job_queue, jcr); /* attempt to remove it from queue */ - return 1; - - default: - set_jcr_job_status(jcr, JS_Canceled); - - /* Cancel File daemon */ - if (jcr->file_bsock) { - ua->jcr->client = jcr->client; - if (!connect_to_file_daemon(ua->jcr, 10, FDConnectTimeout, 1)) { - bsendmsg(ua, _("Failed to connect to File daemon.\n")); - return 0; - } - Dmsg0(200, "Connected to file daemon\n"); - fd = ua->jcr->file_bsock; - bnet_fsend(fd, "cancel Job=%s\n", jcr->Job); - while (bnet_recv(fd) >= 0) { - bsendmsg(ua, "%s", fd->msg); - } - bnet_sig(fd, BNET_TERMINATE); - bnet_close(fd); - ua->jcr->file_bsock = NULL; - } - - /* Cancel Storage daemon */ - if (jcr->store_bsock) { - ua->jcr->store = jcr->store; - if (!connect_to_storage_daemon(ua->jcr, 10, SDConnectTimeout, 1)) { - bsendmsg(ua, _("Failed to connect to Storage daemon.\n")); - return 0; - } - Dmsg0(200, "Connected to storage daemon\n"); - sd = ua->jcr->store_bsock; - bnet_fsend(sd, "cancel Job=%s\n", jcr->Job); - while (bnet_recv(sd) >= 0) { - bsendmsg(ua, "%s", sd->msg); - } - bnet_sig(sd, BNET_TERMINATE); - bnet_close(sd); - ua->jcr->store_bsock = NULL; - } - } - - return 1; -} /* * This is the engine called by jobq.c:jobq_add() when we were pulled @@ -391,8 +158,9 @@ static void *job_thread(void *arg) { JCR *jcr = (JCR *)arg; - pthread_detach(pthread_self()); - sm_check(__FILE__, __LINE__, True); + jcr->my_thread_id = pthread_self(); + pthread_detach(jcr->my_thread_id); + sm_check(__FILE__, __LINE__, true); for ( ;; ) { @@ -484,7 +252,7 @@ static void *job_thread(void *arg) */ if (status != 0) { if (jcr->JobStatus == JS_Terminated) { - Jmsg(jcr, M_ERROR, 0, _("RunAfterJob returned non-zero status=%d\n"), + Jmsg(jcr, M_WARNING, 0, _("RunAfterJob returned non-zero status=%d\n"), status); } else { Jmsg(jcr, M_FATAL, 0, _("RunAfterFailedJob returned non-zero status=%d\n"), @@ -492,17 +260,255 @@ static void *job_thread(void *arg) } } } + /* Send off any queued messages */ + if (jcr->msg_queue->size() > 0) { + dequeue_messages(jcr); + } } bail_out: break; } Dmsg0(50, "======== End Job ==========\n"); - sm_check(__FILE__, __LINE__, True); + sm_check(__FILE__, __LINE__, true); return NULL; } +/* + * Cancel a job -- typically called by the UA (Console program), but may also + * be called by the job watchdog. + * + * Returns: 1 if cancel appears to be successful + * 0 on failure. Message sent to ua->jcr. + */ +int cancel_job(UAContext *ua, JCR *jcr) +{ + BSOCK *sd, *fd; + + switch (jcr->JobStatus) { + case JS_Created: + case JS_WaitJobRes: + case JS_WaitClientRes: + case JS_WaitStoreRes: + case JS_WaitPriority: + case JS_WaitMaxJobs: + case JS_WaitStartTime: + set_jcr_job_status(jcr, JS_Canceled); + bsendmsg(ua, _("JobId %d, Job %s marked to be canceled.\n"), + jcr->JobId, jcr->Job); + jobq_remove(&job_queue, jcr); /* attempt to remove it from queue */ + return 1; + + default: + set_jcr_job_status(jcr, JS_Canceled); + + /* Cancel File daemon */ + if (jcr->file_bsock) { + ua->jcr->client = jcr->client; + if (!connect_to_file_daemon(ua->jcr, 10, FDConnectTimeout, 1)) { + bsendmsg(ua, _("Failed to connect to File daemon.\n")); + return 0; + } + Dmsg0(200, "Connected to file daemon\n"); + fd = ua->jcr->file_bsock; + bnet_fsend(fd, "cancel Job=%s\n", jcr->Job); + while (bnet_recv(fd) >= 0) { + bsendmsg(ua, "%s", fd->msg); + } + bnet_sig(fd, BNET_TERMINATE); + bnet_close(fd); + ua->jcr->file_bsock = NULL; + } + + /* Cancel Storage daemon */ + if (jcr->store_bsock) { + ua->jcr->store = jcr->store; + if (!connect_to_storage_daemon(ua->jcr, 10, SDConnectTimeout, 1)) { + bsendmsg(ua, _("Failed to connect to Storage daemon.\n")); + return 0; + } + Dmsg0(200, "Connected to storage daemon\n"); + sd = ua->jcr->store_bsock; + bnet_fsend(sd, "cancel Job=%s\n", jcr->Job); + while (bnet_recv(sd) >= 0) { + bsendmsg(ua, "%s", sd->msg); + } + bnet_sig(sd, BNET_TERMINATE); + bnet_close(sd); + ua->jcr->store_bsock = NULL; + } + } + + return 1; +} + + +static void job_monitor_destructor(watchdog_t *self) +{ + JCR *control_jcr = (JCR *) self->data; + + free_jcr(control_jcr); +} + +static void job_monitor_watchdog(watchdog_t *self) +{ + JCR *control_jcr, *jcr; + + control_jcr = (JCR *)self->data; + + Dmsg1(400, "job_monitor_watchdog %p called\n", self); + + lock_jcr_chain(); + + foreach_jcr(jcr) { + bool cancel; + + if (jcr->JobId == 0) { + Dmsg2(400, "Skipping JCR %p (%s) with JobId 0\n", + jcr, jcr->Job); + /* Keep reference counts correct */ + free_locked_jcr(jcr); + continue; + } + + /* check MaxWaitTime */ + cancel = job_check_maxwaittime(control_jcr, jcr); + + /* check MaxRunTime */ + cancel |= job_check_maxruntime(control_jcr, jcr); + + if (cancel) { + Dmsg3(200, "Cancelling JCR %p jobid %d (%s)\n", + jcr, jcr->JobId, jcr->Job); + + UAContext *ua = new_ua_context(jcr); + ua->jcr = control_jcr; + cancel_job(ua, jcr); + free_ua_context(ua); + + Dmsg1(200, "Have cancelled JCR %p\n", jcr); + } + + /* Keep reference counts correct */ + free_locked_jcr(jcr); + } + unlock_jcr_chain(); +} + +/* + * Check if the maxwaittime has expired and it is possible + * to cancel the job. + */ +static bool job_check_maxwaittime(JCR *control_jcr, JCR *jcr) +{ + bool cancel = false; + + if (jcr->job->MaxWaitTime == 0) { + return false; + } + if ((watchdog_time - jcr->start_time) < jcr->job->MaxWaitTime) { + Dmsg3(200, "Job %p (%s) with MaxWaitTime %d not expired\n", + jcr, jcr->Job, jcr->job->MaxWaitTime); + return false; + } + Dmsg3(200, "Job %d (%s): MaxWaitTime of %d seconds exceeded, " + "checking status\n", + jcr->JobId, jcr->Job, jcr->job->MaxWaitTime); + switch (jcr->JobStatus) { + case JS_Created: + case JS_Blocked: + case JS_WaitFD: + case JS_WaitSD: + case JS_WaitStoreRes: + case JS_WaitClientRes: + case JS_WaitJobRes: + case JS_WaitPriority: + case JS_WaitMaxJobs: + case JS_WaitStartTime: + cancel = true; + Dmsg0(200, "JCR blocked in #1\n"); + break; + case JS_Running: + Dmsg0(200, "JCR running, checking SD status\n"); + switch (jcr->SDJobStatus) { + case JS_WaitMount: + case JS_WaitMedia: + case JS_WaitFD: + cancel = true; + Dmsg0(200, "JCR blocked in #2\n"); + break; + default: + Dmsg0(200, "JCR not blocked in #2\n"); + break; + } + break; + case JS_Terminated: + case JS_ErrorTerminated: + case JS_Canceled: + case JS_FatalError: + Dmsg0(200, "JCR already dead in #3\n"); + break; + default: + Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"), + jcr->JobStatus); + } + Dmsg3(200, "MaxWaitTime result: %scancel JCR %p (%s)\n", + cancel ? "" : "do not ", jcr, jcr->job); + + return cancel; +} + +/* + * Check if maxruntime has expired and if the job can be + * canceled. + */ +static bool job_check_maxruntime(JCR *control_jcr, JCR *jcr) +{ + bool cancel = false; + + if (jcr->job->MaxRunTime == 0) { + return false; + } + if ((watchdog_time - jcr->start_time) < jcr->job->MaxRunTime) { + Dmsg3(200, "Job %p (%s) with MaxRunTime %d not expired\n", + jcr, jcr->Job, jcr->job->MaxRunTime); + return false; + } + + switch (jcr->JobStatus) { + case JS_Created: + case JS_Running: + case JS_Blocked: + case JS_WaitFD: + case JS_WaitSD: + case JS_WaitStoreRes: + case JS_WaitClientRes: + case JS_WaitJobRes: + case JS_WaitPriority: + case JS_WaitMaxJobs: + case JS_WaitStartTime: + case JS_Differences: + cancel = true; + break; + case JS_Terminated: + case JS_ErrorTerminated: + case JS_Canceled: + case JS_FatalError: + cancel = false; + break; + default: + Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"), + jcr->JobStatus); + } + + Dmsg3(200, "MaxRunTime result: %scancel JCR %p (%s)\n", + cancel ? "" : "do not ", jcr, jcr->job); + + return cancel; +} + + /* * Get or create a Client record for this Job */ @@ -686,6 +692,7 @@ void set_jcr_defaults(JCR *jcr, JOB *job) jcr->catalog = job->client->catalog; jcr->fileset = job->fileset; jcr->messages = job->messages; + jcr->spool_data = job->spool_data; if (jcr->RestoreBootstrap) { free(jcr->RestoreBootstrap); jcr->RestoreBootstrap = NULL;