X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=bacula%2Fsrc%2Fdird%2Fjob.c;h=6c486ae54524c1f66508c7acbf7d16c695321d63;hb=b8b2ed2a6db4fb8436647d438185a364951375fc;hp=1d3fd12096251f74e4319a9930eca6ebddcf410b;hpb=05ff9f2cd22324a6c5e141809cf6b26b89edeeb0;p=bacula%2Fbacula diff --git a/bacula/src/dird/job.c b/bacula/src/dird/job.c index 1d3fd12096..6c486ae545 100644 --- a/bacula/src/dird/job.c +++ b/bacula/src/dird/job.c @@ -7,7 +7,7 @@ * Version $Id$ */ /* - Copyright (C) 2000-2003 Kern Sibbald and John Walker + Copyright (C) 2000-2004 Kern Sibbald and John Walker This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as @@ -59,172 +59,19 @@ void init_job_server(int max_workers) if ((stat = jobq_init(&job_queue, max_workers, job_thread)) != 0) { Emsg1(M_ABORT, 0, _("Could not init job queue: ERR=%s\n"), strerror(stat)); } - if ((wd = watchdog_new()) == NULL) { + if ((wd = new_watchdog()) == NULL) { Emsg0(M_ABORT, 0, _("Could not init job monitor watchdogs\n")); } wd->callback = job_monitor_watchdog; wd->destructor = job_monitor_destructor; wd->one_shot = false; wd->interval = 60; - wd->data = create_control_jcr("*JobMonitor*", JT_SYSTEM); + wd->data = new_control_jcr("*JobMonitor*", JT_SYSTEM); register_watchdog(wd); return; } -static void job_monitor_destructor(watchdog_t *self) -{ - JCR *control_jcr = (JCR *) self->data; - - free_jcr(control_jcr); -} - -static void job_monitor_watchdog(watchdog_t *self) -{ - JCR *control_jcr, *jcr; - - control_jcr = (JCR *) self->data; - - Dmsg1(200, "job_monitor_watchdog %p called\n", self); - - lock_jcr_chain(); - - for (jcr = NULL; (jcr = get_next_jcr(jcr)); /* nothing */) { - bool cancel; - - if (jcr->JobId == 0) { - Dmsg2(200, "Skipping JCR %p (%s) with JobId 0\n", - jcr, jcr->Job); - /* Keep reference counts correct */ - free_locked_jcr(jcr); - continue; - } - - /* check MaxWaitTime */ - cancel = job_check_maxwaittime(control_jcr, jcr); - - /* check MaxRunTime */ - cancel |= job_check_maxruntime(control_jcr, jcr); - - if (cancel) { - Dmsg3(200, "Cancelling JCR %p jobid %d (%s)\n", - jcr, jcr->JobId, jcr->Job); - - UAContext *ua = new_ua_context(jcr); - ua->jcr = control_jcr; - cancel_job(ua, jcr); - free_ua_context(ua); - - Dmsg1(200, "Have cancelled JCR %p\n", jcr); - } - - /* Keep reference counts correct */ - free_locked_jcr(jcr); - } - unlock_jcr_chain(); -} - -static bool job_check_maxwaittime(JCR *control_jcr, JCR *jcr) -{ - bool cancel = false; - - if (jcr->job->MaxWaitTime == 0) { - return false; - } - if ((watchdog_time - jcr->start_time) < jcr->job->MaxWaitTime) { - Dmsg3(200, "Job %p (%s) with MaxWaitTime %d not expired\n", - jcr, jcr->Job, jcr->job->MaxWaitTime); - return false; - } - Dmsg3(200, "Job %d (%s): MaxWaitTime of %d seconds exceeded, " - "checking status\n", - jcr->JobId, jcr->Job, jcr->job->MaxWaitTime); - switch (jcr->JobStatus) { - case JS_Created: - case JS_Blocked: - case JS_WaitFD: - case JS_WaitSD: - case JS_WaitStoreRes: - case JS_WaitClientRes: - case JS_WaitJobRes: - case JS_WaitPriority: - case JS_WaitMaxJobs: - case JS_WaitStartTime: - cancel = true; - Dmsg0(200, "JCR blocked in #1\n"); - break; - case JS_Running: - Dmsg0(200, "JCR running, checking SD status\n"); - switch (jcr->SDJobStatus) { - case JS_WaitMount: - case JS_WaitMedia: - case JS_WaitFD: - cancel = true; - Dmsg0(200, "JCR blocked in #2\n"); - break; - default: - Dmsg0(200, "JCR not blocked in #2\n"); - break; - } - break; - case JS_Terminated: - case JS_ErrorTerminated: - case JS_Canceled: - Dmsg0(200, "JCR already dead in #3\n"); - break; - default: - Emsg1(M_ABORT, 0, _("Unhandled job status code %d\n"), - jcr->JobStatus); - } - Dmsg3(200, "MaxWaitTime result: %scancel JCR %p (%s)\n", - cancel ? "" : "do not ", jcr, jcr->job); - - return cancel; -} - -static bool job_check_maxruntime(JCR *control_jcr, JCR *jcr) -{ - bool cancel = false; - - if (jcr->job->MaxRunTime == 0) { - return false; - } - if ((watchdog_time - jcr->start_time) < jcr->job->MaxRunTime) { - Dmsg3(200, "Job %p (%s) with MaxRunTime %d not expired\n", - jcr, jcr->Job, jcr->job->MaxRunTime); - return false; - } - - switch (jcr->JobStatus) { - case JS_Created: - case JS_Blocked: - case JS_WaitFD: - case JS_WaitSD: - case JS_WaitStoreRes: - case JS_WaitClientRes: - case JS_WaitJobRes: - case JS_WaitPriority: - case JS_WaitMaxJobs: - case JS_WaitStartTime: - case JS_Running: - cancel = true; - break; - case JS_Terminated: - case JS_ErrorTerminated: - case JS_Canceled: - cancel = false; - break; - default: - Emsg1(M_ABORT, 0, _("Unhandled job status code %d\n"), - jcr->JobStatus); - } - - Dmsg3(200, "MaxRunTime result: %scancel JCR %p (%s)\n", - cancel ? "" : "do not ", jcr, jcr->job); - - return cancel; -} - /* * Run a job -- typically called by the scheduler, but may also * be called by the UA (Console program). @@ -280,7 +127,6 @@ void run_job(JCR *jcr) goto bail_out; } jcr->JobId = jcr->jr.JobId; - ASSERT(jcr->jr.JobId > 0); Dmsg4(50, "Created job record JobId=%d Name=%s Type=%c Level=%c\n", jcr->JobId, jcr->Job, jcr->jr.Type, jcr->jr.Level); @@ -288,7 +134,8 @@ void run_job(JCR *jcr) /* Queue the job to be run */ if ((stat = jobq_add(&job_queue, jcr)) != 0) { - Emsg1(M_ABORT, 0, _("Could not add job queue: ERR=%s\n"), strerror(stat)); + Jmsg(jcr, M_FATAL, 0, _("Could not add job queue: ERR=%s\n"), strerror(stat)); + goto bail_out; } Dmsg0(100, "Done run_job()\n"); @@ -302,73 +149,6 @@ bail_out: } -/* - * Cancel a job -- typically called by the UA (Console program), but may also - * be called by the job watchdog. - * - * Returns: 1 if cancel appears to be successful - * 0 on failure. Message sent to ua->jcr. - */ -int cancel_job(UAContext *ua, JCR *jcr) -{ - BSOCK *sd, *fd; - - switch (jcr->JobStatus) { - case JS_Created: - case JS_WaitJobRes: - case JS_WaitClientRes: - case JS_WaitStoreRes: - case JS_WaitPriority: - case JS_WaitMaxJobs: - case JS_WaitStartTime: - set_jcr_job_status(jcr, JS_Canceled); - bsendmsg(ua, _("JobId %d, Job %s marked to be canceled.\n"), - jcr->JobId, jcr->Job); - jobq_remove(&job_queue, jcr); /* attempt to remove it from queue */ - return 1; - - default: - set_jcr_job_status(jcr, JS_Canceled); - - /* Cancel File daemon */ - if (jcr->file_bsock) { - ua->jcr->client = jcr->client; - if (!connect_to_file_daemon(ua->jcr, 10, FDConnectTimeout, 1)) { - bsendmsg(ua, _("Failed to connect to File daemon.\n")); - return 0; - } - Dmsg0(200, "Connected to file daemon\n"); - fd = ua->jcr->file_bsock; - bnet_fsend(fd, "cancel Job=%s\n", jcr->Job); - while (bnet_recv(fd) >= 0) { - bsendmsg(ua, "%s", fd->msg); - } - bnet_sig(fd, BNET_TERMINATE); - bnet_close(fd); - ua->jcr->file_bsock = NULL; - } - - /* Cancel Storage daemon */ - if (jcr->store_bsock) { - ua->jcr->store = jcr->store; - if (!connect_to_storage_daemon(ua->jcr, 10, SDConnectTimeout, 1)) { - bsendmsg(ua, _("Failed to connect to Storage daemon.\n")); - return 0; - } - Dmsg0(200, "Connected to storage daemon\n"); - sd = ua->jcr->store_bsock; - bnet_fsend(sd, "cancel Job=%s\n", jcr->Job); - while (bnet_recv(sd) >= 0) { - bsendmsg(ua, "%s", sd->msg); - } - bnet_sig(sd, BNET_TERMINATE); - bnet_close(sd); - ua->jcr->store_bsock = NULL; - } - } - - return 1; -} /* * This is the engine called by jobq.c:jobq_add() when we were pulled @@ -467,18 +247,24 @@ static void *job_thread(void *arg) Jmsg(jcr, M_INFO, 0, _("RunAfter: %s"), line); } status = close_bpipe(bpipe); + /* + * Note, if we get an error here, do not mark the + * job in error, simply report the error condition. + */ if (status != 0) { if (jcr->JobStatus == JS_Terminated) { - Jmsg(jcr, M_FATAL, 0, _("RunAfterJob returned non-zero status=%d\n"), + Jmsg(jcr, M_WARNING, 0, _("RunAfterJob returned non-zero status=%d\n"), status); } else { Jmsg(jcr, M_FATAL, 0, _("RunAfterFailedJob returned non-zero status=%d\n"), status); } - set_jcr_job_status(jcr, JS_FatalError); - update_job_end_record(jcr); } } + /* Send off any queued messages */ + if (jcr->msg_queue->size() > 0) { + dequeue_messages(jcr); + } } bail_out: break; @@ -490,6 +276,240 @@ bail_out: } +/* + * Cancel a job -- typically called by the UA (Console program), but may also + * be called by the job watchdog. + * + * Returns: 1 if cancel appears to be successful + * 0 on failure. Message sent to ua->jcr. + */ +int cancel_job(UAContext *ua, JCR *jcr) +{ + BSOCK *sd, *fd; + + switch (jcr->JobStatus) { + case JS_Created: + case JS_WaitJobRes: + case JS_WaitClientRes: + case JS_WaitStoreRes: + case JS_WaitPriority: + case JS_WaitMaxJobs: + case JS_WaitStartTime: + set_jcr_job_status(jcr, JS_Canceled); + bsendmsg(ua, _("JobId %d, Job %s marked to be canceled.\n"), + jcr->JobId, jcr->Job); + jobq_remove(&job_queue, jcr); /* attempt to remove it from queue */ + return 1; + + default: + set_jcr_job_status(jcr, JS_Canceled); + + /* Cancel File daemon */ + if (jcr->file_bsock) { + ua->jcr->client = jcr->client; + if (!connect_to_file_daemon(ua->jcr, 10, FDConnectTimeout, 1)) { + bsendmsg(ua, _("Failed to connect to File daemon.\n")); + return 0; + } + Dmsg0(200, "Connected to file daemon\n"); + fd = ua->jcr->file_bsock; + bnet_fsend(fd, "cancel Job=%s\n", jcr->Job); + while (bnet_recv(fd) >= 0) { + bsendmsg(ua, "%s", fd->msg); + } + bnet_sig(fd, BNET_TERMINATE); + bnet_close(fd); + ua->jcr->file_bsock = NULL; + } + + /* Cancel Storage daemon */ + if (jcr->store_bsock) { + ua->jcr->store = jcr->store; + if (!connect_to_storage_daemon(ua->jcr, 10, SDConnectTimeout, 1)) { + bsendmsg(ua, _("Failed to connect to Storage daemon.\n")); + return 0; + } + Dmsg0(200, "Connected to storage daemon\n"); + sd = ua->jcr->store_bsock; + bnet_fsend(sd, "cancel Job=%s\n", jcr->Job); + while (bnet_recv(sd) >= 0) { + bsendmsg(ua, "%s", sd->msg); + } + bnet_sig(sd, BNET_TERMINATE); + bnet_close(sd); + ua->jcr->store_bsock = NULL; + } + } + + return 1; +} + + +static void job_monitor_destructor(watchdog_t *self) +{ + JCR *control_jcr = (JCR *) self->data; + + free_jcr(control_jcr); +} + +static void job_monitor_watchdog(watchdog_t *self) +{ + JCR *control_jcr, *jcr; + + control_jcr = (JCR *)self->data; + + Dmsg1(400, "job_monitor_watchdog %p called\n", self); + + lock_jcr_chain(); + + foreach_jcr(jcr) { + bool cancel; + + if (jcr->JobId == 0) { + Dmsg2(400, "Skipping JCR %p (%s) with JobId 0\n", + jcr, jcr->Job); + /* Keep reference counts correct */ + free_locked_jcr(jcr); + continue; + } + + /* check MaxWaitTime */ + cancel = job_check_maxwaittime(control_jcr, jcr); + + /* check MaxRunTime */ + cancel |= job_check_maxruntime(control_jcr, jcr); + + if (cancel) { + Dmsg3(200, "Cancelling JCR %p jobid %d (%s)\n", + jcr, jcr->JobId, jcr->Job); + + UAContext *ua = new_ua_context(jcr); + ua->jcr = control_jcr; + cancel_job(ua, jcr); + free_ua_context(ua); + + Dmsg1(200, "Have cancelled JCR %p\n", jcr); + } + + /* Keep reference counts correct */ + free_locked_jcr(jcr); + } + unlock_jcr_chain(); +} + +/* + * Check if the maxwaittime has expired and it is possible + * to cancel the job. + */ +static bool job_check_maxwaittime(JCR *control_jcr, JCR *jcr) +{ + bool cancel = false; + + if (jcr->job->MaxWaitTime == 0) { + return false; + } + if ((watchdog_time - jcr->start_time) < jcr->job->MaxWaitTime) { + Dmsg3(200, "Job %p (%s) with MaxWaitTime %d not expired\n", + jcr, jcr->Job, jcr->job->MaxWaitTime); + return false; + } + Dmsg3(200, "Job %d (%s): MaxWaitTime of %d seconds exceeded, " + "checking status\n", + jcr->JobId, jcr->Job, jcr->job->MaxWaitTime); + switch (jcr->JobStatus) { + case JS_Created: + case JS_Blocked: + case JS_WaitFD: + case JS_WaitSD: + case JS_WaitStoreRes: + case JS_WaitClientRes: + case JS_WaitJobRes: + case JS_WaitPriority: + case JS_WaitMaxJobs: + case JS_WaitStartTime: + cancel = true; + Dmsg0(200, "JCR blocked in #1\n"); + break; + case JS_Running: + Dmsg0(200, "JCR running, checking SD status\n"); + switch (jcr->SDJobStatus) { + case JS_WaitMount: + case JS_WaitMedia: + case JS_WaitFD: + cancel = true; + Dmsg0(200, "JCR blocked in #2\n"); + break; + default: + Dmsg0(200, "JCR not blocked in #2\n"); + break; + } + break; + case JS_Terminated: + case JS_ErrorTerminated: + case JS_Canceled: + case JS_FatalError: + Dmsg0(200, "JCR already dead in #3\n"); + break; + default: + Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"), + jcr->JobStatus); + } + Dmsg3(200, "MaxWaitTime result: %scancel JCR %p (%s)\n", + cancel ? "" : "do not ", jcr, jcr->job); + + return cancel; +} + +/* + * Check if maxruntime has expired and if the job can be + * canceled. + */ +static bool job_check_maxruntime(JCR *control_jcr, JCR *jcr) +{ + bool cancel = false; + + if (jcr->job->MaxRunTime == 0) { + return false; + } + if ((watchdog_time - jcr->start_time) < jcr->job->MaxRunTime) { + Dmsg3(200, "Job %p (%s) with MaxRunTime %d not expired\n", + jcr, jcr->Job, jcr->job->MaxRunTime); + return false; + } + + switch (jcr->JobStatus) { + case JS_Created: + case JS_Running: + case JS_Blocked: + case JS_WaitFD: + case JS_WaitSD: + case JS_WaitStoreRes: + case JS_WaitClientRes: + case JS_WaitJobRes: + case JS_WaitPriority: + case JS_WaitMaxJobs: + case JS_WaitStartTime: + case JS_Differences: + cancel = true; + break; + case JS_Terminated: + case JS_ErrorTerminated: + case JS_Canceled: + case JS_FatalError: + cancel = false; + break; + default: + Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"), + jcr->JobStatus); + } + + Dmsg3(200, "MaxRunTime result: %scancel JCR %p (%s)\n", + cancel ? "" : "do not ", jcr, jcr->job); + + return cancel; +} + + /* * Get or create a Client record for this Job */ @@ -667,9 +687,13 @@ void set_jcr_defaults(JCR *jcr, JOB *job) } pm_strcpy(&jcr->client_name, jcr->client->hdr.name); jcr->pool = job->pool; + jcr->full_pool = job->full_pool; + jcr->inc_pool = job->inc_pool; + jcr->dif_pool = job->dif_pool; jcr->catalog = job->client->catalog; jcr->fileset = job->fileset; jcr->messages = job->messages; + jcr->spool_data = job->spool_data; if (jcr->RestoreBootstrap) { free(jcr->RestoreBootstrap); jcr->RestoreBootstrap = NULL;