From 996cf58ceaad06bf9fa889bc3c4397c118bb69fe Mon Sep 17 00:00:00 2001 From: Kern Sibbald Date: Sat, 21 Jun 2008 16:18:35 +0000 Subject: [PATCH] kes Apply duplicate job tests to restarted jobs. kes Copy more data when restarting a job so that run overrides are kept. This should fix bug #1094. git-svn-id: https://bacula.svn.sourceforge.net/svnroot/bacula/trunk@7207 91ce42f0-d328-0410-95d8-f526ca767f89 --- bacula/src/dird/job.c | 5 +- bacula/src/dird/jobq.c | 159 ++++++++++++++++++++++++----------------- bacula/technotes-2.5 | 4 ++ 3 files changed, 99 insertions(+), 69 deletions(-) diff --git a/bacula/src/dird/job.c b/bacula/src/dird/job.c index cf3c1da504..0c735578dd 100644 --- a/bacula/src/dird/job.c +++ b/bacula/src/dird/job.c @@ -645,6 +645,7 @@ bool allow_duplicate_job(JCR *jcr) } if (!job->AllowHigherDuplicates) { foreach_jcr(djcr) { + char ec1[50]; if (strcmp(job->name(), djcr->job->name()) == 0) { bool cancel_queued = false; if (job->DuplicateJobProximity > 0) { @@ -656,7 +657,8 @@ bool allow_duplicate_job(JCR *jcr) /* Cancel */ if (!(job->CancelQueuedDuplicates || job->CancelRunningDuplicates)) { /* Zap current job */ - Jmsg(jcr, M_FATAL, 0, _("Duplicate job not allowed.\n")); + Jmsg(jcr, M_FATAL, 0, _("Duplicate job not allowed. JobId=%s\n"), + edit_uint64(djcr->JobId, ec1)); return false; } /* If CancelQueuedDuplicates is set do so only if job is queued */ @@ -677,7 +679,6 @@ bool allow_duplicate_job(JCR *jcr) } if (cancel_queued || job->CancelRunningDuplicates) { UAContext *ua = new_ua_context(djcr); - char ec1[50]; Jmsg(jcr, M_INFO, 0, _("Cancelling duplicate JobId=%s.\n"), edit_uint64(djcr->JobId, ec1)); ua->jcr = djcr; diff --git a/bacula/src/dird/jobq.c b/bacula/src/dird/jobq.c index a46561aa62..af8605268d 100644 --- a/bacula/src/dird/jobq.c +++ b/bacula/src/dird/jobq.c @@ -56,6 +56,7 @@ extern "C" void *sched_wait(void *arg); static int start_server(jobq_t *jq); static bool acquire_resources(JCR *jcr); +static bool reschedule_job(JCR *jcr, jobq_t *jq, jobq_item_t *je); static void dec_read_store(JCR *jcr); static void dec_write_store(JCR *jcr); @@ -484,74 +485,10 @@ void *jobq_server(void *arg) jcr->acquired_resource_locks = false; } - /* - * Reschedule the job if necessary and requested - */ - if (jcr->job->RescheduleOnError && - jcr->JobStatus != JS_Terminated && - jcr->JobStatus != JS_Canceled && - jcr->JobType == JT_BACKUP && - (jcr->job->RescheduleTimes == 0 || - jcr->reschedule_count < jcr->job->RescheduleTimes)) { - char dt[50], dt2[50]; - - /* - * Reschedule this job by cleaning it up, but - * reuse the same JobId if possible. - */ - time_t now = time(NULL); - jcr->reschedule_count++; - jcr->sched_time = now + jcr->job->RescheduleInterval; - bstrftime(dt, sizeof(dt), now); - bstrftime(dt2, sizeof(dt2), jcr->sched_time); - Dmsg4(2300, "Rescheduled Job %s to re-run in %d seconds.(now=%u,then=%u)\n", jcr->Job, - (int)jcr->job->RescheduleInterval, now, jcr->sched_time); - Jmsg(jcr, M_INFO, 0, _("Rescheduled Job %s at %s to re-run in %d seconds (%s).\n"), - jcr->Job, dt, (int)jcr->job->RescheduleInterval, dt2); - dird_free_jcr_pointers(jcr); /* partial cleanup old stuff */ - jcr->JobStatus = -1; - set_jcr_job_status(jcr, JS_WaitStartTime); - jcr->SDJobStatus = 0; - if (jcr->JobBytes == 0) { - Dmsg2(2300, "Requeue job=%d use=%d\n", jcr->JobId, jcr->use_count()); - V(jq->mutex); - jobq_add(jq, jcr); /* queue the job to run again */ - P(jq->mutex); - free_jcr(jcr); /* release jcr */ - free(je); /* free the job entry */ - continue; /* look for another job to run */ - } - /* - * Something was actually backed up, so we cannot reuse - * the old JobId or there will be database record - * conflicts. We now create a new job, copying the - * appropriate fields. - */ - JCR *njcr = new_jcr(sizeof(JCR), dird_free_jcr); - set_jcr_defaults(njcr, jcr->job); - njcr->reschedule_count = jcr->reschedule_count; - njcr->sched_time = jcr->sched_time; - njcr->JobLevel = jcr->JobLevel; - njcr->JobStatus = -1; - set_jcr_job_status(njcr, jcr->JobStatus); - if (jcr->rstore) { - copy_rstorage(njcr, jcr->rstorage, _("previous Job")); - } else { - free_rstorage(njcr); - } - if (jcr->wstore) { - copy_wstorage(njcr, jcr->wstorage, _("previous Job")); - } else { - free_wstorage(njcr); - } - njcr->messages = jcr->messages; - Dmsg0(2300, "Call to run new job\n"); - V(jq->mutex); - run_job(njcr); /* This creates a "new" job */ - free_jcr(njcr); /* release "new" jcr */ - P(jq->mutex); - Dmsg0(2300, "Back from running new job.\n"); + if (reschedule_job(jcr, jq, je)) { + continue; /* go look for more work */ } + /* Clean up and release old jcr */ Dmsg2(2300, "====== Termination job=%d use_cnt=%d\n", jcr->JobId, jcr->use_count()); jcr->SDJobStatus = 0; @@ -664,6 +601,94 @@ void *jobq_server(void *arg) return NULL; } +/* + * Returns true if cleanup done and we should look for more work + */ +static bool reschedule_job(JCR *jcr, jobq_t *jq, jobq_item_t *je) +{ + /* + * Reschedule the job if necessary and requested + */ + if (jcr->job->RescheduleOnError && + jcr->JobStatus != JS_Terminated && + jcr->JobStatus != JS_Canceled && + jcr->JobType == JT_BACKUP && + (jcr->job->RescheduleTimes == 0 || + jcr->reschedule_count < jcr->job->RescheduleTimes)) { + char dt[50], dt2[50]; + + /* + * Reschedule this job by cleaning it up, but + * reuse the same JobId if possible. + */ + time_t now = time(NULL); + jcr->reschedule_count++; + jcr->sched_time = now + jcr->job->RescheduleInterval; + bstrftime(dt, sizeof(dt), now); + bstrftime(dt2, sizeof(dt2), jcr->sched_time); + Dmsg4(2300, "Rescheduled Job %s to re-run in %d seconds.(now=%u,then=%u)\n", jcr->Job, + (int)jcr->job->RescheduleInterval, now, jcr->sched_time); + Jmsg(jcr, M_INFO, 0, _("Rescheduled Job %s at %s to re-run in %d seconds (%s).\n"), + jcr->Job, dt, (int)jcr->job->RescheduleInterval, dt2); + dird_free_jcr_pointers(jcr); /* partial cleanup old stuff */ + jcr->JobStatus = -1; + set_jcr_job_status(jcr, JS_WaitStartTime); + jcr->SDJobStatus = 0; + if (!allow_duplicate_job(jcr)) { + return false; + } + if (jcr->JobBytes == 0) { + Dmsg2(2300, "Requeue job=%d use=%d\n", jcr->JobId, jcr->use_count()); + V(jq->mutex); + jobq_add(jq, jcr); /* queue the job to run again */ + P(jq->mutex); + free_jcr(jcr); /* release jcr */ + free(je); /* free the job entry */ + return true; /* we already cleaned up */ + } + /* + * Something was actually backed up, so we cannot reuse + * the old JobId or there will be database record + * conflicts. We now create a new job, copying the + * appropriate fields. + */ + JCR *njcr = new_jcr(sizeof(JCR), dird_free_jcr); + set_jcr_defaults(njcr, jcr->job); + njcr->reschedule_count = jcr->reschedule_count; + njcr->sched_time = jcr->sched_time; + njcr->JobLevel = jcr->JobLevel; + njcr->pool = jcr->pool; + njcr->run_pool_override = jcr->run_pool_override; + njcr->full_pool = jcr->full_pool; + njcr->run_full_pool_override = jcr->run_full_pool_override; + njcr->inc_pool = jcr->inc_pool; + njcr->run_inc_pool_override = jcr->run_inc_pool_override; + njcr->diff_pool = jcr->diff_pool; + njcr->JobStatus = -1; + set_jcr_job_status(njcr, jcr->JobStatus); + if (jcr->rstore) { + copy_rstorage(njcr, jcr->rstorage, _("previous Job")); + } else { + free_rstorage(njcr); + } + if (jcr->wstore) { + copy_wstorage(njcr, jcr->wstorage, _("previous Job")); + } else { + free_wstorage(njcr); + } + njcr->messages = jcr->messages; + njcr->spool_data = jcr->spool_data; + njcr->write_part_after_job = jcr->write_part_after_job; + Dmsg0(2300, "Call to run new job\n"); + V(jq->mutex); + run_job(njcr); /* This creates a "new" job */ + free_jcr(njcr); /* release "new" jcr */ + P(jq->mutex); + Dmsg0(2300, "Back from running new job.\n"); + } + return false; +} + /* * See if we can acquire all the necessary resources for the job (JCR) * diff --git a/bacula/technotes-2.5 b/bacula/technotes-2.5 index 85c7650445..8eadc84422 100644 --- a/bacula/technotes-2.5 +++ b/bacula/technotes-2.5 @@ -30,6 +30,10 @@ vtape driver General: +21Jun08 +kes Apply duplicate job tests to restarted jobs. +kes Copy more data when restarting a job so that run + overrides are kept. This should fix bug #1094. 20Jun08 kes More word alignment cleanup. kes Fix bug where SD did not ask operator if the device could not -- 2.39.5