From: Kern Sibbald
Date: Sun, 22 Jun 2008 13:25:55 +0000 (+0000)
Subject: kes Copy more data when restarting a job so that run
X-Git-Tag: Release-2.4.1~42
X-Git-Url: https://git.sur5r.net/?a=commitdiff_plain;h=4c7cb330f5ca1f21236f1aee6e0d46a4bc91fed8;p=bacula%2Fbacula

kes Copy more data when restarting a job so that run
    overrides are kept. This should fix bug #1094.
kes Backport updates to jobq.c where possible.

git-svn-id: https://bacula.svn.sourceforge.net/svnroot/bacula/branches/Branch-2.4@7216 91ce42f0-d328-0410-95d8-f526ca767f89
---

diff --git a/bacula/src/dird/jobq.c b/bacula/src/dird/jobq.c
index 0f3df014b1..eb43ffee99 100644
--- a/bacula/src/dird/jobq.c
+++ b/bacula/src/dird/jobq.c
@@ -1,7 +1,7 @@
 /*
    Bacula® - The Network Backup Solution
-   Copyright (C) 2003-2007 Free Software Foundation Europe e.V.
+   Copyright (C) 2003-2008 Free Software Foundation Europe e.V.
    The main author of Bacula is Kern Sibbald, with contributions from many others, a complete list can be found in the file AUTHORS.
@@ -56,8 +56,9 @@ extern "C" void *sched_wait(void *arg);
 static int start_server(jobq_t *jq);
 static bool acquire_resources(JCR *jcr);
-
-
+static bool reschedule_job(JCR *jcr, jobq_t *jq, jobq_item_t *je);
+static void dec_read_store(JCR *jcr);
+static void dec_write_store(JCR *jcr);
 /*
  * Initialize a job queue
@@ -357,8 +358,8 @@ static int start_server(jobq_t *jq)
    pthread_t id;
    /*
-    * if any threads are idle, wake one --
-    * actually we do a broadcast because on /lib/tls
+    * if any threads are idle, wake one.
+    * Actually we do a broadcast because on /lib/tls
     * these signals seem to get lost from time to time.
     */
    if (jq->idle_workers > 0) {
@@ -477,87 +478,17 @@ void *jobq_server(void *arg)
           * put into the ready queue.
           */
          if (jcr->acquired_resource_locks) {
-            if (jcr->rstore) {
-               jcr->rstore->NumConcurrentJobs--;
-               Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
-            }
-            if (jcr->wstore) {
-               jcr->wstore->NumConcurrentJobs--;
-               Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
-            }
+            dec_read_store(jcr);
+            dec_write_store(jcr);
             jcr->client->NumConcurrentJobs--;
             jcr->job->NumConcurrentJobs--;
             jcr->acquired_resource_locks = false;
          }
-         /*
-          * Reschedule the job if necessary and requested
-          */
-         if (jcr->job->RescheduleOnError &&
-             jcr->JobStatus != JS_Terminated &&
-             jcr->JobStatus != JS_Canceled &&
-             jcr->JobType == JT_BACKUP &&
-             (jcr->job->RescheduleTimes == 0 ||
-              jcr->reschedule_count < jcr->job->RescheduleTimes)) {
-            char dt[50], dt2[50];
-
-            /*
-             * Reschedule this job by cleaning it up, but
-             * reuse the same JobId if possible.
-             */
-            time_t now = time(NULL);
-            jcr->reschedule_count++;
-            jcr->sched_time = now + jcr->job->RescheduleInterval;
-            bstrftime(dt, sizeof(dt), now);
-            bstrftime(dt2, sizeof(dt2), jcr->sched_time);
-            Dmsg4(2300, "Rescheduled Job %s to re-run in %d seconds.(now=%u,then=%u)\n", jcr->Job,
-                  (int)jcr->job->RescheduleInterval, now, jcr->sched_time);
-            Jmsg(jcr, M_INFO, 0, _("Rescheduled Job %s at %s to re-run in %d seconds (%s).\n"),
-                 jcr->Job, dt, (int)jcr->job->RescheduleInterval, dt2);
-            dird_free_jcr_pointers(jcr); /* partial cleanup old stuff */
-            jcr->JobStatus = -1;
-            set_jcr_job_status(jcr, JS_WaitStartTime);
-            jcr->SDJobStatus = 0;
-            if (jcr->JobBytes == 0) {
-               Dmsg2(2300, "Requeue job=%d use=%d\n", jcr->JobId, jcr->use_count());
-               V(jq->mutex);
-               jobq_add(jq, jcr); /* queue the job to run again */
-               P(jq->mutex);
-               free_jcr(jcr); /* release jcr */
-               free(je); /* free the job entry */
-               continue; /* look for another job to run */
-            }
-            /*
-             * Something was actually backed up, so we cannot reuse
-             * the old JobId or there will be database record
-             * conflicts. We now create a new job, copying the
-             * appropriate fields.
-             */
-            JCR *njcr = new_jcr(sizeof(JCR), dird_free_jcr);
-            set_jcr_defaults(njcr, jcr->job);
-            njcr->reschedule_count = jcr->reschedule_count;
-            njcr->sched_time = jcr->sched_time;
-            njcr->JobLevel = jcr->JobLevel;
-            njcr->JobStatus = -1;
-            set_jcr_job_status(njcr, jcr->JobStatus);
-            if (jcr->rstore) {
-               copy_rstorage(njcr, jcr->rstorage, _("previous Job"));
-            } else {
-               free_rstorage(njcr);
-            }
-            if (jcr->wstore) {
-               copy_wstorage(njcr, jcr->wstorage, _("previous Job"));
-            } else {
-               free_wstorage(njcr);
-            }
-            njcr->messages = jcr->messages;
-            Dmsg0(2300, "Call to run new job\n");
-            V(jq->mutex);
-            run_job(njcr); /* This creates a "new" job */
-            free_jcr(njcr); /* release "new" jcr */
-            P(jq->mutex);
-            Dmsg0(2300, "Back from running new job.\n");
+         if (reschedule_job(jcr, jq, je)) {
+            continue; /* go look for more work */
          }
+
          /* Clean up and release old jcr */
          Dmsg2(2300, "====== Termination job=%d use_cnt=%d\n", jcr->JobId, jcr->use_count());
          jcr->SDJobStatus = 0;
@@ -670,6 +601,91 @@ void *jobq_server(void *arg)
    return NULL;
 }
+/*
+ * Returns true if cleanup done and we should look for more work
+ */
+static bool reschedule_job(JCR *jcr, jobq_t *jq, jobq_item_t *je)
+{
+   /*
+    * Reschedule the job if necessary and requested
+    */
+   if (jcr->job->RescheduleOnError &&
+       jcr->JobStatus != JS_Terminated &&
+       jcr->JobStatus != JS_Canceled &&
+       jcr->JobType == JT_BACKUP &&
+       (jcr->job->RescheduleTimes == 0 ||
+        jcr->reschedule_count < jcr->job->RescheduleTimes)) {
+      char dt[50], dt2[50];
+
+      /*
+       * Reschedule this job by cleaning it up, but
+       * reuse the same JobId if possible.
+       */
+      time_t now = time(NULL);
+      jcr->reschedule_count++;
+      jcr->sched_time = now + jcr->job->RescheduleInterval;
+      bstrftime(dt, sizeof(dt), now);
+      bstrftime(dt2, sizeof(dt2), jcr->sched_time);
+      Dmsg4(2300, "Rescheduled Job %s to re-run in %d seconds.(now=%u,then=%u)\n", jcr->Job,
+            (int)jcr->job->RescheduleInterval, now, jcr->sched_time);
+      Jmsg(jcr, M_INFO, 0, _("Rescheduled Job %s at %s to re-run in %d seconds (%s).\n"),
+           jcr->Job, dt, (int)jcr->job->RescheduleInterval, dt2);
+      dird_free_jcr_pointers(jcr); /* partial cleanup old stuff */
+      jcr->JobStatus = -1;
+      set_jcr_job_status(jcr, JS_WaitStartTime);
+      jcr->SDJobStatus = 0;
+      if (jcr->JobBytes == 0) {
+         Dmsg2(2300, "Requeue job=%d use=%d\n", jcr->JobId, jcr->use_count());
+         V(jq->mutex);
+         jobq_add(jq, jcr); /* queue the job to run again */
+         P(jq->mutex);
+         free_jcr(jcr); /* release jcr */
+         free(je); /* free the job entry */
+         return true; /* we already cleaned up */
+      }
+      /*
+       * Something was actually backed up, so we cannot reuse
+       * the old JobId or there will be database record
+       * conflicts. We now create a new job, copying the
+       * appropriate fields.
+       */
+      JCR *njcr = new_jcr(sizeof(JCR), dird_free_jcr);
+      set_jcr_defaults(njcr, jcr->job);
+      njcr->reschedule_count = jcr->reschedule_count;
+      njcr->sched_time = jcr->sched_time;
+      njcr->JobLevel = jcr->JobLevel;
+      njcr->pool = jcr->pool;
+      njcr->run_pool_override = jcr->run_pool_override;
+      njcr->full_pool = jcr->full_pool;
+      njcr->run_full_pool_override = jcr->run_full_pool_override;
+      njcr->inc_pool = jcr->inc_pool;
+      njcr->run_inc_pool_override = jcr->run_inc_pool_override;
+      njcr->diff_pool = jcr->diff_pool;
+      njcr->JobStatus = -1;
+      set_jcr_job_status(njcr, jcr->JobStatus);
+      if (jcr->rstore) {
+         copy_rstorage(njcr, jcr->rstorage, _("previous Job"));
+      } else {
+         free_rstorage(njcr);
+      }
+      if (jcr->wstore) {
+         copy_wstorage(njcr, jcr->wstorage, _("previous Job"));
+      } else {
+         free_wstorage(njcr);
+      }
+      njcr->messages = jcr->messages;
+      njcr->spool_data = jcr->spool_data;
+      njcr->write_part_after_job = jcr->write_part_after_job;
+      Dmsg0(2300, "Call to run new job\n");
+      V(jq->mutex);
+      run_job(njcr); /* This creates a "new" job */
+      free_jcr(njcr); /* release "new" jcr */
+      P(jq->mutex);
+      Dmsg0(2300, "Back from running new job.\n");
+   }
+   return false;
+}
+
 /*
  * See if we can acquire all the necessary resources for the job (JCR)
  *
@@ -681,11 +697,19 @@ static bool acquire_resources(JCR *jcr)
    bool skip_this_jcr = false;
    jcr->acquired_resource_locks = false;
+   if (jcr->rstore == jcr->wstore) { /* deadlock */
+      Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n"
+         " Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"),
+         jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source);
+      set_jcr_job_status(jcr, JS_Canceled);
+      return false;
+   }
    if (jcr->rstore) {
       Dmsg1(200, "Rstore=%s\n", jcr->rstore->name());
       if (jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) {
+//       jcr->rstore->NumConcurrentReadJobs++;
          jcr->rstore->NumConcurrentJobs++;
-         Dmsg0(200, "Set rncj=1\n");
+         Dmsg1(200, "Inc rncj=%d\n", jcr->rstore->NumConcurrentJobs);
       } else {
          Dmsg1(200, "Fail rncj=%d\n", jcr->rstore->NumConcurrentJobs);
          set_jcr_job_status(jcr, JS_WaitStoreRes);
@@ -695,25 +719,11 @@ static bool acquire_resources(JCR *jcr)
    if (jcr->wstore) {
       Dmsg1(200, "Wstore=%s\n", jcr->wstore->name());
-      if (jcr->rstore == jcr->wstore) { /* deadlock */
-         jcr->rstore->NumConcurrentJobs--; /* back out rstore */
-         Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n"
-            " Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"),
-            jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source);
-         set_jcr_job_status(jcr, JS_Canceled);
-         return false;
-      }
-      if (jcr->wstore->NumConcurrentJobs == 0 &&
-          jcr->wstore->NumConcurrentJobs < jcr->wstore->MaxConcurrentJobs) {
-         /* Simple case, first job */
-         jcr->wstore->NumConcurrentJobs = 1;
-         Dmsg0(200, "Set wncj=1\n");
-      } else if (jcr->wstore->NumConcurrentJobs < jcr->wstore->MaxConcurrentJobs) {
+      if (jcr->wstore->NumConcurrentJobs < jcr->wstore->MaxConcurrentJobs) {
          jcr->wstore->NumConcurrentJobs++;
          Dmsg1(200, "Inc wncj=%d\n", jcr->wstore->NumConcurrentJobs);
       } else if (jcr->rstore) {
-         jcr->rstore->NumConcurrentJobs--; /* back out rstore */
-         Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs);
+         dec_read_store(jcr);
          skip_this_jcr = true;
       } else {
          Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs);
@@ -729,14 +739,8 @@ static bool acquire_resources(JCR *jcr)
       jcr->client->NumConcurrentJobs++;
    } else {
       /* Back out previous locks */
-      if (jcr->wstore) {
-         jcr->wstore->NumConcurrentJobs--;
-         Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
-      }
-      if (jcr->rstore) {
-         jcr->rstore->NumConcurrentJobs--;
-         Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
-      }
+      dec_write_store(jcr);
+      dec_read_store(jcr);
       set_jcr_job_status(jcr, JS_WaitClientRes);
       return false;
    }
@@ -744,14 +748,8 @@ static bool acquire_resources(JCR *jcr)
       jcr->job->NumConcurrentJobs++;
    } else {
       /* Back out previous locks */
-      if (jcr->wstore) {
-         jcr->wstore->NumConcurrentJobs--;
-         Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
-      }
-      if (jcr->rstore) {
-         jcr->rstore->NumConcurrentJobs--;
-         Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
-      }
+      dec_write_store(jcr);
+      dec_read_store(jcr);
       jcr->client->NumConcurrentJobs--;
       set_jcr_job_status(jcr, JS_WaitJobRes);
       return false;
@@ -760,3 +758,23 @@ static bool acquire_resources(JCR *jcr)
    jcr->acquired_resource_locks = true;
    return true;
 }
+
+static void dec_read_store(JCR *jcr)
+{
+   if (jcr->rstore) {
+//    jcr->rstore->NumConcurrentReadJobs--; /* back out rstore */
+      jcr->rstore->NumConcurrentJobs--; /* back out rstore */
+      Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
+//    ASSERT(jcr->rstore->NumConcurrentReadJobs >= 0);
+      ASSERT(jcr->rstore->NumConcurrentJobs >= 0);
+   }
+}
+
+static void dec_write_store(JCR *jcr)
+{
+   if (jcr->wstore) {
+      jcr->wstore->NumConcurrentJobs--;
+      Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
+      ASSERT(jcr->wstore->NumConcurrentJobs >= 0);
+   }
+}
diff --git a/bacula/src/version.h b/bacula/src/version.h
index df9bb20460..646013a6c1 100644
--- a/bacula/src/version.h
+++ b/bacula/src/version.h
@@ -4,8 +4,8 @@
 #undef VERSION
 #define VERSION "2.4.1"
-#define BDATE "20 June 2008"
-#define LSMDATE "20Jun08"
+#define BDATE "22 June 2008"
+#define LSMDATE "22Jun08"
 #define PROG_COPYRIGHT "Copyright (C) %d-2008 Free Software Foundation Europe e.V.\n"
 #define BYEAR "2008" /* year for copyright messages in progs */
diff --git a/bacula/technotes-2.3 b/bacula/technotes-2.3
index 3d92b99976..b61c1a9c4b 100644
--- a/bacula/technotes-2.3
+++ b/bacula/technotes-2.3
@@ -1,6 +1,10 @@
 Technical notes on version 2.2
 General:
+22Jun08
+kes Copy more data when restarting a job so that run
+    overrides are kept. This should fix bug #1094.
+kes Backport updates to jobq.c where possible.
 20Jun08
 kes Fix bug where SD did not ask operator if the device could not be opened. Reported by Eric.