X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=bacula%2Fsrc%2Fdird%2Fjobq.c;h=aec1bd5724da97fd2a4682f623acc7cecbaf9b80;hb=c5f4a68937116a768e6d85bc576a5fd41a56e93a;hp=7c0d86485f675d0364946e9960db863af5387156;hpb=043dfdbe5bb4cec577c9000959f051b83c270b08;p=bacula%2Fbacula diff --git a/bacula/src/dird/jobq.c b/bacula/src/dird/jobq.c old mode 100755 new mode 100644 index 7c0d86485f..aec1bd5724 --- a/bacula/src/dird/jobq.c +++ b/bacula/src/dird/jobq.c @@ -1,3 +1,30 @@ +/* + Bacula® - The Network Backup Solution + + Copyright (C) 2003-2010 Free Software Foundation Europe e.V. + + The main author of Bacula is Kern Sibbald, with contributions from + many others, a complete list can be found in the file AUTHORS. + This program is Free Software; you can redistribute it and/or + modify it under the terms of version three of the GNU Affero General Public + License as published by the Free Software Foundation and included + in the file LICENSE. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + + Bacula® is a registered trademark of Kern Sibbald. + The licensor of Bacula is the Free Software Foundation Europe + (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich, + Switzerland, email:ftf@fsfeurope.org. +*/ /* * Bacula job queue routines. * @@ -10,32 +37,12 @@ * * Kern Sibbald, July MMIII * - * Version $Id$ * * This code was adapted from the Bacula workq, which was * adapted from "Programming with POSIX Threads", by * David R. Butenhof * */ -/* - Copyright (C) 2003-2005 Kern Sibbald - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - MA 02111-1307, USA. - - */ #include "bacula.h" #include "dird.h" @@ -48,8 +55,8 @@ extern "C" void *sched_wait(void *arg); static int start_server(jobq_t *jq); static bool acquire_resources(JCR *jcr); - - +static bool reschedule_job(JCR *jcr, jobq_t *jq, jobq_item_t *je); +static void dec_write_store(JCR *jcr); /* * Initialize a job queue @@ -64,7 +71,7 @@ int jobq_init(jobq_t *jq, int threads, void *(*engine)(void *arg)) if ((stat = pthread_attr_init(&jq->attr)) != 0) { berrno be; - Jmsg1(NULL, M_ERROR, 0, _("pthread_attr_init: ERR=%s\n"), be.strerror(stat)); + Jmsg1(NULL, M_ERROR, 0, _("pthread_attr_init: ERR=%s\n"), be.bstrerror(stat)); return stat; } if ((stat = pthread_attr_setdetachstate(&jq->attr, PTHREAD_CREATE_DETACHED)) != 0) { @@ -73,13 +80,13 @@ int jobq_init(jobq_t *jq, int threads, void *(*engine)(void *arg)) } if ((stat = pthread_mutex_init(&jq->mutex, NULL)) != 0) { berrno be; - Jmsg1(NULL, M_ERROR, 0, _("pthread_mutex_init: ERR=%s\n"), be.strerror(stat)); + Jmsg1(NULL, M_ERROR, 0, _("pthread_mutex_init: ERR=%s\n"), be.bstrerror(stat)); pthread_attr_destroy(&jq->attr); return stat; } if ((stat = pthread_cond_init(&jq->work, NULL)) != 0) { berrno be; - Jmsg1(NULL, M_ERROR, 0, _("pthread_cond_init: ERR=%s\n"), be.strerror(stat)); + Jmsg1(NULL, M_ERROR, 0, _("pthread_cond_init: ERR=%s\n"), be.bstrerror(stat)); pthread_mutex_destroy(&jq->mutex); pthread_attr_destroy(&jq->attr); return stat; @@ -110,11 +117,7 @@ int jobq_destroy(jobq_t *jq) if (jq->valid != JOBQ_VALID) { return EINVAL; } - if ((stat = pthread_mutex_lock(&jq->mutex)) != 0) { - berrno be; - Jmsg1(NULL, M_ERROR, 0, _("pthread_mutex_lock: ERR=%s\n"), be.strerror(stat)); - return stat; - } + P(jq->mutex); jq->valid = 0; /* prevent any more operations */ /* @@ -125,25 +128,21 @@ int jobq_destroy(jobq_t *jq) if (jq->idle_workers) { if ((stat = pthread_cond_broadcast(&jq->work)) != 0) { berrno be; - Jmsg1(NULL, M_ERROR, 0, _("pthread_cond_broadcast: ERR=%s\n"), be.strerror(stat)); - pthread_mutex_unlock(&jq->mutex); + Jmsg1(NULL, M_ERROR, 0, _("pthread_cond_broadcast: ERR=%s\n"), be.bstrerror(stat)); + V(jq->mutex); return stat; } } while (jq->num_workers > 0) { if ((stat = pthread_cond_wait(&jq->work, &jq->mutex)) != 0) { berrno be; - Jmsg1(NULL, M_ERROR, 0, _("pthread_cond_wait: ERR=%s\n"), be.strerror(stat)); - pthread_mutex_unlock(&jq->mutex); + Jmsg1(NULL, M_ERROR, 0, _("pthread_cond_wait: ERR=%s\n"), be.bstrerror(stat)); + V(jq->mutex); return stat; } } } - if ((stat = pthread_mutex_unlock(&jq->mutex)) != 0) { - berrno be; - Jmsg1(NULL, M_ERROR, 0, _("pthread_mutex_unlock: ERR=%s\n"), be.strerror(stat)); - return stat; - } + V(jq->mutex); stat = pthread_mutex_destroy(&jq->mutex); stat1 = pthread_cond_destroy(&jq->work); stat2 = pthread_attr_destroy(&jq->attr); @@ -171,6 +170,7 @@ void *sched_wait(void *arg) JCR *jcr = ((wait_pkt *)arg)->jcr; jobq_t *jq = ((wait_pkt *)arg)->jq; + set_jcr_in_tsd(INVALID_JCR); Dmsg0(2300, "Enter sched_wait.\n"); free(arg); time_t wtime = jcr->sched_time - time(NULL); @@ -182,7 +182,8 @@ void *sched_wait(void *arg) } /* Check every 30 seconds if canceled */ while (wtime > 0) { - Dmsg2(2300, "Waiting on sched time, jobid=%d secs=%d\n", jcr->JobId, wtime); + Dmsg3(2300, "Waiting on sched time, jobid=%d secs=%d use=%d\n", + jcr->JobId, wtime, jcr->use_count()); if (wtime > 30) { wtime = 30; } @@ -192,9 +193,8 @@ void *sched_wait(void *arg) } wtime = jcr->sched_time - time(NULL); } - P(jcr->mutex); /* lock jcr */ + Dmsg1(200, "resched use=%d\n", jcr->use_count()); jobq_add(jq, jcr); - V(jcr->mutex); free_jcr(jcr); /* we are done with jcr */ Dmsg0(2300, "Exit sched_wait\n"); return NULL; @@ -203,9 +203,6 @@ void *sched_wait(void *arg) /* * Add a job to the queue * jq is a queue that was created with jobq_init - * - * On entry jcr->mutex must be locked. - * */ int jobq_add(jobq_t *jq, JCR *jcr) { @@ -216,14 +213,24 @@ int jobq_add(jobq_t *jq, JCR *jcr) pthread_t id; wait_pkt *sched_pkt; - Dmsg3(2300, "jobq_add jobid=%d jcr=0x%x use_count=%d\n", jcr->JobId, jcr, jcr->use_count); + if (!jcr->term_wait_inited) { + /* Initialize termination condition variable */ + if ((stat = pthread_cond_init(&jcr->term_wait, NULL)) != 0) { + berrno be; + Jmsg1(jcr, M_FATAL, 0, _("Unable to init job cond variable: ERR=%s\n"), be.bstrerror(stat)); + return stat; + } + jcr->term_wait_inited = true; + } + + Dmsg3(2300, "jobq_add jobid=%d jcr=0x%x use_count=%d\n", jcr->JobId, jcr, jcr->use_count()); if (jq->valid != JOBQ_VALID) { Jmsg0(jcr, M_ERROR, 0, "Jobq_add queue not initialized.\n"); return EINVAL; } - jcr->use_count++; /* mark jcr in use by us */ - Dmsg3(2300, "jobq_add jobid=%d jcr=0x%x use_count=%d\n", jcr->JobId, jcr, jcr->use_count); + jcr->inc_use_count(); /* mark jcr in use by us */ + Dmsg3(2300, "jobq_add jobid=%d jcr=0x%x use_count=%d\n", jcr->JobId, jcr, jcr->use_count()); if (!job_canceled(jcr) && wtime > 0) { set_thread_concurrency(jq->max_workers + 2); sched_pkt = (wait_pkt *)malloc(sizeof(wait_pkt)); @@ -232,24 +239,21 @@ int jobq_add(jobq_t *jq, JCR *jcr) stat = pthread_create(&id, &jq->attr, sched_wait, (void *)sched_pkt); if (stat != 0) { /* thread not created */ berrno be; - Jmsg1(jcr, M_ERROR, 0, _("pthread_thread_create: ERR=%s\n"), be.strerror(stat)); + Jmsg1(jcr, M_ERROR, 0, _("pthread_thread_create: ERR=%s\n"), be.bstrerror(stat)); } return stat; } - if ((stat = pthread_mutex_lock(&jq->mutex)) != 0) { - berrno be; - Jmsg1(jcr, M_ERROR, 0, _("pthread_mutex_lock: ERR=%s\n"), be.strerror(stat)); - jcr->use_count--; /* release jcr */ - return stat; - } + P(jq->mutex); if ((item = (jobq_item_t *)malloc(sizeof(jobq_item_t))) == NULL) { - jcr->use_count--; /* release jcr */ + free_jcr(jcr); /* release jcr */ return ENOMEM; } item->jcr = jcr; + /* While waiting in a queue this job is not attached to a thread */ + set_jcr_in_tsd(INVALID_JCR); if (job_canceled(jcr)) { /* Add job to ready queue so that it is canceled quickly */ jq->ready_jobs->prepend(item); @@ -277,7 +281,7 @@ int jobq_add(jobq_t *jq, JCR *jcr) /* Ensure that at least one server looks at the queue. */ stat = start_server(jq); - pthread_mutex_unlock(&jq->mutex); + V(jq->mutex); Dmsg0(2300, "Return jobq_add\n"); return stat; } @@ -302,12 +306,7 @@ int jobq_remove(jobq_t *jq, JCR *jcr) return EINVAL; } - if ((stat = pthread_mutex_lock(&jq->mutex)) != 0) { - berrno be; - Jmsg1(NULL, M_ERROR, 0, _("pthread_mutex_lock: ERR=%s\n"), be.strerror(stat)); - return stat; - } - + P(jq->mutex); foreach_dlist(item, jq->waiting_jobs) { if (jcr == item->jcr) { found = true; @@ -315,7 +314,7 @@ int jobq_remove(jobq_t *jq, JCR *jcr) } } if (!found) { - pthread_mutex_unlock(&jq->mutex); + V(jq->mutex); Dmsg2(2300, "jobq_remove jobid=%d jcr=0x%x not in wait queue\n", jcr->JobId, jcr); return EINVAL; } @@ -327,7 +326,7 @@ int jobq_remove(jobq_t *jq, JCR *jcr) stat = start_server(jq); - pthread_mutex_unlock(&jq->mutex); + V(jq->mutex); Dmsg0(2300, "Return jobq_remove\n"); return stat; } @@ -342,24 +341,26 @@ static int start_server(jobq_t *jq) pthread_t id; /* - * if any threads are idle, wake one -- - * actually we do a broadcast because on /lib/tls + * if any threads are idle, wake one. + * Actually we do a broadcast because on /lib/tls * these signals seem to get lost from time to time. */ if (jq->idle_workers > 0) { Dmsg0(2300, "Signal worker to wake up\n"); if ((stat = pthread_cond_broadcast(&jq->work)) != 0) { berrno be; - Jmsg1(NULL, M_ERROR, 0, _("pthread_cond_signal: ERR=%s\n"), be.strerror(stat)); + Jmsg1(NULL, M_ERROR, 0, _("pthread_cond_signal: ERR=%s\n"), be.bstrerror(stat)); return stat; } } else if (jq->num_workers < jq->max_workers) { Dmsg0(2300, "Create worker thread\n"); /* No idle threads so create a new one */ set_thread_concurrency(jq->max_workers + 1); + jq->num_workers++; if ((stat = pthread_create(&id, &jq->attr, jobq_server, (void *)jq)) != 0) { berrno be; - Jmsg1(NULL, M_ERROR, 0, _("pthread_create: ERR=%s\n"), be.strerror(stat)); + jq->num_workers--; + Jmsg1(NULL, M_ERROR, 0, _("pthread_create: ERR=%s\n"), be.bstrerror(stat)); return stat; } } @@ -382,13 +383,9 @@ void *jobq_server(void *arg) bool timedout = false; bool work = true; + set_jcr_in_tsd(INVALID_JCR); Dmsg0(2300, "Start jobq_server\n"); - if ((stat = pthread_mutex_lock(&jq->mutex)) != 0) { - berrno be; - Jmsg1(NULL, M_ERROR, 0, _("pthread_mutex_lock: ERR=%s\n"), be.strerror(stat)); - return NULL; - } - jq->num_workers++; + P(jq->mutex); for (;;) { struct timeval tv; @@ -414,7 +411,7 @@ void *jobq_server(void *arg) /* This shouldn't happen */ Dmsg0(2300, "This shouldn't happen\n"); jq->num_workers--; - pthread_mutex_unlock(&jq->mutex); + V(jq->mutex); return NULL; } break; @@ -433,21 +430,31 @@ void *jobq_server(void *arg) Dmsg0(2300, "ready queue not empty start server\n"); if (start_server(jq) != 0) { jq->num_workers--; - pthread_mutex_unlock(&jq->mutex); + V(jq->mutex); return NULL; } } jq->running_jobs->append(je); + + /* Attach jcr to this thread while we run the job */ + jcr->set_killable(true); + set_jcr_in_tsd(jcr); Dmsg1(2300, "Took jobid=%d from ready and appended to run\n", jcr->JobId); /* Release job queue lock */ V(jq->mutex); /* Call user's routine here */ - Dmsg1(2300, "Calling user engine for jobid=%d\n", jcr->JobId); + Dmsg3(2300, "Calling user engine for jobid=%d use=%d stat=%c\n", jcr->JobId, + jcr->use_count(), jcr->JobStatus); jq->engine(je->jcr); - Dmsg1(2300, "Back from user engine jobid=%d.\n", jcr->JobId); + /* Job finished detach from thread */ + remove_jcr_from_tsd(je->jcr); + je->jcr->set_killable(false); + + Dmsg2(2300, "Back from user engine jobid=%d use=%d.\n", jcr->JobId, + jcr->use_count()); /* Reacquire job queue lock */ P(jq->mutex); @@ -459,71 +466,19 @@ void *jobq_server(void *arg) * put into the ready queue. */ if (jcr->acquired_resource_locks) { - jcr->store->NumConcurrentJobs--; + dec_read_store(jcr); + dec_write_store(jcr); jcr->client->NumConcurrentJobs--; jcr->job->NumConcurrentJobs--; + jcr->acquired_resource_locks = false; } - /* - * Reschedule the job if necessary and requested - */ - if (jcr->job->RescheduleOnError && - jcr->JobStatus != JS_Terminated && - jcr->JobStatus != JS_Canceled && - jcr->job->RescheduleTimes > 0 && - jcr->JobType == JT_BACKUP && - jcr->reschedule_count < jcr->job->RescheduleTimes) { - char dt[50]; - - /* - * Reschedule this job by cleaning it up, but - * reuse the same JobId if possible. - */ - jcr->reschedule_count++; - jcr->sched_time = time(NULL) + jcr->job->RescheduleInterval; - Dmsg2(2300, "Rescheduled Job %s to re-run in %d seconds.\n", jcr->Job, - (int)jcr->job->RescheduleInterval); - bstrftime(dt, sizeof(dt), time(NULL)); - Jmsg(jcr, M_INFO, 0, _("Rescheduled Job %s at %s to re-run in %d seconds.\n"), - jcr->Job, dt, (int)jcr->job->RescheduleInterval); - dird_free_jcr_pointers(jcr); /* partial cleanup old stuff */ - jcr->JobStatus = JS_WaitStartTime; - jcr->SDJobStatus = 0; - if (jcr->JobBytes == 0) { - Dmsg1(2300, "Requeue job=%d\n", jcr->JobId); - jcr->JobStatus = JS_WaitStartTime; - V(jq->mutex); - jobq_add(jq, jcr); /* queue the job to run again */ - P(jq->mutex); - free(je); /* free the job entry */ - continue; /* look for another job to run */ - } - /* - * Something was actually backed up, so we cannot reuse - * the old JobId or there will be database record - * conflicts. We now create a new job, copying the - * appropriate fields. - */ - JCR *njcr = new_jcr(sizeof(JCR), dird_free_jcr); - set_jcr_defaults(njcr, jcr->job); - njcr->reschedule_count = jcr->reschedule_count; - njcr->JobLevel = jcr->JobLevel; - njcr->JobStatus = jcr->JobStatus; - copy_storage(njcr, jcr); - njcr->messages = jcr->messages; - Dmsg0(2300, "Call to run new job\n"); - V(jq->mutex); - run_job(njcr); /* This creates a "new" job */ - free_jcr(njcr); /* release "new" jcr */ - P(jq->mutex); - Dmsg0(2300, "Back from running new job.\n"); + if (reschedule_job(jcr, jq, je)) { + continue; /* go look for more work */ } + /* Clean up and release old jcr */ - if (jcr->db) { - db_close_database(jcr, jcr->db); - jcr->db = NULL; - } - Dmsg2(2300, "====== Termination job=%d use_cnt=%d\n", jcr->JobId, jcr->use_count); + Dmsg2(2300, "====== Termination job=%d use_cnt=%d\n", jcr->JobId, jcr->use_count()); jcr->SDJobStatus = 0; V(jq->mutex); /* release internal lock */ free_jcr(jcr); @@ -537,11 +492,26 @@ void *jobq_server(void *arg) Dmsg0(2300, "Done check ready, now check wait queue.\n"); if (!jq->waiting_jobs->empty() && !jq->quit) { int Priority; + bool running_allow_mix = false; je = (jobq_item_t *)jq->waiting_jobs->first(); jobq_item_t *re = (jobq_item_t *)jq->running_jobs->first(); if (re) { Priority = re->jcr->JobPriority; - Dmsg2(2300, "JobId %d is running. Look for pri=%d\n", re->jcr->JobId, Priority); + Dmsg2(2300, "JobId %d is running. Look for pri=%d\n", + re->jcr->JobId, Priority); + running_allow_mix = true; + for ( ; re; ) { + Dmsg2(2300, "JobId %d is also running with %s\n", + re->jcr->JobId, + re->jcr->job->allow_mixed_priority ? "mix" : "no mix"); + if (!re->jcr->job->allow_mixed_priority) { + running_allow_mix = false; + break; + } + re = (jobq_item_t *)jq->running_jobs->next(re); + } + Dmsg1(2300, "The running job(s) %s mixing priorities.\n", + running_allow_mix ? "allow" : "don't allow"); } else { Priority = je->jcr->JobPriority; Dmsg1(2300, "No job running. Look for Job pri=%d\n", Priority); @@ -555,24 +525,32 @@ void *jobq_server(void *arg) JCR *jcr = je->jcr; jobq_item_t *jn = (jobq_item_t *)jq->waiting_jobs->next(je); - Dmsg3(2300, "Examining Job=%d JobPri=%d want Pri=%d\n", - jcr->JobId, jcr->JobPriority, Priority); + Dmsg4(2300, "Examining Job=%d JobPri=%d want Pri=%d (%s)\n", + jcr->JobId, jcr->JobPriority, Priority, + jcr->job->allow_mixed_priority ? "mix" : "no mix"); /* Take only jobs of correct Priority */ - if (jcr->JobPriority != Priority) { + if (!(jcr->JobPriority == Priority + || (jcr->JobPriority < Priority && + jcr->job->allow_mixed_priority && running_allow_mix))) { set_jcr_job_status(jcr, JS_WaitPriority); break; } if (!acquire_resources(jcr)) { - je = jn; /* point to next waiting job */ - continue; + /* If resource conflict, job is canceled */ + if (!job_canceled(jcr)) { + je = jn; /* point to next waiting job */ + continue; + } } - /* Got all locks, now remove it from wait queue and append it - * to the ready queue + /* + * Got all locks, now remove it from wait queue and append it + * to the ready queue. Note, we may also get here if the + * job was canceled. Once it is "run", it will quickly + * terminate. */ - jcr->acquired_resource_locks = true; jq->waiting_jobs->remove(je); jq->ready_jobs->append(je); Dmsg1(2300, "moved JobId=%d from wait to ready queue\n", je->jcr->JobId); @@ -629,6 +607,94 @@ void *jobq_server(void *arg) return NULL; } +/* + * Returns true if cleanup done and we should look for more work + */ +static bool reschedule_job(JCR *jcr, jobq_t *jq, jobq_item_t *je) +{ + /* + * Reschedule the job if necessary and requested + */ + if (jcr->job->RescheduleOnError && + jcr->JobStatus != JS_Terminated && + jcr->JobStatus != JS_Canceled && + jcr->getJobType() == JT_BACKUP && + (jcr->job->RescheduleTimes == 0 || + jcr->reschedule_count < jcr->job->RescheduleTimes)) { + char dt[50], dt2[50]; + + /* + * Reschedule this job by cleaning it up, but + * reuse the same JobId if possible. + */ + time_t now = time(NULL); + jcr->reschedule_count++; + jcr->sched_time = now + jcr->job->RescheduleInterval; + bstrftime(dt, sizeof(dt), now); + bstrftime(dt2, sizeof(dt2), jcr->sched_time); + Dmsg4(2300, "Rescheduled Job %s to re-run in %d seconds.(now=%u,then=%u)\n", jcr->Job, + (int)jcr->job->RescheduleInterval, now, jcr->sched_time); + Jmsg(jcr, M_INFO, 0, _("Rescheduled Job %s at %s to re-run in %d seconds (%s).\n"), + jcr->Job, dt, (int)jcr->job->RescheduleInterval, dt2); + dird_free_jcr_pointers(jcr); /* partial cleanup old stuff */ + jcr->JobStatus = -1; + set_jcr_job_status(jcr, JS_WaitStartTime); + jcr->SDJobStatus = 0; + if (!allow_duplicate_job(jcr)) { + return false; + } + if (jcr->JobBytes == 0) { + Dmsg2(2300, "Requeue job=%d use=%d\n", jcr->JobId, jcr->use_count()); + V(jq->mutex); + jobq_add(jq, jcr); /* queue the job to run again */ + P(jq->mutex); + free_jcr(jcr); /* release jcr */ + free(je); /* free the job entry */ + return true; /* we already cleaned up */ + } + /* + * Something was actually backed up, so we cannot reuse + * the old JobId or there will be database record + * conflicts. We now create a new job, copying the + * appropriate fields. + */ + JCR *njcr = new_jcr(sizeof(JCR), dird_free_jcr); + set_jcr_defaults(njcr, jcr->job); + njcr->reschedule_count = jcr->reschedule_count; + njcr->sched_time = jcr->sched_time; + njcr->set_JobLevel(jcr->getJobLevel()); + njcr->pool = jcr->pool; + njcr->run_pool_override = jcr->run_pool_override; + njcr->full_pool = jcr->full_pool; + njcr->run_full_pool_override = jcr->run_full_pool_override; + njcr->inc_pool = jcr->inc_pool; + njcr->run_inc_pool_override = jcr->run_inc_pool_override; + njcr->diff_pool = jcr->diff_pool; + njcr->JobStatus = -1; + set_jcr_job_status(njcr, jcr->JobStatus); + if (jcr->rstore) { + copy_rstorage(njcr, jcr->rstorage, _("previous Job")); + } else { + free_rstorage(njcr); + } + if (jcr->wstore) { + copy_wstorage(njcr, jcr->wstorage, _("previous Job")); + } else { + free_wstorage(njcr); + } + njcr->messages = jcr->messages; + njcr->spool_data = jcr->spool_data; + njcr->write_part_after_job = jcr->write_part_after_job; + Dmsg0(2300, "Call to run new job\n"); + V(jq->mutex); + run_job(njcr); /* This creates a "new" job */ + free_jcr(njcr); /* release "new" jcr */ + P(jq->mutex); + Dmsg0(2300, "Back from running new job.\n"); + } + return false; +} + /* * See if we can acquire all the necessary resources for the job (JCR) * @@ -639,26 +705,44 @@ static bool acquire_resources(JCR *jcr) { bool skip_this_jcr = false; - if (jcr->JobType == JT_RESTORE || jcr->JobType == JT_VERIFY) { - /* - * Let only one Restore/verify job run at a time regardless - * of MaxConcurrentJobs. - */ - if (jcr->store->NumConcurrentJobs == 0) { - jcr->store->NumConcurrentJobs = 1; - } else { + jcr->acquired_resource_locks = false; +/* + * Turning this code off is likely to cause some deadlocks, + * but we do not really have enough information here to + * know if this is really a deadlock (it may be a dual drive + * autochanger), and in principle, the SD reservation system + * should detect these deadlocks, so push the work off on it. + */ +#ifdef xxx + if (jcr->rstore && jcr->rstore == jcr->wstore) { /* possible deadlock */ + Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n" + " Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"), + jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source); + set_jcr_job_status(jcr, JS_Canceled); + return false; + } +#endif + if (jcr->rstore) { + Dmsg1(200, "Rstore=%s\n", jcr->rstore->name()); + if (!inc_read_store(jcr)) { + Dmsg1(200, "Fail rncj=%d\n", jcr->rstore->NumConcurrentJobs); set_jcr_job_status(jcr, JS_WaitStoreRes); return false; } - /* We are not doing a Restore or Verify */ - } else if (jcr->store->NumConcurrentJobs == 0 && - jcr->store->NumConcurrentJobs < jcr->store->MaxConcurrentJobs) { - /* Simple case, first job */ - jcr->store->NumConcurrentJobs = 1; - } else if (jcr->store->NumConcurrentJobs < jcr->store->MaxConcurrentJobs) { - jcr->store->NumConcurrentJobs++; - } else { - skip_this_jcr = true; + } + + if (jcr->wstore) { + Dmsg1(200, "Wstore=%s\n", jcr->wstore->name()); + if (jcr->wstore->NumConcurrentJobs < jcr->wstore->MaxConcurrentJobs) { + jcr->wstore->NumConcurrentJobs++; + Dmsg1(200, "Inc wncj=%d\n", jcr->wstore->NumConcurrentJobs); + } else if (jcr->rstore) { + dec_read_store(jcr); + skip_this_jcr = true; + } else { + Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs); + skip_this_jcr = true; + } } if (skip_this_jcr) { set_jcr_job_status(jcr, JS_WaitStoreRes); @@ -669,7 +753,8 @@ static bool acquire_resources(JCR *jcr) jcr->client->NumConcurrentJobs++; } else { /* Back out previous locks */ - jcr->store->NumConcurrentJobs--; + dec_write_store(jcr); + dec_read_store(jcr); set_jcr_job_status(jcr, JS_WaitClientRes); return false; } @@ -677,12 +762,55 @@ static bool acquire_resources(JCR *jcr) jcr->job->NumConcurrentJobs++; } else { /* Back out previous locks */ - jcr->store->NumConcurrentJobs--; + dec_write_store(jcr); + dec_read_store(jcr); jcr->client->NumConcurrentJobs--; set_jcr_job_status(jcr, JS_WaitJobRes); return false; } - /* Check actual device availability */ - /* ***FIXME****/ + + jcr->acquired_resource_locks = true; return true; } + +static pthread_mutex_t rstore_mutex = PTHREAD_MUTEX_INITIALIZER; + +/* + * Note: inc_read_store() and dec_read_store() are + * called from select_rstore() in src/dird/restore.c + */ +bool inc_read_store(JCR *jcr) +{ + P(rstore_mutex); + if (jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) { + jcr->rstore->NumConcurrentReadJobs++; + jcr->rstore->NumConcurrentJobs++; + Dmsg1(200, "Inc rncj=%d\n", jcr->rstore->NumConcurrentJobs); + V(rstore_mutex); + return true; + } + V(rstore_mutex); + return false; +} + +void dec_read_store(JCR *jcr) +{ + if (jcr->rstore) { + P(rstore_mutex); + jcr->rstore->NumConcurrentReadJobs--; /* back out rstore */ + jcr->rstore->NumConcurrentJobs--; /* back out rstore */ + Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs); + V(rstore_mutex); + ASSERT(jcr->rstore->NumConcurrentReadJobs >= 0); + ASSERT(jcr->rstore->NumConcurrentJobs >= 0); + } +} + +static void dec_write_store(JCR *jcr) +{ + if (jcr->wstore) { + jcr->wstore->NumConcurrentJobs--; + Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs); + ASSERT(jcr->wstore->NumConcurrentJobs >= 0); + } +}