--- /dev/null
+
+ This patch corrects a problem where the maximum concurrent storage
+ jobs counter gets out of sync during restore jobs, causing jobs to
+ "wait on max Storage jobs". This patch fixes bug #1009.
+
+ Apply this patch to version 2.2.6 (and probably any other 2.2.x
+ version) using the following commands:
+
+ cd <bacula-source>
+ patch -p0 <2.2.6-maxconcurrentjobs.patch
+ ./configure <your-options>
+ make
+ ...
+ make install
+
+
+Index: src/dird/jobq.c
+===================================================================
+--- src/dird/jobq.c (revision 6019)
++++ src/dird/jobq.c (working copy)
+@@ -1,23 +1,4 @@
+ /*
+- * Bacula job queue routines.
+- *
+- * This code consists of three queues, the waiting_jobs
+- * queue, where jobs are initially queued, the ready_jobs
+- * queue, where jobs are placed when all the resources are
+- * allocated and they can immediately be run, and the
+- * running queue where jobs are placed when they are
+- * running.
+- *
+- * Kern Sibbald, July MMIII
+- *
+- * Version $Id$
+- *
+- * This code was adapted from the Bacula workq, which was
+- * adapted from "Programming with POSIX Threads", by
+- * David R. Butenhof
+- *
+- */
+-/*
+ Bacula® - The Network Backup Solution
+
+ Copyright (C) 2003-2007 Free Software Foundation Europe e.V.
+@@ -44,6 +25,25 @@
+ (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
+ Switzerland, email:ftf@fsfeurope.org.
+ */
++/*
++ * Bacula job queue routines.
++ *
++ * This code consists of three queues, the waiting_jobs
++ * queue, where jobs are initially queued, the ready_jobs
++ * queue, where jobs are placed when all the resources are
++ * allocated and they can immediately be run, and the
++ * running queue where jobs are placed when they are
++ * running.
++ *
++ * Kern Sibbald, July MMIII
++ *
++ * Version $Id$
++ *
++ * This code was adapted from the Bacula workq, which was
++ * adapted from "Programming with POSIX Threads", by
++ * David R. Butenhof
++ *
++ */
+
+ #include "bacula.h"
+ #include "dird.h"
+@@ -453,6 +453,7 @@
+ }
+ }
+ jq->running_jobs->append(je);
++// set_jcr_in_tsd(jcr);
+ Dmsg1(2300, "Took jobid=%d from ready and appended to run\n", jcr->JobId);
+
+ /* Release job queue lock */
+@@ -682,14 +683,13 @@
+ jcr->acquired_resource_locks = false;
+ if (jcr->rstore) {
+ Dmsg1(200, "Rstore=%s\n", jcr->rstore->name());
+- if (jcr->rstore->NumConcurrentJobs == 0 &&
+- jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) {
+- /* Simple case, first job */
++ /*
++ * Let only one Restore/Verify job run at a time regardless
+ * of MaxConcurrentJobs.
++ */
++ if (jcr->rstore->NumConcurrentJobs == 0) {
+ jcr->rstore->NumConcurrentJobs = 1;
+ Dmsg0(200, "Set rncj=1\n");
+- } else if (jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) {
+- jcr->rstore->NumConcurrentJobs++;
+- Dmsg1(200, "Inc rncj=%d\n", jcr->rstore->NumConcurrentJobs);
+ } else {
+ Dmsg1(200, "Fail rncj=%d\n", jcr->rstore->NumConcurrentJobs);
+ set_jcr_job_status(jcr, JS_WaitStoreRes);
+@@ -700,7 +700,7 @@
+ if (jcr->wstore) {
+ Dmsg1(200, "Wstore=%s\n", jcr->wstore->name());
+ if (jcr->rstore == jcr->wstore) { /* deadlock */
+- jcr->rstore->NumConcurrentJobs--; /* back out rstore */
++ jcr->rstore->NumConcurrentJobs = 0; /* back out rstore */
+ Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n"
+ " Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"),
+ jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source);
+@@ -716,7 +716,7 @@
+ jcr->wstore->NumConcurrentJobs++;
+ Dmsg1(200, "Inc wncj=%d\n", jcr->wstore->NumConcurrentJobs);
+ } else if (jcr->rstore) {
+- jcr->rstore->NumConcurrentJobs--; /* back out rstore */
++ jcr->rstore->NumConcurrentJobs = 0; /* back out rstore */
+ Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs);
+ skip_this_jcr = true;
+ } else {
+@@ -738,7 +738,7 @@
+ Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
+ }
+ if (jcr->rstore) {
+- jcr->rstore->NumConcurrentJobs--;
++ jcr->rstore->NumConcurrentJobs = 0;
+ Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
+ }
+ set_jcr_job_status(jcr, JS_WaitClientRes);
+@@ -753,7 +753,7 @@
+ Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
+ }
+ if (jcr->rstore) {
+- jcr->rstore->NumConcurrentJobs--;
++ jcr->rstore->NumConcurrentJobs = 0;
+ Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
+ }
+ jcr->client->NumConcurrentJobs--;
jcr->acquired_resource_locks = false;
if (jcr->rstore) {
Dmsg1(200, "Rstore=%s\n", jcr->rstore->name());
- if (jcr->rstore->NumConcurrentJobs == 0 &&
- jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) {
- /* Simple case, first job */
+ /*
+ * Let only one Restore/Verify job run at a time regardless
+ * of MaxConcurrentJobs.
+ */
+ if (jcr->rstore->NumConcurrentJobs == 0) {
jcr->rstore->NumConcurrentJobs = 1;
Dmsg0(200, "Set rncj=1\n");
- } else if (jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) {
- jcr->rstore->NumConcurrentJobs++;
- Dmsg1(200, "Inc rncj=%d\n", jcr->rstore->NumConcurrentJobs);
} else {
Dmsg1(200, "Fail rncj=%d\n", jcr->rstore->NumConcurrentJobs);
set_jcr_job_status(jcr, JS_WaitStoreRes);
if (jcr->wstore) {
Dmsg1(200, "Wstore=%s\n", jcr->wstore->name());
if (jcr->rstore == jcr->wstore) { /* deadlock */
- jcr->rstore->NumConcurrentJobs--; /* back out rstore */
+ jcr->rstore->NumConcurrentJobs = 0; /* back out rstore */
Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n"
" Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"),
jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source);
jcr->wstore->NumConcurrentJobs++;
Dmsg1(200, "Inc wncj=%d\n", jcr->wstore->NumConcurrentJobs);
} else if (jcr->rstore) {
- jcr->rstore->NumConcurrentJobs--; /* back out rstore */
+ jcr->rstore->NumConcurrentJobs = 0; /* back out rstore */
Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs);
skip_this_jcr = true;
} else {
Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
}
if (jcr->rstore) {
- jcr->rstore->NumConcurrentJobs--;
+ jcr->rstore->NumConcurrentJobs = 0;
Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
}
set_jcr_job_status(jcr, JS_WaitClientRes);
Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
}
if (jcr->rstore) {
- jcr->rstore->NumConcurrentJobs--;
+ jcr->rstore->NumConcurrentJobs = 0;
Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
}
jcr->client->NumConcurrentJobs--;