From: Kern Sibbald Date: Mon, 10 Dec 2007 17:55:40 +0000 (+0000) Subject: Correct a problem where the maximum concurrent storage jobs counter gets out of sync... X-Git-Tag: Release-3.0.0~2169 X-Git-Url: https://git.sur5r.net/?a=commitdiff_plain;h=c641313a8a089b0eb390843ab0ac6ea6c7352c0a;hp=c9a7d320e5008fb45b933ef9c1abacabdec0dad2;p=bacula%2Fbacula Correct a problem where the maximum concurrent storage jobs counter gets out of sync during restore jobs causing jobs to 'wait on max Storage jobs'. Fixes bug #1009. git-svn-id: https://bacula.svn.sourceforge.net/svnroot/bacula/trunk@6035 91ce42f0-d328-0410-95d8-f526ca767f89 --- diff --git a/bacula/examples/nagios/nagios_plugin_check_bacula.tgz b/bacula/examples/nagios/nagios_plugin_check_bacula.tgz index 229dad743b..e800236da5 100644 Binary files a/bacula/examples/nagios/nagios_plugin_check_bacula.tgz and b/bacula/examples/nagios/nagios_plugin_check_bacula.tgz differ diff --git a/bacula/patches/2.2.6-maxconcurrentjobs.patch b/bacula/patches/2.2.6-maxconcurrentjobs.patch new file mode 100644 index 0000000000..fa8f04a1bb --- /dev/null +++ b/bacula/patches/2.2.6-maxconcurrentjobs.patch @@ -0,0 +1,134 @@ + + This patch corrects a problem where the maximum concurrent storage + jobs counter gets out of sync during restore jobs causing jobs to + "wait on max Storage jobs". This patch fixes bug #1009. + + Apply this patch to 2.2.6 and probably any 2.2.x version with the + following: + + cd <bacula-source> + patch -p0 <2.2.6-maxconcurrentjobs.patch + ./configure <your-options> + make + ... + make install + + +Index: src/dird/jobq.c +=================================================================== +--- src/dird/jobq.c (revision 6019) ++++ src/dird/jobq.c (working copy) +@@ -1,23 +1,4 @@ + /* +- * Bacula job queue routines. 
+- * +- * This code consists of three queues, the waiting_jobs +- * queue, where jobs are initially queued, the ready_jobs +- * queue, where jobs are placed when all the resources are +- * allocated and they can immediately be run, and the +- * running queue where jobs are placed when they are +- * running. +- * +- * Kern Sibbald, July MMIII +- * +- * Version $Id$ +- * +- * This code was adapted from the Bacula workq, which was +- * adapted from "Programming with POSIX Threads", by +- * David R. Butenhof +- * +- */ +-/* + Bacula® - The Network Backup Solution + + Copyright (C) 2003-2007 Free Software Foundation Europe e.V. +@@ -44,6 +25,25 @@ + (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich, + Switzerland, email:ftf@fsfeurope.org. + */ ++/* ++ * Bacula job queue routines. ++ * ++ * This code consists of three queues, the waiting_jobs ++ * queue, where jobs are initially queued, the ready_jobs ++ * queue, where jobs are placed when all the resources are ++ * allocated and they can immediately be run, and the ++ * running queue where jobs are placed when they are ++ * running. ++ * ++ * Kern Sibbald, July MMIII ++ * ++ * Version $Id$ ++ * ++ * This code was adapted from the Bacula workq, which was ++ * adapted from "Programming with POSIX Threads", by ++ * David R. Butenhof ++ * ++ */ + + #include "bacula.h" + #include "dird.h" +@@ -453,6 +453,7 @@ + } + } + jq->running_jobs->append(je); ++// set_jcr_in_tsd(jcr); + Dmsg1(2300, "Took jobid=%d from ready and appended to run\n", jcr->JobId); + + /* Release job queue lock */ +@@ -682,14 +683,13 @@ + jcr->acquired_resource_locks = false; + if (jcr->rstore) { + Dmsg1(200, "Rstore=%s\n", jcr->rstore->name()); +- if (jcr->rstore->NumConcurrentJobs == 0 && +- jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) { +- /* Simple case, first job */ ++ /* ++ * Let only one Restore/Verify job run at a time regardless ++ * of MaxConcurrentjobs. 
++ */ ++ if (jcr->rstore->NumConcurrentJobs == 0) { + jcr->rstore->NumConcurrentJobs = 1; + Dmsg0(200, "Set rncj=1\n"); +- } else if (jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) { +- jcr->rstore->NumConcurrentJobs++; +- Dmsg1(200, "Inc rncj=%d\n", jcr->rstore->NumConcurrentJobs); + } else { + Dmsg1(200, "Fail rncj=%d\n", jcr->rstore->NumConcurrentJobs); + set_jcr_job_status(jcr, JS_WaitStoreRes); +@@ -700,7 +700,7 @@ + if (jcr->wstore) { + Dmsg1(200, "Wstore=%s\n", jcr->wstore->name()); + if (jcr->rstore == jcr->wstore) { /* deadlock */ +- jcr->rstore->NumConcurrentJobs--; /* back out rstore */ ++ jcr->rstore->NumConcurrentJobs = 0; /* back out rstore */ + Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n" + " Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"), + jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source); +@@ -716,7 +716,7 @@ + jcr->wstore->NumConcurrentJobs++; + Dmsg1(200, "Inc wncj=%d\n", jcr->wstore->NumConcurrentJobs); + } else if (jcr->rstore) { +- jcr->rstore->NumConcurrentJobs--; /* back out rstore */ ++ jcr->rstore->NumConcurrentJobs = 0; /* back out rstore */ + Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs); + skip_this_jcr = true; + } else { +@@ -738,7 +738,7 @@ + Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs); + } + if (jcr->rstore) { +- jcr->rstore->NumConcurrentJobs--; ++ jcr->rstore->NumConcurrentJobs = 0; + Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs); + } + set_jcr_job_status(jcr, JS_WaitClientRes); +@@ -753,7 +753,7 @@ + Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs); + } + if (jcr->rstore) { +- jcr->rstore->NumConcurrentJobs--; ++ jcr->rstore->NumConcurrentJobs = 0; + Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs); + } + jcr->client->NumConcurrentJobs--; diff --git a/bacula/src/cats/create_postgresql_database.in b/bacula/src/cats/create_postgresql_database.in index 
5b82375c88..0debbc67ef 100644 --- a/bacula/src/cats/create_postgresql_database.in +++ b/bacula/src/cats/create_postgresql_database.in @@ -9,9 +9,15 @@ db_name=@db_name@ # use SQL_ASCII to be able to put any filename into # the database even those created with unusual character sets ENCODING="ENCODING 'SQL_ASCII'" + # use UTF8 if you are using standard Unix/Linux LANG specifications # that use UTF8 -- this is normally the default and *should* be -# your standard. Bacula consoles work correctly *only* with UTF8. +# your standard. Bacula works correctly *only* with correct UTF8. +# +# Note, with this encoding, if you have any "weird" filenames on +# your system (names generated from Win32 or Mac OS), you may +# get Bacula batch insert failures. +# #ENCODING="ENCODING 'UTF8'" diff --git a/bacula/src/dird/jobq.c b/bacula/src/dird/jobq.c index f3a9e794bf..c4a4c8cb8b 100644 --- a/bacula/src/dird/jobq.c +++ b/bacula/src/dird/jobq.c @@ -683,14 +683,13 @@ static bool acquire_resources(JCR *jcr) jcr->acquired_resource_locks = false; if (jcr->rstore) { Dmsg1(200, "Rstore=%s\n", jcr->rstore->name()); - if (jcr->rstore->NumConcurrentJobs == 0 && - jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) { - /* Simple case, first job */ + /* + * Let only one Restore/Verify job run at a time regardless + * of MaxConcurrentjobs. 
+ */ + if (jcr->rstore->NumConcurrentJobs == 0) { jcr->rstore->NumConcurrentJobs = 1; Dmsg0(200, "Set rncj=1\n"); - } else if (jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) { - jcr->rstore->NumConcurrentJobs++; - Dmsg1(200, "Inc rncj=%d\n", jcr->rstore->NumConcurrentJobs); } else { Dmsg1(200, "Fail rncj=%d\n", jcr->rstore->NumConcurrentJobs); set_jcr_job_status(jcr, JS_WaitStoreRes); @@ -701,7 +700,7 @@ static bool acquire_resources(JCR *jcr) if (jcr->wstore) { Dmsg1(200, "Wstore=%s\n", jcr->wstore->name()); if (jcr->rstore == jcr->wstore) { /* deadlock */ - jcr->rstore->NumConcurrentJobs--; /* back out rstore */ + jcr->rstore->NumConcurrentJobs = 0; /* back out rstore */ Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n" " Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"), jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source); @@ -717,7 +716,7 @@ static bool acquire_resources(JCR *jcr) jcr->wstore->NumConcurrentJobs++; Dmsg1(200, "Inc wncj=%d\n", jcr->wstore->NumConcurrentJobs); } else if (jcr->rstore) { - jcr->rstore->NumConcurrentJobs--; /* back out rstore */ + jcr->rstore->NumConcurrentJobs = 0; /* back out rstore */ Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs); skip_this_jcr = true; } else { @@ -739,7 +738,7 @@ static bool acquire_resources(JCR *jcr) Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs); } if (jcr->rstore) { - jcr->rstore->NumConcurrentJobs--; + jcr->rstore->NumConcurrentJobs = 0; Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs); } set_jcr_job_status(jcr, JS_WaitClientRes); @@ -754,7 +753,7 @@ static bool acquire_resources(JCR *jcr) Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs); } if (jcr->rstore) { - jcr->rstore->NumConcurrentJobs--; + jcr->rstore->NumConcurrentJobs = 0; Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs); } jcr->client->NumConcurrentJobs--; diff --git 
a/bacula/src/stored/reserve.c b/bacula/src/stored/reserve.c index b98a656d99..4e44a524f0 100644 --- a/bacula/src/stored/reserve.c +++ b/bacula/src/stored/reserve.c @@ -331,9 +331,9 @@ VOLRES *reserve_volume(DCR *dcr, const char *VolumeName) goto get_out; /* Volume already on this device */ } else { Dmsg2(dbglvl, "reserve_vol free vol=%s at %p\n", vol->vol_name, vol->vol_name); - debug_list_volumes("reserve_vol free"); vol_list->remove(vol); free_vol_item(vol); + debug_list_volumes("reserve_vol free"); } } @@ -453,7 +453,7 @@ bool volume_unused(DCR *dcr) } if (dev->is_busy()) { - Dmsg1(dbglvl, "vol_unused: no vol on %s\n", dev->print_name()); + Dmsg1(dbglvl, "vol_unused: busy on %s\n", dev->print_name()); debug_list_volumes("dev busy cannot unreserve_volume"); return false; } diff --git a/bacula/technotes-2.3 b/bacula/technotes-2.3 index 48b273800b..86a6c54da1 100644 --- a/bacula/technotes-2.3 +++ b/bacula/technotes-2.3 @@ -1,6 +1,10 @@ Technical notes on version 2.3 General: +10Dec07 +kes This patch corrects a problem where the maximum concurrent storage + jobs counter gets out of sync during restore jobs causing jobs to + 'wait on max Storage jobs'. This patch fixes bug #1009. 03Dec07 kes This patch fixes bcopy so that it produces correct Volumes. It fixes bug #1022. @@ -11,7 +15,7 @@ kes This patch prevents the 'status dir' command from trying to use a scratch volume and possibly moving it from one pool to another. This patch fixes bug #1019. 01Dec07 -kes Add new include to cats/postgresql.c suggested by Marc Cousins so +kes Add new include to postgresql.c suggested by Marc Cousins so that it compiles correctly with pgre version 8.3. 30Nov07 kes Fix --archivedir addition to configure. Replace it with