git.sur5r.net Git - bacula/bacula/commitdiff
Correct a problem where the maximum concurrent storage jobs counter gets out of sync...
author: Kern Sibbald <kern@sibbald.com>
Mon, 10 Dec 2007 17:55:40 +0000 (17:55 +0000)
committer: Kern Sibbald <kern@sibbald.com>
Mon, 10 Dec 2007 17:55:40 +0000 (17:55 +0000)
git-svn-id: https://bacula.svn.sourceforge.net/svnroot/bacula/trunk@6035 91ce42f0-d328-0410-95d8-f526ca767f89

bacula/examples/nagios/nagios_plugin_check_bacula.tgz
bacula/patches/2.2.6-maxconcurrentjobs.patch [new file with mode: 0644]
bacula/src/cats/create_postgresql_database.in
bacula/src/dird/jobq.c
bacula/src/stored/reserve.c
bacula/technotes-2.3

index 229dad743ba5d7fc31220b1643abc7775cd33988..e800236da57ef9903ea7214d1b806e0caf151fc3 100644 (file)
Binary files a/bacula/examples/nagios/nagios_plugin_check_bacula.tgz and b/bacula/examples/nagios/nagios_plugin_check_bacula.tgz differ
diff --git a/bacula/patches/2.2.6-maxconcurrentjobs.patch b/bacula/patches/2.2.6-maxconcurrentjobs.patch
new file mode 100644 (file)
index 0000000..fa8f04a
--- /dev/null
@@ -0,0 +1,134 @@
+
+ This patch corrects a problem where the maximum concurrent storage
+ jobs counter gets out of sync during restore jobs causing jobs to
+ "wait on max Storage jobs".  This patch fixes bug #1009.
+
+ Apply this patch to 2.2.6 and probably any 2.2.x version with the
+ following:
+
+ cd <bacula-source>
+ patch -p0 <2.2.6-maxconcurrentjobs.patch
+ ./configure <your-options>
+ make
+ ...
+ make install
+
+
+Index: src/dird/jobq.c
+===================================================================
+--- src/dird/jobq.c    (revision 6019)
++++ src/dird/jobq.c    (working copy)
+@@ -1,23 +1,4 @@
+ /*
+- * Bacula job queue routines.
+- *
+- *  This code consists of three queues, the waiting_jobs
+- *  queue, where jobs are initially queued, the ready_jobs
+- *  queue, where jobs are placed when all the resources are
+- *  allocated and they can immediately be run, and the
+- *  running queue where jobs are placed when they are
+- *  running.
+- *
+- *  Kern Sibbald, July MMIII
+- *
+- *   Version $Id$
+- *
+- *  This code was adapted from the Bacula workq, which was
+- *    adapted from "Programming with POSIX Threads", by
+- *    David R. Butenhof
+- *
+- */
+-/*
+    Bacula® - The Network Backup Solution
+    Copyright (C) 2003-2007 Free Software Foundation Europe e.V.
+@@ -44,6 +25,25 @@
+    (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
+    Switzerland, email:ftf@fsfeurope.org.
+ */
++/*
++ * Bacula job queue routines.
++ *
++ *  This code consists of three queues, the waiting_jobs
++ *  queue, where jobs are initially queued, the ready_jobs
++ *  queue, where jobs are placed when all the resources are
++ *  allocated and they can immediately be run, and the
++ *  running queue where jobs are placed when they are
++ *  running.
++ *
++ *  Kern Sibbald, July MMIII
++ *
++ *   Version $Id$
++ *
++ *  This code was adapted from the Bacula workq, which was
++ *    adapted from "Programming with POSIX Threads", by
++ *    David R. Butenhof
++ *
++ */
+ #include "bacula.h"
+ #include "dird.h"
+@@ -453,6 +453,7 @@
+             }
+          }
+          jq->running_jobs->append(je);
++//       set_jcr_in_tsd(jcr);
+          Dmsg1(2300, "Took jobid=%d from ready and appended to run\n", jcr->JobId);
+          /* Release job queue lock */
+@@ -682,14 +683,13 @@
+    jcr->acquired_resource_locks = false;
+    if (jcr->rstore) {
+       Dmsg1(200, "Rstore=%s\n", jcr->rstore->name());
+-      if (jcr->rstore->NumConcurrentJobs == 0 &&
+-          jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) {
+-         /* Simple case, first job */
++      /*
++       * Let only one Restore/Verify job run at a time regardless
++       *  of MaxConcurrentjobs.
++       */
++      if (jcr->rstore->NumConcurrentJobs == 0) {
+          jcr->rstore->NumConcurrentJobs = 1;
+          Dmsg0(200, "Set rncj=1\n");
+-      } else if (jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) {
+-         jcr->rstore->NumConcurrentJobs++;
+-         Dmsg1(200, "Inc rncj=%d\n", jcr->rstore->NumConcurrentJobs);
+       } else {
+          Dmsg1(200, "Fail rncj=%d\n", jcr->rstore->NumConcurrentJobs);
+          set_jcr_job_status(jcr, JS_WaitStoreRes);
+@@ -700,7 +700,7 @@
+    if (jcr->wstore) {
+       Dmsg1(200, "Wstore=%s\n", jcr->wstore->name());
+       if (jcr->rstore == jcr->wstore) {           /* deadlock */
+-         jcr->rstore->NumConcurrentJobs--;        /* back out rstore */
++         jcr->rstore->NumConcurrentJobs = 0;      /* back out rstore */
+          Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n"
+             "    Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"), 
+             jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source);
+@@ -716,7 +716,7 @@
+          jcr->wstore->NumConcurrentJobs++;
+          Dmsg1(200, "Inc wncj=%d\n", jcr->wstore->NumConcurrentJobs);
+       } else if (jcr->rstore) {
+-         jcr->rstore->NumConcurrentJobs--;        /* back out rstore */
++         jcr->rstore->NumConcurrentJobs = 0;      /* back out rstore */
+          Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs);
+          skip_this_jcr = true;
+       } else {
+@@ -738,7 +738,7 @@
+          Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
+       }
+       if (jcr->rstore) {
+-         jcr->rstore->NumConcurrentJobs--;  
++         jcr->rstore->NumConcurrentJobs = 0;
+          Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
+       }
+       set_jcr_job_status(jcr, JS_WaitClientRes);
+@@ -753,7 +753,7 @@
+          Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
+       }
+       if (jcr->rstore) {
+-         jcr->rstore->NumConcurrentJobs--;
++         jcr->rstore->NumConcurrentJobs = 0;
+          Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
+       }
+       jcr->client->NumConcurrentJobs--;
index 5b82375c887c867db7524fc5576be9d1186fae37..0debbc67efb6aa4dc1264fd6a25bbff85307b694 100644 (file)
@@ -9,9 +9,15 @@ db_name=@db_name@
 # use SQL_ASCII to be able to put any filename into
 #  the database even those created with unusual character sets
 ENCODING="ENCODING 'SQL_ASCII'"
 # use UTF8 if you are using standard Unix/Linux LANG specifications
 #  that use UTF8 -- this is normally the default and *should* be
-#  your standard.  Bacula consoles work correctly *only* with UTF8.
+#  your standard.  Bacula works correctly *only* with correct UTF8.
+#
+#  Note, with this encoding, if you have any "weird" filenames on
+#  your system (names generated from Win32 or Mac OS), you may
+#  get Bacula batch insert failures.
+#
 #ENCODING="ENCODING 'UTF8'"
      
 
index f3a9e794bf962740167468abb69227e7102f9a7d..c4a4c8cb8b53b922c6a3f87ea20759d5422a373c 100644 (file)
@@ -683,14 +683,13 @@ static bool acquire_resources(JCR *jcr)
    jcr->acquired_resource_locks = false;
    if (jcr->rstore) {
       Dmsg1(200, "Rstore=%s\n", jcr->rstore->name());
-      if (jcr->rstore->NumConcurrentJobs == 0 &&
-          jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) {
-         /* Simple case, first job */
+      /*
+       * Let only one Restore/Verify job run at a time regardless
+       *  of MaxConcurrentjobs.
+       */
+      if (jcr->rstore->NumConcurrentJobs == 0) {
          jcr->rstore->NumConcurrentJobs = 1;
          Dmsg0(200, "Set rncj=1\n");
-      } else if (jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) {
-         jcr->rstore->NumConcurrentJobs++;
-         Dmsg1(200, "Inc rncj=%d\n", jcr->rstore->NumConcurrentJobs);
       } else {
          Dmsg1(200, "Fail rncj=%d\n", jcr->rstore->NumConcurrentJobs);
          set_jcr_job_status(jcr, JS_WaitStoreRes);
@@ -701,7 +700,7 @@ static bool acquire_resources(JCR *jcr)
    if (jcr->wstore) {
       Dmsg1(200, "Wstore=%s\n", jcr->wstore->name());
       if (jcr->rstore == jcr->wstore) {           /* deadlock */
-         jcr->rstore->NumConcurrentJobs--;        /* back out rstore */
+         jcr->rstore->NumConcurrentJobs = 0;      /* back out rstore */
          Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n"
             "    Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"), 
             jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source);
@@ -717,7 +716,7 @@ static bool acquire_resources(JCR *jcr)
          jcr->wstore->NumConcurrentJobs++;
          Dmsg1(200, "Inc wncj=%d\n", jcr->wstore->NumConcurrentJobs);
       } else if (jcr->rstore) {
-         jcr->rstore->NumConcurrentJobs--;        /* back out rstore */
+         jcr->rstore->NumConcurrentJobs = 0;      /* back out rstore */
          Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs);
          skip_this_jcr = true;
       } else {
@@ -739,7 +738,7 @@ static bool acquire_resources(JCR *jcr)
          Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
       }
       if (jcr->rstore) {
-         jcr->rstore->NumConcurrentJobs--;  
+         jcr->rstore->NumConcurrentJobs = 0;
          Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
       }
       set_jcr_job_status(jcr, JS_WaitClientRes);
@@ -754,7 +753,7 @@ static bool acquire_resources(JCR *jcr)
          Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
       }
       if (jcr->rstore) {
-         jcr->rstore->NumConcurrentJobs--;
+         jcr->rstore->NumConcurrentJobs = 0;
          Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
       }
       jcr->client->NumConcurrentJobs--;
index b98a656d994f7aa90758d976e8082db2ed0f934c..4e44a524f0ce337f9683af234982af9034d3b4cc 100644 (file)
@@ -331,9 +331,9 @@ VOLRES *reserve_volume(DCR *dcr, const char *VolumeName)
          goto get_out;                  /* Volume already on this device */
       } else {
          Dmsg2(dbglvl, "reserve_vol free vol=%s at %p\n", vol->vol_name, vol->vol_name);
-         debug_list_volumes("reserve_vol free");
          vol_list->remove(vol);
          free_vol_item(vol);
+         debug_list_volumes("reserve_vol free");
       }
    }
 
@@ -453,7 +453,7 @@ bool volume_unused(DCR *dcr)
    }
 
    if (dev->is_busy()) {
-      Dmsg1(dbglvl, "vol_unused: no vol on %s\n", dev->print_name());
+      Dmsg1(dbglvl, "vol_unused: busy on %s\n", dev->print_name());
       debug_list_volumes("dev busy cannot unreserve_volume");
       return false;
    }
index 48b273800bc6de69a237fa2fa5d764de29896332..86a6c54da1d846c72bb4711e73652ff798dc5275 100644 (file)
@@ -1,6 +1,10 @@
               Technical notes on version 2.3
 
 General:
+10Dec07
+kes  This patch corrects a problem where the maximum concurrent storage
+     jobs counter gets out of sync during restore jobs causing jobs to
+     'wait on max Storage jobs'.  This patch fixes bug #1009.
 03Dec07
 kes  This patch fixes bcopy so that it produces correct Volumes.
      It fixes bug #1022.
@@ -11,7 +15,7 @@ kes  This patch prevents the 'status dir' command from trying to use a scratch
      volume and possibly moving it from one pool to another.  This patch fixes
      bug #1019.
 01Dec07
-kes  Add new include to cats/postgresql.c suggested by Marc Cousins so
+kes  Add new include to postgresql.c suggested by Marc Cousins so
      that it compiles correctly with pgre version 8.3.
 30Nov07
 kes  Fix --archivedir addition to configure. Replace it with