From c59e71d0ad55dcc858001f05f4a79271dfd56016 Mon Sep 17 00:00:00 2001 From: Kern Sibbald Date: Sun, 4 Nov 2007 16:35:24 +0000 Subject: [PATCH] kes Fix bug #942 where lots of emails where generated when the heartbeat interval was low and Bacula wanted a different tape from the one in the drive. git-svn-id: https://bacula.svn.sourceforge.net/svnroot/bacula/trunk@5847 91ce42f0-d328-0410-95d8-f526ca767f89 --- bacula/kernstodo | 10 ++ bacula/patches/2.2.5-hb.patch | 186 ++++++++++++++++++++++++++++++++++ bacula/src/stored/wait.c | 60 ++++++----- bacula/technotes-2.3 | 4 + 4 files changed, 235 insertions(+), 25 deletions(-) create mode 100644 bacula/patches/2.2.5-hb.patch diff --git a/bacula/kernstodo b/bacula/kernstodo index 6c2df4124a..67c0e6b3f3 100644 --- a/bacula/kernstodo +++ b/bacula/kernstodo @@ -69,6 +69,16 @@ Professional Needs: - Implement Diff,Inc Retention Periods Priority: +- Auto update of slot: + rufus-dir: ua_run.c:456-10 JobId=10 NewJobId=10 using pool Full priority=10 + 02-Nov 12:58 rufus-dir JobId 10: Start Backup JobId 10, Job=kernsave.2007-11-02_12.58.03 + 02-Nov 12:58 rufus-dir JobId 10: Using Device "DDS-4" + 02-Nov 12:58 rufus-sd JobId 10: Invalid slot=0 defined in catalog for Volume "Vol001" on "DDS-4" (/dev/nst0). Manual load my be required. + 02-Nov 12:58 rufus-sd JobId 10: 3301 Issuing autochanger "loaded? drive 0" command. + 02-Nov 12:58 rufus-sd JobId 10: 3302 Autochanger "loaded? drive 0", result is Slot 2. + 02-Nov 12:58 rufus-sd JobId 10: Wrote label to prelabeled Volume "Vol001" on device "DDS-4" (/dev/nst0) + 02-Nov 12:58 rufus-sd JobId 10: Alert: TapeAlert[7]: Media Life: The tape has reached the end of its useful life. + 02-Nov 12:58 rufus-dir JobId 10: Bacula rufus-dir 2.3.6 (26Oct07): 02-Nov-2007 12:58:51 - Encrypt sd_auth_key = s with director's key = d k[i] = s[i] + (d[i] & 0xF)) & 0xFF + 'A' skip - Decrypt key = k with director's key diff --git a/bacula/patches/2.2.5-hb.patch b/bacula/patches/2.2.5-hb.patch new file mode 100644 index 0000000000..b8d81c879d --- /dev/null +++ b/bacula/patches/2.2.5-hb.patch @@ -0,0 +1,186 @@ + + This patch fixes bug #947 where a large number of emails were generated + because the heartbeat interval was small and the tape in the drive did + not correspond to the one wanted by Bacula. + + Apply the patch to version 2.2.5 (and probably any 2.2.x version) with: + + cd + ./configure + patch -p0 <2.2.5-hb.patch + make + ... + make install + + +Index: src/stored/wait.c +=================================================================== +--- src/stored/wait.c (revision 5814) ++++ src/stored/wait.c (working copy) +@@ -40,9 +40,8 @@ + #include "bacula.h" /* pull in global headers */ + #include "stored.h" /* pull in Storage Deamon headers */ + +-//static bool double_jcr_wait_time(JCR *jcr); ++const int dbglvl = 400; + +- + /* + * Wait for SysOp to mount a tape on a specific device + * +@@ -62,7 +61,7 @@ + JCR *jcr = dcr->jcr; + + dev->dlock(); +- Dmsg1(100, "Enter blocked=%s\n", dev->print_blocked()); ++ Dmsg1(dbglvl, "Enter blocked=%s\n", dev->print_blocked()); + unmounted = is_device_unmounted(dev); + + dev->poll = false; +@@ -84,27 +83,28 @@ + } + + if (!unmounted) { +- Dmsg1(400, "blocked=%s\n", dev->print_blocked()); ++ Dmsg1(dbglvl, "blocked=%s\n", dev->print_blocked()); + dev->dev_prev_blocked = dev->blocked(); + dev->set_blocked(BST_WAITING_FOR_SYSOP); /* indicate waiting for mount */ + } + + for ( ; !job_canceled(jcr); ) { +- time_t now, start; ++ time_t now, start, total_waited; + + gettimeofday(&tv, &tz); + timeout.tv_nsec = tv.tv_usec * 1000; + timeout.tv_sec = tv.tv_sec + add_wait; + +- Dmsg4(400, "I'm going to sleep on device %s. HB=%d wait=%d add_wait=%d\n", +- dev->print_name(), (int)me->heartbeat_interval, dev->wait_sec, add_wait); ++ Dmsg4(dbglvl, "I'm going to sleep on device %s. HB=%d rem_wait=%d add_wait=%d\n", ++ dev->print_name(), (int)me->heartbeat_interval, dev->rem_wait_sec, add_wait); + start = time(NULL); + /* Wait required time */ + stat = pthread_cond_timedwait(&dev->wait_next_vol, &dev->m_mutex, &timeout); +- Dmsg2(400, "Wokeup from sleep on device stat=%d blocked=%s\n", stat, ++ Dmsg2(dbglvl, "Wokeup from sleep on device stat=%d blocked=%s\n", stat, + dev->print_blocked()); + + now = time(NULL); ++ total_waited = now - first_start; + dev->rem_wait_sec -= (now - start); + + /* Note, this always triggers the first time. We want that. */ +@@ -113,7 +113,7 @@ + /* send heartbeats */ + if (jcr->file_bsock) { + jcr->file_bsock->signal(BNET_HEARTBEAT); +- Dmsg0(400, "Send heartbeat to FD.\n"); ++ Dmsg0(dbglvl, "Send heartbeat to FD.\n"); + } + if (jcr->dir_bsock) { + jcr->dir_bsock->signal(BNET_HEARTBEAT); +@@ -131,7 +131,7 @@ + + + if (dev->rem_wait_sec <= 0) { /* on exceeding wait time return */ +- Dmsg0(400, "Exceed wait time.\n"); ++ Dmsg0(dbglvl, "Exceed wait time.\n"); + stat = W_TIMEOUT; + break; + } +@@ -142,8 +142,8 @@ + unmounted = is_device_unmounted(dev); + + if (!unmounted && dev->vol_poll_interval && +- (now - first_start >= dev->vol_poll_interval)) { +- Dmsg1(400, "In wait blocked=%s\n", dev->print_blocked()); ++ (total_waited >= dev->vol_poll_interval)) { ++ Dmsg1(dbglvl, "poll return in wait blocked=%s\n", dev->print_blocked()); + dev->poll = true; /* returning a poll event */ + stat = W_POLL; + break; +@@ -152,6 +152,7 @@ + * Check if user mounted the device while we were waiting + */ + if (dev->blocked() == BST_MOUNT) { /* mount request ? */ ++ Dmsg0(dbglvl, "Mounted return.\n"); + stat = W_MOUNT; + break; + } +@@ -160,30 +161,39 @@ + * If we did not timeout, then some event happened, so + * return to check if state changed. + */ +- if (stat != 0) { ++ if (stat != ETIMEDOUT) { ++ berrno be; ++ Dmsg2(dbglvl, "Wake return. stat=%d. ERR=%s\n", stat, be.bstrerror(stat)); + stat = W_WAKE; /* someone woke us */ + break; + } + + /* + * At this point, we know we woke up because of a timeout, +- * that was due to a heartbeat, so we just update +- * the wait counters and continue. ++ * that was due to a heartbeat, because any other reason would ++ * have caused us to return, so update the wait counters and continue. + */ +- add_wait = dev->wait_sec - (now - start); ++ add_wait = dev->rem_wait_sec; ++ if (me->heartbeat_interval && add_wait > me->heartbeat_interval) { ++ add_wait = me->heartbeat_interval; ++ } ++ /* If the user did not unmount the tape and we are polling, ensure ++ * that we poll at the correct interval. ++ */ ++ if (!unmounted && dev->vol_poll_interval && ++ add_wait > dev->vol_poll_interval - total_waited) { ++ add_wait = dev->vol_poll_interval - total_waited; ++ } + if (add_wait < 0) { + add_wait = 0; + } +- if (me->heartbeat_interval && add_wait > me->heartbeat_interval) { +- add_wait = me->heartbeat_interval; +- } + } + + if (!unmounted) { + dev->set_blocked(dev->dev_prev_blocked); /* restore entry state */ +- Dmsg1(400, "set %s\n", dev->print_blocked()); ++ Dmsg1(dbglvl, "set %s\n", dev->print_blocked()); + } +- Dmsg1(400, "Exit blocked=%s\n", dev->print_blocked()); ++ Dmsg1(dbglvl, "Exit blocked=%s\n", dev->print_blocked()); + dev->dunlock(); + return stat; + } +@@ -209,7 +219,7 @@ + const int max_wait_time = 1 * 60; /* wait 1 minute */ + char ed1[50]; + +- Dmsg0(100, "Enter wait_for_device\n"); ++ Dmsg0(dbglvl, "Enter wait_for_device\n"); + P(device_release_mutex); + + if (++retries % 5 == 0) { +@@ -222,14 +232,14 @@ + timeout.tv_nsec = tv.tv_usec * 1000; + timeout.tv_sec = tv.tv_sec + max_wait_time; + +- Dmsg1(100, "JobId=%u going to wait for a device.\n", (uint32_t)jcr->JobId); ++ Dmsg0(dbglvl, "Going to wait for a device.\n"); + + /* Wait required time */ + stat = pthread_cond_timedwait(&wait_device_release, &device_release_mutex, &timeout); +- Dmsg2(100, "JobId=%u wokeup from sleep on device stat=%d\n", (uint32_t)jcr->JobId, stat); ++ Dmsg1(dbglvl, "Wokeup from sleep on device stat=%d\n", stat); + + V(device_release_mutex); +- Dmsg2(100, "JobId=%u return from wait_device ok=%d\n", (uint32_t)jcr->JobId, ok); ++ Dmsg1(dbglvl, "Return from wait_device ok=%d\n", ok); + return ok; + } + diff --git a/bacula/src/stored/wait.c b/bacula/src/stored/wait.c index f332de3e35..53fa5eba0f 100644 --- a/bacula/src/stored/wait.c +++ b/bacula/src/stored/wait.c @@ -40,8 +40,7 @@ #include "bacula.h" /* pull in global headers */ #include "stored.h" /* pull in Storage Deamon headers */ -//static bool double_jcr_wait_time(JCR *jcr); - +const int dbglvl = 400; /* * Wait for SysOp to mount a tape on a specific device @@ -62,7 +61,7 @@ int wait_for_sysop(DCR *dcr) JCR *jcr = dcr->jcr; dev->dlock(); - Dmsg1(100, "Enter blocked=%s\n", dev->print_blocked()); + Dmsg1(dbglvl, "Enter blocked=%s\n", dev->print_blocked()); unmounted = is_device_unmounted(dev); dev->poll = false; @@ -84,27 +83,28 @@ int wait_for_sysop(DCR *dcr) } if (!unmounted) { - Dmsg1(400, "blocked=%s\n", dev->print_blocked()); + Dmsg1(dbglvl, "blocked=%s\n", dev->print_blocked()); dev->dev_prev_blocked = dev->blocked(); dev->set_blocked(BST_WAITING_FOR_SYSOP); /* indicate waiting for mount */ } for ( ; !job_canceled(jcr); ) { - time_t now, start; + time_t now, start, total_waited; gettimeofday(&tv, &tz); timeout.tv_nsec = tv.tv_usec * 1000; timeout.tv_sec = tv.tv_sec + add_wait; - Dmsg4(400, "I'm going to sleep on device %s. HB=%d wait=%d add_wait=%d\n", - dev->print_name(), (int)me->heartbeat_interval, dev->wait_sec, add_wait); + Dmsg4(dbglvl, "I'm going to sleep on device %s. HB=%d rem_wait=%d add_wait=%d\n", + dev->print_name(), (int)me->heartbeat_interval, dev->rem_wait_sec, add_wait); start = time(NULL); /* Wait required time */ stat = pthread_cond_timedwait(&dev->wait_next_vol, &dev->m_mutex, &timeout); - Dmsg2(400, "Wokeup from sleep on device stat=%d blocked=%s\n", stat, + Dmsg2(dbglvl, "Wokeup from sleep on device stat=%d blocked=%s\n", stat, dev->print_blocked()); now = time(NULL); + total_waited = now - first_start; dev->rem_wait_sec -= (now - start); /* Note, this always triggers the first time. We want that. */ @@ -113,7 +113,7 @@ int wait_for_sysop(DCR *dcr) /* send heartbeats */ if (jcr->file_bsock) { jcr->file_bsock->signal(BNET_HEARTBEAT); - Dmsg0(400, "Send heartbeat to FD.\n"); + Dmsg0(dbglvl, "Send heartbeat to FD.\n"); } if (jcr->dir_bsock) { jcr->dir_bsock->signal(BNET_HEARTBEAT); @@ -131,7 +131,7 @@ int wait_for_sysop(DCR *dcr) if (dev->rem_wait_sec <= 0) { /* on exceeding wait time return */ - Dmsg0(400, "Exceed wait time.\n"); + Dmsg0(dbglvl, "Exceed wait time.\n"); stat = W_TIMEOUT; break; } @@ -142,8 +142,8 @@ int wait_for_sysop(DCR *dcr) unmounted = is_device_unmounted(dev); if (!unmounted && dev->vol_poll_interval && - (now - first_start >= dev->vol_poll_interval)) { - Dmsg1(400, "In wait blocked=%s\n", dev->print_blocked()); + (total_waited >= dev->vol_poll_interval)) { + Dmsg1(dbglvl, "poll return in wait blocked=%s\n", dev->print_blocked()); dev->poll = true; /* returning a poll event */ stat = W_POLL; break; @@ -152,6 +152,7 @@ int wait_for_sysop(DCR *dcr) * Check if user mounted the device while we were waiting */ if (dev->blocked() == BST_MOUNT) { /* mount request ? */ + Dmsg0(dbglvl, "Mounted return.\n"); stat = W_MOUNT; break; } @@ -160,30 +161,39 @@ int wait_for_sysop(DCR *dcr) * If we did not timeout, then some event happened, so * return to check if state changed. */ - if (stat != 0) { + if (stat != ETIMEDOUT) { + berrno be; + Dmsg2(dbglvl, "Wake return. stat=%d. ERR=%s\n", stat, be.bstrerror(stat)); stat = W_WAKE; /* someone woke us */ break; } /* * At this point, we know we woke up because of a timeout, - * that was due to a heartbeat, so we just update - * the wait counters and continue. + * that was due to a heartbeat, because any other reason would + * have caused us to return, so update the wait counters and continue. */ - add_wait = dev->wait_sec - (now - start); - if (add_wait < 0) { - add_wait = 0; - } + add_wait = dev->rem_wait_sec; if (me->heartbeat_interval && add_wait > me->heartbeat_interval) { add_wait = me->heartbeat_interval; } + /* If the user did not unmount the tape and we are polling, ensure + * that we poll at the correct interval. + */ + if (!unmounted && dev->vol_poll_interval && + add_wait > dev->vol_poll_interval - total_waited) { + add_wait = dev->vol_poll_interval - total_waited; + } + if (add_wait < 0) { + add_wait = 0; + } } if (!unmounted) { dev->set_blocked(dev->dev_prev_blocked); /* restore entry state */ - Dmsg1(400, "set %s\n", dev->print_blocked()); + Dmsg1(dbglvl, "set %s\n", dev->print_blocked()); } - Dmsg1(400, "Exit blocked=%s\n", dev->print_blocked()); + Dmsg1(dbglvl, "Exit blocked=%s\n", dev->print_blocked()); dev->dunlock(); return stat; } @@ -209,7 +219,7 @@ bool wait_for_device(JCR *jcr, int &retries) const int max_wait_time = 1 * 60; /* wait 1 minute */ char ed1[50]; - Dmsg0(100, "Enter wait_for_device\n"); + Dmsg0(dbglvl, "Enter wait_for_device\n"); P(device_release_mutex); if (++retries % 5 == 0) { @@ -222,14 +232,14 @@ bool wait_for_device(JCR *jcr, int &retries) timeout.tv_nsec = tv.tv_usec * 1000; timeout.tv_sec = tv.tv_sec + max_wait_time; - Dmsg1(100, "JobId=%u going to wait for a device.\n", (uint32_t)jcr->JobId); + Dmsg0(dbglvl, "Going to wait for a device.\n"); /* Wait required time */ stat = pthread_cond_timedwait(&wait_device_release, &device_release_mutex, &timeout); - Dmsg2(100, "JobId=%u wokeup from sleep on device stat=%d\n", (uint32_t)jcr->JobId, stat); + Dmsg1(dbglvl, "Wokeup from sleep on device stat=%d\n", stat); V(device_release_mutex); - Dmsg2(100, "JobId=%u return from wait_device ok=%d\n", (uint32_t)jcr->JobId, ok); + Dmsg1(dbglvl, "Return from wait_device ok=%d\n", ok); return ok; } diff --git a/bacula/technotes-2.3 b/bacula/technotes-2.3 index d3b78d4fbd..36a43a7703 100644 --- a/bacula/technotes-2.3 +++ b/bacula/technotes-2.3 @@ -1,6 +1,10 @@ Technical notes on version 2.3 General: +02Nov07 +kes Fix bug #942 where lots of emails where generated when the heartbeat + interval was low and Bacula wanted a different tape from the one in + the drive. 25Oct07 ebl Add new SD despooling attributes and Dir inserting attributes job status codes in the catalog (Status table). -- 2.39.5