2 This patch fixes bug #947, in which a large number of emails were generated
3 when the heartbeat interval was small and the tape in the drive was not
4 the one wanted by Bacula.
6 Apply the patch to version 2.2.5 (and probably any 2.2.x version) with:
9 ./configure <your options>
10 patch -p0 <2.2.5-hb.patch
16 Index: src/stored/wait.c
17 ===================================================================
18 --- src/stored/wait.c (revision 5814)
19 +++ src/stored/wait.c (working copy)
21 #include "bacula.h" /* pull in global headers */
22 #include "stored.h" /* pull in Storage Deamon headers */
24 -//static bool double_jcr_wait_time(JCR *jcr);
25 +const int dbglvl = 400;
29 * Wait for SysOp to mount a tape on a specific device
35 - Dmsg1(100, "Enter blocked=%s\n", dev->print_blocked());
36 + Dmsg1(dbglvl, "Enter blocked=%s\n", dev->print_blocked());
37 unmounted = is_device_unmounted(dev);
44 - Dmsg1(400, "blocked=%s\n", dev->print_blocked());
45 + Dmsg1(dbglvl, "blocked=%s\n", dev->print_blocked());
46 dev->dev_prev_blocked = dev->blocked();
47 dev->set_blocked(BST_WAITING_FOR_SYSOP); /* indicate waiting for mount */
50 for ( ; !job_canceled(jcr); ) {
52 + time_t now, start, total_waited;
54 gettimeofday(&tv, &tz);
55 timeout.tv_nsec = tv.tv_usec * 1000;
56 timeout.tv_sec = tv.tv_sec + add_wait;
58 - Dmsg4(400, "I'm going to sleep on device %s. HB=%d wait=%d add_wait=%d\n",
59 - dev->print_name(), (int)me->heartbeat_interval, dev->wait_sec, add_wait);
60 + Dmsg4(dbglvl, "I'm going to sleep on device %s. HB=%d rem_wait=%d add_wait=%d\n",
61 + dev->print_name(), (int)me->heartbeat_interval, dev->rem_wait_sec, add_wait);
63 /* Wait required time */
64 stat = pthread_cond_timedwait(&dev->wait_next_vol, &dev->m_mutex, &timeout);
65 - Dmsg2(400, "Wokeup from sleep on device stat=%d blocked=%s\n", stat,
66 + Dmsg2(dbglvl, "Wokeup from sleep on device stat=%d blocked=%s\n", stat,
67 dev->print_blocked());
70 + total_waited = now - first_start;
71 dev->rem_wait_sec -= (now - start);
73 /* Note, this always triggers the first time. We want that. */
76 if (jcr->file_bsock) {
77 jcr->file_bsock->signal(BNET_HEARTBEAT);
78 - Dmsg0(400, "Send heartbeat to FD.\n");
79 + Dmsg0(dbglvl, "Send heartbeat to FD.\n");
82 jcr->dir_bsock->signal(BNET_HEARTBEAT);
86 if (dev->rem_wait_sec <= 0) { /* on exceeding wait time return */
87 - Dmsg0(400, "Exceed wait time.\n");
88 + Dmsg0(dbglvl, "Exceed wait time.\n");
93 unmounted = is_device_unmounted(dev);
95 if (!unmounted && dev->vol_poll_interval &&
96 - (now - first_start >= dev->vol_poll_interval)) {
97 - Dmsg1(400, "In wait blocked=%s\n", dev->print_blocked());
98 + (total_waited >= dev->vol_poll_interval)) {
99 + Dmsg1(dbglvl, "poll return in wait blocked=%s\n", dev->print_blocked());
100 dev->poll = true; /* returning a poll event */
104 * Check if user mounted the device while we were waiting
106 if (dev->blocked() == BST_MOUNT) { /* mount request ? */
107 + Dmsg0(dbglvl, "Mounted return.\n");
111 @@ -160,30 +161,39 @@
112 * If we did not timeout, then some event happened, so
113 * return to check if state changed.
116 + if (stat != ETIMEDOUT) {
118 + Dmsg2(dbglvl, "Wake return. stat=%d. ERR=%s\n", stat, be.bstrerror(stat));
119 stat = W_WAKE; /* someone woke us */
124 * At this point, we know we woke up because of a timeout,
125 - * that was due to a heartbeat, so we just update
126 - * the wait counters and continue.
127 + * that was due to a heartbeat, because any other reason would
128 + * have caused us to return, so update the wait counters and continue.
130 - add_wait = dev->wait_sec - (now - start);
131 + add_wait = dev->rem_wait_sec;
132 + if (me->heartbeat_interval && add_wait > me->heartbeat_interval) {
133 + add_wait = me->heartbeat_interval;
135 + /* If the user did not unmount the tape and we are polling, ensure
136 + * that we poll at the correct interval.
138 + if (!unmounted && dev->vol_poll_interval &&
139 + add_wait > dev->vol_poll_interval - total_waited) {
140 + add_wait = dev->vol_poll_interval - total_waited;
145 - if (me->heartbeat_interval && add_wait > me->heartbeat_interval) {
146 - add_wait = me->heartbeat_interval;
151 dev->set_blocked(dev->dev_prev_blocked); /* restore entry state */
152 - Dmsg1(400, "set %s\n", dev->print_blocked());
153 + Dmsg1(dbglvl, "set %s\n", dev->print_blocked());
155 - Dmsg1(400, "Exit blocked=%s\n", dev->print_blocked());
156 + Dmsg1(dbglvl, "Exit blocked=%s\n", dev->print_blocked());
161 const int max_wait_time = 1 * 60; /* wait 1 minute */
164 - Dmsg0(100, "Enter wait_for_device\n");
165 + Dmsg0(dbglvl, "Enter wait_for_device\n");
166 P(device_release_mutex);
168 if (++retries % 5 == 0) {
169 @@ -222,14 +232,14 @@
170 timeout.tv_nsec = tv.tv_usec * 1000;
171 timeout.tv_sec = tv.tv_sec + max_wait_time;
173 - Dmsg1(100, "JobId=%u going to wait for a device.\n", (uint32_t)jcr->JobId);
174 + Dmsg0(dbglvl, "Going to wait for a device.\n");
176 /* Wait required time */
177 stat = pthread_cond_timedwait(&wait_device_release, &device_release_mutex, &timeout);
178 - Dmsg2(100, "JobId=%u wokeup from sleep on device stat=%d\n", (uint32_t)jcr->JobId, stat);
179 + Dmsg1(dbglvl, "Wokeup from sleep on device stat=%d\n", stat);
181 V(device_release_mutex);
182 - Dmsg2(100, "JobId=%u return from wait_device ok=%d\n", (uint32_t)jcr->JobId, ok);
183 + Dmsg1(dbglvl, "Return from wait_device ok=%d\n", ok);