X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=bacula%2Fsrc%2Fstored%2Fwait.c;h=53fa5eba0f73a7658390238a7cbfca87fd7fa32d;hb=bdcddd32cafd978761406b7ad3ca006597c8f1d7;hp=cd034ef1f5050102fb05eb87a611d125409d4e5b;hpb=ac35fe129014f8b9b2a111b0601aa748c44846db;p=bacula%2Fbacula diff --git a/bacula/src/stored/wait.c b/bacula/src/stored/wait.c index cd034ef1f5..53fa5eba0f 100644 --- a/bacula/src/stored/wait.c +++ b/bacula/src/stored/wait.c @@ -7,8 +7,8 @@ many others, a complete list can be found in the file AUTHORS. This program is Free Software; you can redistribute it and/or modify it under the terms of version two of the GNU General Public - License as published by the Free Software Foundation plus additions - that are listed in the file LICENSE. + License as published by the Free Software Foundation and included + in the file LICENSE. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -40,8 +40,7 @@ #include "bacula.h" /* pull in global headers */ #include "stored.h" /* pull in Storage Deamon headers */ -//static bool double_jcr_wait_time(JCR *jcr); - +const int dbglvl = 400; /* * Wait for SysOp to mount a tape on a specific device @@ -62,7 +61,7 @@ int wait_for_sysop(DCR *dcr) JCR *jcr = dcr->jcr; dev->dlock(); - Dmsg1(100, "Enter blocked=%s\n", dev->print_blocked()); + Dmsg1(dbglvl, "Enter blocked=%s\n", dev->print_blocked()); unmounted = is_device_unmounted(dev); dev->poll = false; @@ -84,27 +83,28 @@ int wait_for_sysop(DCR *dcr) } if (!unmounted) { - Dmsg1(400, "blocked=%s\n", dev->print_blocked()); + Dmsg1(dbglvl, "blocked=%s\n", dev->print_blocked()); dev->dev_prev_blocked = dev->blocked(); dev->set_blocked(BST_WAITING_FOR_SYSOP); /* indicate waiting for mount */ } for ( ; !job_canceled(jcr); ) { - time_t now, start; + time_t now, start, total_waited; gettimeofday(&tv, &tz); timeout.tv_nsec = tv.tv_usec * 1000; timeout.tv_sec = tv.tv_sec + add_wait; - Dmsg4(400, "I'm going to sleep on device %s. HB=%d wait=%d add_wait=%d\n", - dev->print_name(), (int)me->heartbeat_interval, dev->wait_sec, add_wait); + Dmsg4(dbglvl, "I'm going to sleep on device %s. HB=%d rem_wait=%d add_wait=%d\n", + dev->print_name(), (int)me->heartbeat_interval, dev->rem_wait_sec, add_wait); start = time(NULL); /* Wait required time */ stat = pthread_cond_timedwait(&dev->wait_next_vol, &dev->m_mutex, &timeout); - Dmsg2(400, "Wokeup from sleep on device stat=%d blocked=%s\n", stat, + Dmsg2(dbglvl, "Wokeup from sleep on device stat=%d blocked=%s\n", stat, dev->print_blocked()); now = time(NULL); + total_waited = now - first_start; dev->rem_wait_sec -= (now - start); /* Note, this always triggers the first time. We want that. */ @@ -112,11 +112,11 @@ int wait_for_sysop(DCR *dcr) if (now - last_heartbeat >= me->heartbeat_interval) { /* send heartbeats */ if (jcr->file_bsock) { - bnet_sig(jcr->file_bsock, BNET_HEARTBEAT); - Dmsg0(400, "Send heartbeat to FD.\n"); + jcr->file_bsock->signal(BNET_HEARTBEAT); + Dmsg0(dbglvl, "Send heartbeat to FD.\n"); } if (jcr->dir_bsock) { - bnet_sig(jcr->dir_bsock, BNET_HEARTBEAT); + jcr->dir_bsock->signal(BNET_HEARTBEAT); } last_heartbeat = now; } @@ -124,14 +124,14 @@ int wait_for_sysop(DCR *dcr) if (stat == EINVAL) { berrno be; - Jmsg1(jcr, M_FATAL, 0, _("pthread timedwait error. ERR=%s\n"), be.strerror(stat)); + Jmsg1(jcr, M_FATAL, 0, _("pthread timedwait error. ERR=%s\n"), be.bstrerror(stat)); stat = W_ERROR; /* error */ break; } if (dev->rem_wait_sec <= 0) { /* on exceeding wait time return */ - Dmsg0(400, "Exceed wait time.\n"); + Dmsg0(dbglvl, "Exceed wait time.\n"); stat = W_TIMEOUT; break; } @@ -142,8 +142,8 @@ int wait_for_sysop(DCR *dcr) unmounted = is_device_unmounted(dev); if (!unmounted && dev->vol_poll_interval && - (now - first_start >= dev->vol_poll_interval)) { - Dmsg1(400, "In wait blocked=%s\n", dev->print_blocked()); + (total_waited >= dev->vol_poll_interval)) { + Dmsg1(dbglvl, "poll return in wait blocked=%s\n", dev->print_blocked()); dev->poll = true; /* returning a poll event */ stat = W_POLL; break; @@ -152,6 +152,7 @@ int wait_for_sysop(DCR *dcr) * Check if user mounted the device while we were waiting */ if (dev->blocked() == BST_MOUNT) { /* mount request ? */ + Dmsg0(dbglvl, "Mounted return.\n"); stat = W_MOUNT; break; } @@ -160,30 +161,39 @@ int wait_for_sysop(DCR *dcr) * If we did not timeout, then some event happened, so * return to check if state changed. */ - if (stat != 0) { + if (stat != ETIMEDOUT) { + berrno be; + Dmsg2(dbglvl, "Wake return. stat=%d. ERR=%s\n", stat, be.bstrerror(stat)); stat = W_WAKE; /* someone woke us */ break; } /* * At this point, we know we woke up because of a timeout, - * that was due to a heartbeat, so we just update - * the wait counters and continue. + * that was due to a heartbeat, because any other reason would + * have caused us to return, so update the wait counters and continue. */ - add_wait = dev->wait_sec - (now - start); - if (add_wait < 0) { - add_wait = 0; - } + add_wait = dev->rem_wait_sec; if (me->heartbeat_interval && add_wait > me->heartbeat_interval) { add_wait = me->heartbeat_interval; } + /* If the user did not unmount the tape and we are polling, ensure + * that we poll at the correct interval. + */ + if (!unmounted && dev->vol_poll_interval && + add_wait > dev->vol_poll_interval - total_waited) { + add_wait = dev->vol_poll_interval - total_waited; + } + if (add_wait < 0) { + add_wait = 0; + } } if (!unmounted) { dev->set_blocked(dev->dev_prev_blocked); /* restore entry state */ - Dmsg1(400, "set %s\n", dev->print_blocked()); + Dmsg1(dbglvl, "set %s\n", dev->print_blocked()); } - Dmsg1(400, "Exit blocked=%s\n", dev->print_blocked()); + Dmsg1(dbglvl, "Exit blocked=%s\n", dev->print_blocked()); dev->dunlock(); return stat; } @@ -191,40 +201,45 @@ int wait_for_sysop(DCR *dcr) /* * Wait for any device to be released, then we return, so - * higher level code can rescan possible devices. + * higher level code can rescan possible devices. Since there + * could be a job waiting for a drive to free up, we wait a maximum + * of 1 minute then retry just in case a broadcast was lost, and + * we return to rescan the devices. * * Returns: true if a device has changed state * false if the total wait time has expired. */ -bool wait_for_device(JCR *jcr, bool first) +bool wait_for_device(JCR *jcr, int &retries) { struct timeval tv; struct timezone tz; struct timespec timeout; int stat = 0; bool ok = true; - const int wait_time = 5 * 60; /* wait 5 minutes */ + const int max_wait_time = 1 * 60; /* wait 1 minute */ + char ed1[50]; - Dmsg0(100, "Enter wait_for_device\n"); + Dmsg0(dbglvl, "Enter wait_for_device\n"); P(device_release_mutex); - if (first) { - Jmsg(jcr, M_MOUNT, 0, _("Job %s waiting to reserve a device.\n"), jcr->Job); + if (++retries % 5 == 0) { + /* Print message every 5 minutes */ + Jmsg(jcr, M_MOUNT, 0, _("JobId=%s, Job %s waiting to reserve a device.\n"), + edit_uint64(jcr->JobId, ed1), jcr->Job); } gettimeofday(&tv, &tz); timeout.tv_nsec = tv.tv_usec * 1000; - timeout.tv_sec = tv.tv_sec + wait_time; + timeout.tv_sec = tv.tv_sec + max_wait_time; - Dmsg0(100, "I'm going to wait for a device.\n"); + Dmsg0(dbglvl, "Going to wait for a device.\n"); /* Wait required time */ stat = pthread_cond_timedwait(&wait_device_release, &device_release_mutex, &timeout); - Dmsg1(100, "Wokeup from sleep on device stat=%d\n", stat); - + Dmsg1(dbglvl, "Wokeup from sleep on device stat=%d\n", stat); V(device_release_mutex); - Dmsg1(100, "Return from wait_device ok=%d\n", ok); + Dmsg1(dbglvl, "Return from wait_device ok=%d\n", ok); return ok; }