From: Kern Sibbald Date: Wed, 18 May 2005 10:16:33 +0000 (+0000) Subject: - Modify wait during use_device to happen only after all devices X-Git-Tag: Release-1.38.0~439 X-Git-Url: https://git.sur5r.net/?a=commitdiff_plain;h=39bdd3c401f42f9d399a5740a5be03188856c692;p=bacula%2Fbacula - Modify wait during use_device to happen only after all devices have been examined rather than in the reserve_device code. git-svn-id: https://bacula.svn.sourceforge.net/svnroot/bacula/trunk@2061 91ce42f0-d328-0410-95d8-f526ca767f89 --- diff --git a/bacula/kernstodo b/bacula/kernstodo index e7401431b1..16a5945bdf 100644 --- a/bacula/kernstodo +++ b/bacula/kernstodo @@ -53,6 +53,8 @@ Document: - Document that ChangerDevice is used for Alert command. For 1.37: +- Fix 3993 error in SD. It forgets to look at autochanger + resource for device command, ... - --without-openssl breaks at least on Solaris. - Python: - Make a callback when Rerun failed levels is called. diff --git a/bacula/kes-1.37 b/bacula/kes-1.37 index b371bbde63..e8416a2907 100644 --- a/bacula/kes-1.37 +++ b/bacula/kes-1.37 @@ -3,6 +3,11 @@ General: +Changes to 1.37.19: +18May05 +- Modify wait during use_device to happen only after all devices + have been examined rather than in the reserve_device code. + Changes to 1.37.18: 16May05 - Add more debug to SD for Autochangers + status output. diff --git a/bacula/src/stored/acquire.c b/bacula/src/stored/acquire.c index 10cd5ef872..c287b55521 100644 --- a/bacula/src/stored/acquire.c +++ b/bacula/src/stored/acquire.c @@ -133,45 +133,37 @@ void free_dcr(DCR *dcr) * We "reserve" the drive by setting the ST_READ bit. No one else * should touch the drive until that is cleared. * This allows the DIR to "reserve" the device before actually - * starting the job. If the device is not available, the DIR - * can wait (to be implemented 1/05). + * starting the job. 
*/ bool reserve_device_for_read(DCR *dcr) { DEVICE *dev = dcr->dev; JCR *jcr = dcr->jcr; - bool first; + bool ok = false; ASSERT(dcr); - init_device_wait_timers(dcr); - dev->block(BST_DOING_ACQUIRE); - Mmsg(jcr->errmsg, _("Device %s is BLOCKED due to user unmount.\n"), - dev->print_name()); - for (first=true; device_is_unmounted(dev); first=false) { - dev->unblock(); - if (!wait_for_device(dcr, jcr->errmsg, first)) { - return false; - } - dev->block(BST_DOING_ACQUIRE); + if (device_is_unmounted(dev)) { + Mmsg(jcr->errmsg, _("Device %s is BLOCKED due to user unmount.\n"), + dev->print_name()); + goto bail_out; } - Mmsg2(jcr->errmsg, _("Device %s is busy. Job %d canceled.\n"), - dev->print_name(), jcr->JobId); - for (first=true; dev->is_busy(); first=false) { - dev->unblock(); - if (!wait_for_device(dcr, jcr->errmsg, first)) { - return false; - } - dev->block(BST_DOING_ACQUIRE); + if (dev->is_busy()) { + Mmsg1(jcr->errmsg, _("Device %s is busy.\n"), + dev->print_name()); + goto bail_out; } dev->clear_append(); dev->set_read(); + ok = true; + +bail_out: dev->unblock(); - return true; + return ok; } @@ -372,56 +364,28 @@ bool reserve_device_for_append(DCR *dcr) JCR *jcr = dcr->jcr; DEVICE *dev = dcr->dev; bool ok = false; - bool first; ASSERT(dcr); - init_device_wait_timers(dcr); - dev->block(BST_DOING_ACQUIRE); - Mmsg1(jcr->errmsg, _("Device %s is busy reading.\n"), - dev->print_name()); - for (first=true; dev->can_read(); first=false) { - dev->unblock(); - if (!wait_for_device(dcr, jcr->errmsg, first)) { - return false; - } - dev->block(BST_DOING_ACQUIRE); + if (dev->can_read()) { + Mmsg1(jcr->errmsg, _("Device %s is busy reading.\n"), dev->print_name()); + goto bail_out; } - - Mmsg(jcr->errmsg, _("Device %s is BLOCKED due to user unmount.\n"), - dev->print_name()); - for (first=true; device_is_unmounted(dev); first=false) { - dev->unblock(); - if (!wait_for_device(dcr, jcr->errmsg, first)) { - return false; - } - dev->block(BST_DOING_ACQUIRE); + if 
(device_is_unmounted(dev)) { + Mmsg(jcr->errmsg, _("Device %s is BLOCKED due to user unmount.\n"), dev->print_name()); + goto bail_out; } Dmsg1(190, "reserve_append device is %s\n", dev->is_tape()?"tape":"disk"); - for ( ;; ) { - switch (can_reserve_drive(dcr)) { - case 0: - Mmsg1(jcr->errmsg, _("Device %s is busy writing on another Volume.\n"), dev->print_name()); - dev->unblock(); - if (!wait_for_device(dcr, jcr->errmsg, first)) { - return false; - } - dev->block(BST_DOING_ACQUIRE); - continue; - case -1: - goto bail_out; /* error */ - default: - break; /* OK, reserve drive */ - } - break; + if (can_reserve_drive(dcr) != 1) { + Mmsg1(jcr->errmsg, _("Device %s is busy writing on another Volume.\n"), dev->print_name()); + goto bail_out; } - dev->reserved_device++; dcr->reserved_device = true; ok = true; diff --git a/bacula/src/stored/dev.c b/bacula/src/stored/dev.c index b758b2373c..a43e07493c 100644 --- a/bacula/src/stored/dev.c +++ b/bacula/src/stored/dev.c @@ -1714,6 +1714,18 @@ void init_device_wait_timers(DCR *dcr) } +void init_jcr_device_wait_timers(JCR *jcr) +{ + /* ******FIXME******* put these on config variables */ + jcr->min_wait = 60 * 60; + jcr->max_wait = 24 * 60 * 60; + jcr->max_num_wait = 9; /* 5 waits =~ 1 day, then 1 day at a time */ + jcr->wait_sec = jcr->min_wait; + jcr->rem_wait_sec = jcr->wait_sec; + jcr->num_wait = 0; +} + + /* * The dev timers are used for waiting on a particular device * diff --git a/bacula/src/stored/dircmd.c b/bacula/src/stored/dircmd.c index acf10f6151..1e4b3b4c1d 100644 --- a/bacula/src/stored/dircmd.c +++ b/bacula/src/stored/dircmd.c @@ -258,7 +258,7 @@ static bool cancel_cmd(JCR *cjcr) if (sscanf(dir->msg, "cancel Job=%127s", Job) == 1) { if (!(jcr=get_jcr_by_full_name(Job))) { - bnet_fsend(dir, _("3992 Job %s not found.\n"), Job); + bnet_fsend(dir, _("3902 Job %s not found.\n"), Job); } else { P(jcr->mutex); oldStatus = jcr->JobStatus; @@ -279,7 +279,7 @@ static bool cancel_cmd(JCR *cjcr) free_jcr(jcr); } } else { - 
bnet_fsend(dir, _("3993 Error scanning cancel command.\n")); + bnet_fsend(dir, _("3903 Error scanning cancel command.\n")); } bnet_sig(dir, BNET_EOD); return 1; diff --git a/bacula/src/stored/job.c b/bacula/src/stored/job.c index c4510f1850..1b01cdc0f5 100644 --- a/bacula/src/stored/job.c +++ b/bacula/src/stored/job.c @@ -10,19 +10,14 @@ Copyright (C) 2000-2005 Kern Sibbald This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. + modify it under the terms of the GNU General Public License + version 2 as amended with additional clauses defined in the + file LICENSE in the main source directory. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - MA 02111-1307, USA. + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + file LICENSE for additional details. 
*/ @@ -52,7 +47,7 @@ static char use_device[] = "use device=%127s\n"; static char OKjob[] = "3000 OK Job SDid=%u SDtime=%u Authorization=%s\n"; static char OK_device[] = "3000 OK use device device=%s\n"; static char NO_device[] = "3924 Device \"%s\" not in SD Device resources.\n"; -static char NOT_open[] = "3925 Device \"%s\" could not be opened or does not exist.\n"; +//static char NOT_open[] = "3925 Device \"%s\" could not be opened or does not exist.\n"; static char BAD_use[] = "3913 Bad use command: %s\n"; static char BAD_job[] = "3915 Bad Job command: %s\n"; //static char OK_query[] = "3001 OK query\n"; @@ -344,15 +339,33 @@ static bool use_storage_cmd(JCR *jcr) * Wiffle through them and find one that can do the backup. */ if (ok) { - store = (DIRSTORE *)dirstore->first(); - foreach_alist(device_name, store->device) { - if (search_res_for_device(jcr, store, device_name, append) == 1) { - dcr = jcr->dcr; - dcr->Copy = Copy; - dcr->Stripe = Stripe; - ok = true; - goto done; + bool first = true; + init_jcr_device_wait_timers(jcr); + for ( ;; ) { + int need_wait = false; + foreach_alist(store, dirstore) { + foreach_alist(device_name, store->device) { + int stat; + stat = search_res_for_device(jcr, store, device_name, append); + if (stat == 1) { /* found available device */ + dcr = jcr->dcr; + dcr->Copy = Copy; + dcr->Stripe = Stripe; + ok = true; + goto done; + } else if (stat == 0) { /* device busy */ + need_wait = true; + } + } + } + /* + * If there is some device for which we can wait, then + * wait and try again until the wait time expires + */ + if (!need_wait || !wait_for_device(jcr, jcr->errmsg, first)) { + break; } + first = false; } if (verbose) { unbash_spaces(dir->msg); @@ -416,9 +429,7 @@ static int search_res_for_device(JCR *jcr, DIRSTORE *store, char *device_name, i Jmsg(jcr, M_WARNING, 0, _("\n" " Device \"%s\" requested by DIR could not be opened or does not exist.\n"), device_name); - bnet_fsend(dir, NOT_open, device_name); - Dmsg1(100, ">dird: 
%s\n", dir->msg); - return -1; + return 0; } Dmsg1(100, "Found device %s\n", device->hdr.name); dcr = new_dcr(jcr, device->dev); @@ -438,8 +449,6 @@ static int search_res_for_device(JCR *jcr, DIRSTORE *store, char *device_name, i ok = reserve_device_for_read(dcr); } if (!ok) { - bnet_fsend(dir, _("3927 Could not reserve device: %s\n"), device_name); - Dmsg1(100, ">dird: %s\n", dir->msg); free_dcr(jcr->dcr); return 0; } @@ -447,7 +456,7 @@ static int search_res_for_device(JCR *jcr, DIRSTORE *store, char *device_name, i bash_spaces(device_name); ok = bnet_fsend(dir, OK_device, device_name); Dmsg1(100, ">dird: %s\n", dir->msg); - return ok; + return ok ? 1 : -1; } } foreach_res(changer, R_AUTOCHANGER) { @@ -499,11 +508,11 @@ static int search_res_for_device(JCR *jcr, DIRSTORE *store, char *device_name, i bash_spaces(dev_name); ok = bnet_fsend(dir, OK_device, dev_name.c_str()); /* Return real device name */ Dmsg1(100, ">dird: %s\n", dir->msg); - return ok; + return ok ? 1 : -1; } } } - return 0; + return 0; /* nothing found */ } diff --git a/bacula/src/stored/protos.h b/bacula/src/stored/protos.h index 93171916f5..e4afc7cc0e 100644 --- a/bacula/src/stored/protos.h +++ b/bacula/src/stored/protos.h @@ -116,6 +116,7 @@ JCR *next_attached_jcr(DEVICE *dev, JCR *jcr); bool offline_or_rewind_dev(DEVICE *dev); bool reposition_dev(DEVICE *dev, uint32_t file, uint32_t block); void init_device_wait_timers(DCR *dcr); +void init_jcr_device_wait_timers(JCR *jcr); bool double_dev_wait_time(DEVICE *dev); /* Get info about device */ @@ -227,4 +228,4 @@ void list_spool_stats (BSOCK *bs); /* From wait.c */ int wait_for_sysop(DCR *dcr); -bool wait_for_device(DCR *dcr, const char *msg, bool first); +bool wait_for_device(JCR *jcr, const char *msg, bool first); diff --git a/bacula/src/stored/wait.c b/bacula/src/stored/wait.c index d863656ad3..aefac5cac8 100644 --- a/bacula/src/stored/wait.c +++ b/bacula/src/stored/wait.c @@ -12,19 +12,14 @@ Copyright (C) 2000-2005 Kern Sibbald This program 
is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. + modify it under the terms of the GNU General Public License + version 2 as amended with additional clauses defined in the + file LICENSE in the main source directory. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, - MA 02111-1307, USA. + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + file LICENSE for additional details. */ @@ -54,14 +49,14 @@ int wait_for_sysop(DCR *dcr) P(dev->mutex); unmounted = (dev->dev_blocked == BST_UNMOUNTED) || - (dev->dev_blocked == BST_UNMOUNTED_WAITING_FOR_SYSOP); + (dev->dev_blocked == BST_UNMOUNTED_WAITING_FOR_SYSOP); dev->poll = false; /* - * Wait requested time (dev->rem_wait_sec). However, we also wake up every - * HB_TIME seconds and send a heartbeat to the FD and the Director - * to keep stateful firewalls from closing them down while waiting - * for the operator. + * Wait requested time (dev->rem_wait_sec). However, we also wake up every + * HB_TIME seconds and send a heartbeat to the FD and the Director + * to keep stateful firewalls from closing them down while waiting + * for the operator. */ add_wait = dev->rem_wait_sec; if (me->heartbeat_interval && add_wait > me->heartbeat_interval) { @@ -87,7 +82,7 @@ int wait_for_sysop(DCR *dcr) timeout.tv_sec = tv.tv_sec + add_wait; Dmsg3(400, "I'm going to sleep on device %s. 
HB=%d wait=%d\n", dev->print_name(), - (int)me->heartbeat_interval, dev->wait_sec); + (int)me->heartbeat_interval, dev->wait_sec); start = time(NULL); /* Wait required time */ stat = pthread_cond_timedwait(&dev->wait_next_vol, &dev->mutex, &timeout); @@ -98,53 +93,53 @@ int wait_for_sysop(DCR *dcr) /* Note, this always triggers the first time. We want that. */ if (me->heartbeat_interval) { - if (now - last_heartbeat >= me->heartbeat_interval) { - /* send heartbeats */ - if (jcr->file_bsock) { - bnet_sig(jcr->file_bsock, BNET_HEARTBEAT); + if (now - last_heartbeat >= me->heartbeat_interval) { + /* send heartbeats */ + if (jcr->file_bsock) { + bnet_sig(jcr->file_bsock, BNET_HEARTBEAT); Dmsg0(400, "Send heartbeat to FD.\n"); - } - if (jcr->dir_bsock) { - bnet_sig(jcr->dir_bsock, BNET_HEARTBEAT); - } - last_heartbeat = now; - } + } + if (jcr->dir_bsock) { + bnet_sig(jcr->dir_bsock, BNET_HEARTBEAT); + } + last_heartbeat = now; + } } /* * Check if user unmounted the device while we were waiting */ unmounted = (dev->dev_blocked == BST_UNMOUNTED) || - (dev->dev_blocked == BST_UNMOUNTED_WAITING_FOR_SYSOP); + (dev->dev_blocked == BST_UNMOUNTED_WAITING_FOR_SYSOP); - if (stat != ETIMEDOUT) { /* we blocked the device */ - break; /* on error return */ + if (stat != ETIMEDOUT) { /* we blocked the device */ + break; /* on error return */ } if (dev->rem_wait_sec <= 0) { /* on exceeding wait time return */ Dmsg0(400, "Exceed wait time.\n"); - break; + break; } if (!unmounted && dev->vol_poll_interval && - (now - first_start >= dev->vol_poll_interval)) { + (now - first_start >= dev->vol_poll_interval)) { Dmsg1(400, "In wait blocked=%s\n", edit_blocked_reason(dev)); - dev->poll = true; /* returning a poll event */ - break; + dev->poll = true; /* returning a poll event */ + break; } /* * Check if user mounted the device while we were waiting */ if (dev->dev_blocked == BST_MOUNT) { /* mount request ? 
*/ - stat = 0; - break; + stat = 0; + break; } add_wait = dev->wait_sec - (now - start); if (add_wait < 0) { - add_wait = 0; + add_wait = 0; } if (me->heartbeat_interval && add_wait > me->heartbeat_interval) { - add_wait = me->heartbeat_interval; + add_wait = me->heartbeat_interval; } } @@ -157,10 +152,13 @@ int wait_for_sysop(DCR *dcr) /* - * Wait for Device to be released - * + * Wait for any device to be released, then we return, so + * higher level code can rescan possible devices. + * + * Returns: true if a device has changed state + * false if the total wait time has expired. */ -bool wait_for_device(DCR *dcr, const char *msg, bool first) +bool wait_for_device(JCR *jcr, const char *msg, bool first) { struct timeval tv; struct timezone tz; @@ -168,8 +166,6 @@ bool wait_for_device(DCR *dcr, const char *msg, bool first) // time_t last_heartbeat = 0; int stat = 0; int add_wait; - DEVICE *dev = dcr->dev; - JCR *jcr = dcr->jcr; bool ok = false; Dmsg0(100, "Enter wait_for_device\n"); @@ -180,10 +176,10 @@ bool wait_for_device(DCR *dcr, const char *msg, bool first) } /* - * Wait requested time (dev->rem_wait_sec). However, we also wake up every - * HB_TIME seconds and send a heartbeat to the FD and the Director - * to keep stateful firewalls from closing them down while waiting - * for the operator. + * Wait requested time (dev->rem_wait_sec). However, we also wake up every + * HB_TIME seconds and send a heartbeat to the FD and the Director + * to keep stateful firewalls from closing them down while waiting + * for the operator. */ add_wait = jcr->rem_wait_sec; if (me->heartbeat_interval && add_wait > me->heartbeat_interval) { @@ -197,8 +193,8 @@ bool wait_for_device(DCR *dcr, const char *msg, bool first) timeout.tv_nsec = tv.tv_usec * 1000; timeout.tv_sec = tv.tv_sec + add_wait; - Dmsg4(100, "I'm going to sleep on device %s. 
HB=%d wait=%d remwait=%d\n", dev->print_name(), - (int)me->heartbeat_interval, jcr->wait_sec, jcr->rem_wait_sec); + Dmsg3(100, "I'm going to wait for a device. HB=%d wait=%d remwait=%d\n", + (int)me->heartbeat_interval, jcr->wait_sec, jcr->rem_wait_sec); start = time(NULL); /* Wait required time */ stat = pthread_cond_timedwait(&wait_device_release, &device_release_mutex, &timeout); @@ -210,38 +206,38 @@ bool wait_for_device(DCR *dcr, const char *msg, bool first) #ifdef needed /* Note, this always triggers the first time. We want that. */ if (me->heartbeat_interval) { - if (now - last_heartbeat >= me->heartbeat_interval) { - /* send heartbeats */ - if (jcr->file_bsock) { - bnet_sig(jcr->file_bsock, BNET_HEARTBEAT); + if (now - last_heartbeat >= me->heartbeat_interval) { + /* send heartbeats */ + if (jcr->file_bsock) { + bnet_sig(jcr->file_bsock, BNET_HEARTBEAT); Dmsg0(400, "Send heartbeat to FD.\n"); - } - if (jcr->dir_bsock) { - bnet_sig(jcr->dir_bsock, BNET_HEARTBEAT); - } - last_heartbeat = now; - } + } + if (jcr->dir_bsock) { + bnet_sig(jcr->dir_bsock, BNET_HEARTBEAT); + } + last_heartbeat = now; + } } #endif - if (stat != ETIMEDOUT) { /* if someone woke us up */ - ok = true; - break; /* allow caller to examine device */ + if (stat != ETIMEDOUT) { /* if someone woke us up */ + ok = true; + break; /* allow caller to examine device */ } if (jcr->rem_wait_sec <= 0) { /* on exceeding wait time return */ Dmsg0(400, "Exceed wait time.\n"); - if (!double_jcr_wait_time(jcr)) { - break; /* give up */ - } - Jmsg(jcr, M_MOUNT, 0, msg); + if (!double_jcr_wait_time(jcr)) { + break; /* give up */ + } + Jmsg(jcr, M_MOUNT, 0, msg); } add_wait = jcr->wait_sec - (now - start); if (add_wait < 0) { - add_wait = 0; + add_wait = 0; } if (me->heartbeat_interval && add_wait > me->heartbeat_interval) { - add_wait = me->heartbeat_interval; + add_wait = me->heartbeat_interval; } } @@ -254,11 +250,11 @@ bool wait_for_device(DCR *dcr, const char *msg, bool first) * The jcr timers are used 
for waiting on any device * * Returns: true if time doubled - * false if max time expired + * false if max time expired */ static bool double_jcr_wait_time(JCR *jcr) { - jcr->wait_sec *= 2; /* double wait time */ + jcr->wait_sec *= 2; /* double wait time */ if (jcr->wait_sec > jcr->max_wait) { /* but not longer than maxtime */ jcr->wait_sec = jcr->max_wait; } diff --git a/bacula/src/version.h b/bacula/src/version.h index 3234be1da4..8bf04ac47c 100644 --- a/bacula/src/version.h +++ b/bacula/src/version.h @@ -1,8 +1,8 @@ /* */ #undef VERSION -#define VERSION "1.37.18" -#define BDATE "16 May 2005" -#define LSMDATE "16May05" +#define VERSION "1.37.19" +#define BDATE "18 May 2005" +#define LSMDATE "18May05" /* Debug flags */ #undef DEBUG