--- /dev/null
+ This patch should fix a Segfault bug when a job is rescheduled.
+ The storage pointers were being released when they should not
+ have been.
+
+ Apply the patch with:
+
+ patch -p0 <1.36.2-reschedule.patch
+ make
+ ...
+
+Index: src/dird/dird.h
+===================================================================
+RCS file: /cvsroot/bacula/bacula/src/dird/dird.h,v
+retrieving revision 1.7
+diff -u -r1.7 dird.h
+--- src/dird/dird.h 19 Apr 2004 14:27:00 -0000 1.7
++++ src/dird/dird.h 18 Mar 2005 17:39:38 -0000
+@@ -45,3 +45,4 @@
+
+ /* From job.c */
+ void dird_free_jcr(JCR *jcr);
++void dird_free_jcr_pointers(JCR *jcr);
+Index: src/dird/job.c
+===================================================================
+RCS file: /cvsroot/bacula/bacula/src/dird/job.c,v
+retrieving revision 1.92.2.2
+diff -u -r1.92.2.2 job.c
+--- src/dird/job.c 27 Feb 2005 21:53:28 -0000 1.92.2.2
++++ src/dird/job.c 18 Mar 2005 17:39:38 -0000
+@@ -676,15 +676,9 @@
+ }
+ }
+
+-/*
+- * Free the Job Control Record if no one is still using it.
+- * Called from main free_jcr() routine in src/lib/jcr.c so
+- * that we can do our Director specific cleanup of the jcr.
+- */
+-void dird_free_jcr(JCR *jcr)
++/* Called directly from job rescheduling */
++void dird_free_jcr_pointers(JCR *jcr)
+ {
+- Dmsg0(200, "Start dird free_jcr\n");
+-
+ if (jcr->sd_auth_key) {
+ free(jcr->sd_auth_key);
+ jcr->sd_auth_key = NULL;
+@@ -723,7 +717,21 @@
+ }
+ if (jcr->term_wait_inited) {
+ pthread_cond_destroy(&jcr->term_wait);
++ jcr->term_wait_inited = false;
+ }
++}
++
++/*
++ * Free the Job Control Record if no one is still using it.
++ * Called from main free_jcr() routine in src/lib/jcr.c so
++ * that we can do our Director specific cleanup of the jcr.
++ */
++void dird_free_jcr(JCR *jcr)
++{
++ Dmsg0(200, "Start dird free_jcr\n");
++
++ dird_free_jcr_pointers(jcr);
++
+ /* Delete lists setup to hold storage pointers */
+ for (int i=0; i<MAX_STORE; i++) {
+ if (jcr->storage[i]) {
+Index: src/dird/jobq.c
+===================================================================
+RCS file: /cvsroot/bacula/bacula/src/dird/jobq.c,v
+retrieving revision 1.25.4.2
+diff -u -r1.25.4.2 jobq.c
+--- src/dird/jobq.c 15 Feb 2005 11:51:03 -0000 1.25.4.2
++++ src/dird/jobq.c 18 Mar 2005 17:39:38 -0000
+@@ -481,7 +481,7 @@
+ bstrftime(dt, sizeof(dt), time(NULL));
+ Jmsg(jcr, M_INFO, 0, _("Rescheduled Job %s at %s to re-run in %d seconds.\n"),
+ jcr->Job, dt, (int)jcr->job->RescheduleInterval);
+- dird_free_jcr(jcr); /* partial cleanup old stuff */
++ dird_free_jcr_pointers(jcr); /* partial cleanup old stuff */
+ jcr->JobStatus = JS_WaitStartTime;
+ jcr->SDJobStatus = 0;
+ if (jcr->JobBytes == 0) {
Bacula truncating tapes after a restore.
Note that all source files will be rebuilt during the make.
-
18Mar05 1.36.2-store.patch
This patch fails a job if no Storage resource is specified and
the job attempts to call the SD.
+
+18Mar05 1.36.2-reschedule.patch
+ This patch should fix a Segfault bug when a job is rescheduled.
+ The storage pointers were being released when they should not
+ have been.
init_job_server(director->MaxConcurrentJobs);
- init_device_resources();
+// init_device_resources();
Dmsg0(200, "wait for next job\n");
/* Main loop -- call scheduler to get next job to run */
berrno be;
Emsg1(M_ABORT, 0, _("Could not init job queue: ERR=%s\n"), be.strerror(stat));
}
- if ((wd = new_watchdog()) == NULL) {
- Emsg0(M_ABORT, 0, _("Could not init job monitor watchdogs\n"));
- }
+ wd = new_watchdog();
wd->callback = job_monitor_watchdog;
wd->destructor = job_monitor_destructor;
wd->one_shot = false;
uint32_t read_EndFile;
uint32_t read_StartBlock;
uint32_t read_EndBlock;
+ /* Device wait times */
+ int min_wait;
+ int max_wait;
+ int max_num_wait;
+ int wait_sec;
+ int rem_wait_sec;
+ int num_wait;
#endif /* STORAGE_DAEMON */
*
*/
/*
- Copyright (C) 2000-2004 Kern Sibbald and John Walker
+ Copyright (C) 2000-2005 Kern Sibbald
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
/*
- * Bacula thread watchdog routine. General routine that monitors
- * the daemon and signals a thread if it is blocked on a BSOCK
- * too long. This prevents catastropic long waits -- generally
- * due to Windows "hanging" the app.
+ * Bacula thread watchdog routine. General routine that
+ * allows setting a watchdog timer with a callback that is
+ * called when the timer goes off.
*
* Kern Sibbald, January MMII
*
*/
/*
- Copyright (C) 2000-2004 Kern Sibbald and John Walker
+ Copyright (C) 2000-2005 Kern Sibbald
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
return ok;
}
+/*
+ * This is the thread that walks the watchdog queue
+ * and when a queue item fires, the callback is
+ * invoked. If it is a one shot, the queue item
+ * is moved to the inactive queue.
+ */
extern "C" void *watchdog_thread(void *arg)
{
struct timespec timeout;
label.c match_bsr.c mount.c parse_bsr.c \
python.c \
read.c read_record.c record.c \
- spool.c status.c stored_conf.c
+ spool.c status.c stored_conf.c wait.c
SVROBJS = stored.o ansi_label.o \
autochanger.o acquire.o append.o \
askdir.o authenticate.o \
label.o match_bsr.o mount.o parse_bsr.o \
python.o \
read.o read_record.o record.o \
- spool.o status.o stored_conf.o
+ spool.o status.o stored_conf.o wait.o
# btape
TAPESRCS = btape.c block.c butil.c dev.c device.c label.c \
static char OK_create[] = "1000 OK CreateJobMedia\n";
-/* Forward referenced functions */
-static int wait_for_sysop(DCR *dcr);
-
/* Send update information about a device to Director */
bool dir_update_device(JCR *jcr, DEVICE *dev)
{
if (job_canceled(jcr)) {
Mmsg(dev->errmsg,
_("Job %s canceled while waiting for mount on Storage Device \"%s\".\n"),
- jcr->Job, dcr->dev_name);
+ jcr->Job, dev->print_name());
Jmsg(jcr, M_INFO, 0, "%s", dev->errmsg);
return false;
}
jstat = JS_WaitMount;
if (!dev->poll) {
Jmsg(jcr, M_MOUNT, 0, _(
-"Please mount Volume \"%s\" on Storage Device \"%s\" for Job %s\n"
+"Please mount Volume \"%s\" on Storage Device %s for Job %s\n"
"Use \"mount\" command to release Job.\n"),
- dcr->VolumeName, dcr->dev_name, jcr->Job);
+ dcr->VolumeName, dev->print_name(), jcr->Job);
Dmsg3(400, "Mount %s on %s for Job %s\n",
dcr->VolumeName, dcr->dev_name, jcr->Job);
}
" Media type: %s\n"
" Pool: %s\n"),
jcr->Job,
- dcr->dev_name,
+ dev->print_name(),
dcr->media_type,
dcr->pool_name);
}
ASSERT(dev->dev_blocked);
for ( ;; ) {
if (job_canceled(jcr)) {
- Mmsg(dev->errmsg, _("Job %s canceled while waiting for mount on Storage Device \"%s\".\n"),
- jcr->Job, dcr->dev_name);
+ Mmsg(dev->errmsg, _("Job %s canceled while waiting for mount on Storage Device %s.\n"),
+ jcr->Job, dev->print_name());
return false;
}
if (!dev->poll) {
msg = _("Please mount");
- Jmsg(jcr, M_MOUNT, 0, _("%s Volume \"%s\" on Storage Device \"%s\" for Job %s\n"),
- msg, dcr->VolumeName, dcr->dev_name, jcr->Job);
+ Jmsg(jcr, M_MOUNT, 0, _("%s Volume \"%s\" on Storage Device %s for Job %s\n"),
+ msg, dcr->VolumeName, dev->print_name(), jcr->Job);
Dmsg3(400, "Mount \"%s\" on device \"%s\" for Job %s\n",
dcr->VolumeName, dcr->dev_name, jcr->Job);
}
Dmsg0(400, "leave dir_ask_sysop_to_mount_volume\n");
return true;
}
-
-/*
- * Wait for SysOp to mount a tape on a specific device
- */
-static int wait_for_sysop(DCR *dcr)
-{
- struct timeval tv;
- struct timezone tz;
- struct timespec timeout;
- time_t last_heartbeat = 0;
- time_t first_start = time(NULL);
- int stat = 0;
- int add_wait;
- bool unmounted;
- DEVICE *dev = dcr->dev;
- JCR *jcr = dcr->jcr;
-
- P(dev->mutex);
- unmounted = (dev->dev_blocked == BST_UNMOUNTED) ||
- (dev->dev_blocked == BST_UNMOUNTED_WAITING_FOR_SYSOP);
-
- dev->poll = false;
- /*
- * Wait requested time (dev->rem_wait_sec). However, we also wake up every
- * HB_TIME seconds and send a heartbeat to the FD and the Director
- * to keep stateful firewalls from closing them down while waiting
- * for the operator.
- */
- add_wait = dev->rem_wait_sec;
- if (me->heartbeat_interval && add_wait > me->heartbeat_interval) {
- add_wait = me->heartbeat_interval;
- }
- /* If the user did not unmount the tape and we are polling, ensure
- * that we poll at the correct interval.
- */
- if (!unmounted && dev->vol_poll_interval && add_wait > dev->vol_poll_interval) {
- add_wait = dev->vol_poll_interval;
- }
- gettimeofday(&tv, &tz);
- timeout.tv_nsec = tv.tv_usec * 1000;
- timeout.tv_sec = tv.tv_sec + add_wait;
-
- if (!unmounted) {
- dev->dev_prev_blocked = dev->dev_blocked;
- dev->dev_blocked = BST_WAITING_FOR_SYSOP; /* indicate waiting for mount */
- }
-
- for ( ; !job_canceled(jcr); ) {
- time_t now, start;
-
- Dmsg3(400, "I'm going to sleep on device %s. HB=%d wait=%d\n", dev->print_name(),
- (int)me->heartbeat_interval, dev->wait_sec);
- start = time(NULL);
- /* Wait required time */
- stat = pthread_cond_timedwait(&dev->wait_next_vol, &dev->mutex, &timeout);
- Dmsg1(400, "Wokeup from sleep on device stat=%d\n", stat);
-
- now = time(NULL);
- dev->rem_wait_sec -= (now - start);
-
- /* Note, this always triggers the first time. We want that. */
- if (me->heartbeat_interval) {
- if (now - last_heartbeat >= me->heartbeat_interval) {
- /* send heartbeats */
- if (jcr->file_bsock) {
- bnet_sig(jcr->file_bsock, BNET_HEARTBEAT);
- Dmsg0(400, "Send heartbeat to FD.\n");
- }
- if (jcr->dir_bsock) {
- bnet_sig(jcr->dir_bsock, BNET_HEARTBEAT);
- }
- last_heartbeat = now;
- }
- }
-
- /*
- * Check if user unmounted the device while we were waiting
- */
- unmounted = (dev->dev_blocked == BST_UNMOUNTED) ||
- (dev->dev_blocked == BST_UNMOUNTED_WAITING_FOR_SYSOP);
-
- if (stat != ETIMEDOUT) { /* we blocked the device */
- break; /* on error return */
- }
- if (dev->rem_wait_sec <= 0) { /* on exceeding wait time return */
- Dmsg0(400, "Exceed wait time.\n");
- break;
- }
-
- if (!unmounted && dev->vol_poll_interval &&
- (now - first_start >= dev->vol_poll_interval)) {
- Dmsg1(400, "In wait blocked=%s\n", edit_blocked_reason(dev));
- dev->poll = true; /* returning a poll event */
- break;
- }
- /*
- * Check if user mounted the device while we were waiting
- */
- if (dev->dev_blocked == BST_MOUNT) { /* mount request ? */
- stat = 0;
- break;
- }
-
- add_wait = dev->wait_sec - (now - start);
- if (add_wait < 0) {
- add_wait = 0;
- }
- if (me->heartbeat_interval && add_wait > me->heartbeat_interval) {
- add_wait = me->heartbeat_interval;
- }
- gettimeofday(&tv, &tz);
- timeout.tv_nsec = tv.tv_usec * 1000;
- timeout.tv_sec = tv.tv_sec + add_wait; /* additional wait */
- Dmsg1(400, "Additional wait %d sec.\n", add_wait);
- }
-
- if (!unmounted) {
- dev->dev_blocked = dev->dev_prev_blocked; /* restore entry state */
- }
- V(dev->mutex);
- return stat;
-}
(dev_blocked == BST_UNMOUNTED ||
dev_blocked == BST_WAITING_FOR_SYSOP ||
dev_blocked == BST_UNMOUNTED_WAITING_FOR_SYSOP); };
-
+ bool waiting_for_mount() const { return
+ (dev_blocked == BST_UNMOUNTED ||
+ dev_blocked == BST_WAITING_FOR_SYSOP ||
+ dev_blocked == BST_UNMOUNTED_WAITING_FOR_SYSOP); };
bool weof() { return !weof_dev(this, 1); };
bool rewind() { return rewind_dev(this); };
const char *strerror() const;
void set_eof(); /* in dev.c */
void set_eot(); /* in dev.c */
void set_append() { state |= ST_APPEND; };
+ void set_label() { state |= ST_LABEL; };
void set_read() { state |= ST_READ; };
void set_offline() { state |= ST_OFFLINE; };
- void clear_append();
- void clear_read();
- void clear_label();
- void clear_offline();
+ void clear_append() { state &= ~ST_APPEND; };
+ void clear_read() { state &= ~ST_READ; };
+ void clear_label() { state &= ~ST_LABEL; };
+ void clear_offline() { state &= ~ST_OFFLINE; };
void clear_eot() { state &= ~ST_EOT; };
};
inline int DEVICE::at_eot() const { return state & ST_EOT; }
inline int DEVICE::can_append() const { return state & ST_APPEND; }
inline int DEVICE::can_read() const { return state & ST_READ; }
-inline void DEVICE::clear_append() { state &= ~ST_APPEND; }
-inline void DEVICE::clear_read() { state &= ~ST_READ; }
-inline void DEVICE::clear_label() { state &= ~ST_LABEL; }
-inline void DEVICE::clear_offline() { state &= ~ST_OFFLINE; }
inline const char *DEVICE::strerror() const { return errmsg; }
inline const char *DEVICE::archive_name() const { return dev_name; }
inline const char *DEVICE::print_name() const { return prt_name; }
return NULL;
}
+ /*
+ * This is a connection from the Director, so set up a JCR
+ */
Dmsg0(110, "Start Dir Job\n");
jcr = new_jcr(sizeof(JCR), stored_free_jcr); /* create Job Control Record */
jcr->dir_bsock = bs; /* save Director bsock */
bnet_sig(jcr->file_bsock, BNET_TERMINATE);
}
/* If thread waiting on mount, wake him */
- if (jcr->dcr && jcr->dcr->dev &&
- (jcr->dcr->dev->dev_blocked == BST_WAITING_FOR_SYSOP ||
- jcr->dcr->dev->dev_blocked == BST_UNMOUNTED ||
- jcr->dcr->dev->dev_blocked == BST_UNMOUNTED_WAITING_FOR_SYSOP)) {
+ if (jcr->dcr && jcr->dcr->dev && jcr->dcr->dev->waiting_for_mount()) {
pthread_cond_signal(&jcr->dcr->dev->wait_next_vol);
}
bnet_fsend(dir, _("3000 Job %s marked to be canceled.\n"), jcr->Job);
if (!ok) {
if (forge_on || jcr->ignore_label_errors) {
- dev->state |= ST_LABEL; /* set has Bacula label */
+ dev->set_label(); /* set has Bacula label */
Jmsg(jcr, M_ERROR, 0, "%s", jcr->errmsg);
return VOL_OK;
}
return VOL_LABEL_ERROR;
}
- dev->state |= ST_LABEL; /* set has Bacula label */
+ dev->set_label(); /* set has Bacula label */
/* Compare Volume Names */
Dmsg2(30, "Compare Vol names: VolName=%s hdr=%s\n", VolName?VolName:"*", dev->VolHdr.VolName);
DEVICE *dev = dcr->dev;
JCR *jcr = dcr->jcr;
Dmsg3(100, "Enter read_dev_volume_label_guess device=%s vol=%s dev_Vol=%s\n",
- dev->archive_name(), dcr->VolumeName, dev->VolHdr.VolName);
+ dev->print_name(), dcr->VolumeName, dev->VolHdr.VolName);
if (!dev->is_dvd()) {
Dmsg0(100, "Leave read_dev_volume_label_guess !CAP_REQMOUNT\n");
if (write && dev->free_space_errno < 0) {
Dmsg0(100, "Leave read_dev_volume_label_guess !free_space VOL_NO_MEDIA\n");
- Mmsg2(jcr->errmsg, _("free_space error on %s. The current medium is probably not writable. ERR=%s.\n"),
- dev->dev_name, dev->errmsg);
+ Mmsg2(jcr->errmsg, _("free_space error on %s. The current medium is probably not writable: ERR=%s.\n"),
+ dev->print_name(), dev->errmsg);
return VOL_NO_MEDIA;
}
/* If we can't guess the name, and we are writing, just reopen the right file with open_first_part. */
if (open_first_part(dev) < 0) {
berrno be;
- Mmsg2(jcr->errmsg, _("open_first_part error on %s. ERR=%s.\n"),
- dev->dev_name, be.strerror());
+ Mmsg2(jcr->errmsg, _("open_first_part error on %s: ERR=%s.\n"),
+ dev->print_name(), be.strerror());
Dmsg0(100, "Leave read_dev_volume_label_guess VOL_IO_ERROR (!open_guess_name_dev && !open_first_part)\n");
return VOL_IO_ERROR;
}
} else {
if (write && dcr->dev->free_space_errno < 0) {
Dmsg0(100, "Leave read_dev_volume_label_guess !free_space VOL_NO_MEDIA\n");
- Mmsg2(jcr->errmsg, _("free_space error on %s. The current medium is probably not writable. ERR=%s.\n"),
- dev->dev_name, dev->errmsg);
+ Mmsg2(jcr->errmsg, _("free_space error on %s. The current medium is probably not writable: ERR=%s.\n"),
+ dev->print_name(), dev->errmsg);
return VOL_NO_MEDIA;
}
if (open_first_part(dcr->dev) < 0) {
berrno be;
- Mmsg2(jcr->errmsg, _("open_first_part error on %s. ERR=%s.\n"),
- dev->dev_name, be.strerror());
+ Mmsg2(jcr->errmsg, _("open_first_part error on %s: ERR=%s.\n"),
+ dev->print_name(), be.strerror());
Dmsg0(100, "Leave read_dev_volume_label_guess VOL_IO_ERROR (open_guess_name_dev && !open_first_part)\n");
return VOL_IO_ERROR;
}
*/
if (vol_label_status != VOL_NAME_ERROR) {
Dmsg0(100, "Leave read_dev_volume_label_guess (open_guess_name_dev && !VOL_NAME_ERROR)\n");
- dev->state &= ~ST_LABEL;
+ dev->clear_label();
return read_dev_volume_label(dcr);
} else {
Dmsg0(100, "Leave read_dev_volume_label_guess (open_guess_name_dev && VOL_NAME_ERROR)\n");
Dmsg1(100, "Label type=%d\n", dev->label_type);
if (!rewind_dev(dev)) {
memset(&dev->VolHdr, 0, sizeof(dev->VolHdr));
- Dmsg2(30, "Bad status on %s from rewind. ERR=%s\n", dev->archive_name(), strerror_dev(dev));
+ Dmsg2(30, "Bad status on %s from rewind: ERR=%s\n", dev->print_name(), strerror_dev(dev));
if (!forge_on) {
goto bail_out;
}
dcr->rec->Stream = 0;
/* Temporarily mark in append state to enable writing */
- dev->state |= ST_APPEND;
+ dev->set_append();
if (!write_record_to_block(dcr->block, dcr->rec)) {
- Dmsg2(30, "Bad Label write on %s. ERR=%s\n", dev->archive_name(), strerror_dev(dev));
+ Dmsg2(30, "Bad Label write on %s: ERR=%s\n", dev->print_name(), strerror_dev(dev));
goto bail_out;
} else {
- Dmsg2(30, "Wrote label of %d bytes to %s\n", dcr->rec->data_len, dev->archive_name());
+ Dmsg2(30, "Wrote label of %d bytes to %s\n", dcr->rec->data_len, dev->print_name());
}
Dmsg0(99, "Call write_block_to_dev()\n");
if (!write_block_to_dev(dcr)) {
- Dmsg2(30, "Bad Label write on %s. ERR=%s\n", dev->archive_name(), strerror_dev(dev));
+ Dmsg2(30, "Bad Label write on %s: ERR=%s\n", dev->print_name(), strerror_dev(dev));
goto bail_out;
}
Dmsg0(99, " Wrote block to device\n");
if (weof_dev(dev, 1) == 0) {
- dev->state |= ST_LABEL;
+ dev->set_label();
write_ansi_ibm_labels(dcr, ANSI_EOF_LABEL, dev->VolHdr.VolName);
}
if (debug_level >= 20) {
dump_volume_label(dev);
}
- dev->state &= ~ST_APPEND; /* remove append since this is PRE_LABEL */
+ dev->clear_append(); /* remove append since this is PRE_LABEL */
return true;
bail_out:
memset(&dev->VolHdr, 0, sizeof(dev->VolHdr));
- dev->state &= ~ST_APPEND; /* remove append since this is PRE_LABEL */
+ dev->clear_append(); /* remove append since this is PRE_LABEL */
return false;
}
Dmsg1(190, "set append found freshly labeled volume. dev=%x\n", dev);
dev->VolHdr.LabelType = VOL_LABEL; /* set Volume label */
- dev->state |= ST_APPEND;
+ dev->set_append();
if (!write_volume_label_to_block(dcr)) {
Dmsg0(200, "Error from write volume label.\n");
return false;
bstrncpy(dev->VolHdr.LabelProg, my_name, sizeof(dev->VolHdr.LabelProg));
sprintf(dev->VolHdr.ProgVersion, "Ver. %s %s", VERSION, BDATE);
sprintf(dev->VolHdr.ProgDate, "Build %s %s", __DATE__, __TIME__);
- dev->state |= ST_LABEL; /* set has Bacula label */
+ dev->set_label(); /* set has Bacula label */
if (debug_level >= 90) {
dump_volume_label(dev);
}
bool commit_attribute_spool (JCR *jcr);
bool write_block_to_spool_file (DCR *dcr);
void list_spool_stats (BSOCK *bs);
+
+/* From wait.c */
+int wait_for_sysop(DCR *dcr);
/* */
#undef VERSION
#define VERSION "1.37.8"
-#define BDATE "18 March 2005"
-#define LSMDATE "18Mar05"
+#define BDATE "22 March 2005"
+#define LSMDATE "22Mar05"
/* Debug flags */
#undef DEBUG