--- /dev/null
+
+ This patch should fix the race condition that leads to a Director
+ crash at job end time when the job list is updated. This was reported
+ in bug #1162.
+
+ Apply this patch to Bacula version 2.4.2 (and earlier) with:
+
+ cd <bacula-source>
+ patch -p0 <2.4.2-jobend-crash.patch
+ ./configure <your-options>
+ make
+ ...
+ make install
+
+
+Index: src/lib/jcr.c
+===================================================================
+--- src/lib/jcr.c (revision 7627)
++++ src/lib/jcr.c (working copy)
+@@ -110,6 +110,7 @@
+ void term_last_jobs_list()
+ {
+ if (last_jobs) {
++ lock_last_jobs_list();
+ while (!last_jobs->empty()) {
+ void *je = last_jobs->first();
+ last_jobs->remove(je);
+@@ -117,6 +118,7 @@
+ }
+ delete last_jobs;
+ last_jobs = NULL;
++ unlock_last_jobs_list();
+ }
+ if (jcrs) {
+ delete jcrs;
+@@ -128,6 +130,7 @@
+ {
+ struct s_last_job *je, job;
+ uint32_t num;
++ bool ok = true;
+
+ Dmsg1(100, "read_last_jobs seek to %d\n", (int)addr);
+ if (addr == 0 || lseek(fd, (off_t)addr, SEEK_SET) < 0) {
+@@ -140,11 +143,13 @@
+ if (num > 4 * max_last_jobs) { /* sanity check */
+ return false;
+ }
++ lock_last_jobs_list();
+ for ( ; num; num--) {
+ if (read(fd, &job, sizeof(job)) != sizeof(job)) {
+ berrno be;
+ Pmsg1(000, "Read job entry. ERR=%s\n", be.bstrerror());
+- return false;
++ ok = false;
++ break;
+ }
+ if (job.JobId > 0) {
+ je = (struct s_last_job *)malloc(sizeof(struct s_last_job));
+@@ -160,41 +165,48 @@
+ }
+ }
+ }
+- return true;
++ unlock_last_jobs_list();
++ return ok;
+ }
+
+ uint64_t write_last_jobs_list(int fd, uint64_t addr)
+ {
+ struct s_last_job *je;
+ uint32_t num;
++ ssize_t stat;
+
+ Dmsg1(100, "write_last_jobs seek to %d\n", (int)addr);
+ if (lseek(fd, (off_t)addr, SEEK_SET) < 0) {
+ return 0;
+ }
+ if (last_jobs) {
++ lock_last_jobs_list();
+ /* First record is number of entires */
+ num = last_jobs->size();
+ if (write(fd, &num, sizeof(num)) != sizeof(num)) {
+ berrno be;
+ Pmsg1(000, "Error writing num_items: ERR=%s\n", be.bstrerror());
+- return 0;
++ goto bail_out;
+ }
+ foreach_dlist(je, last_jobs) {
+ if (write(fd, je, sizeof(struct s_last_job)) != sizeof(struct s_last_job)) {
+ berrno be;
+ Pmsg1(000, "Error writing job: ERR=%s\n", be.bstrerror());
+- return 0;
++ goto bail_out;
+ }
+ }
++ unlock_last_jobs_list();
+ }
+ /* Return current address */
+- ssize_t stat = lseek(fd, 0, SEEK_CUR);
++ stat = lseek(fd, 0, SEEK_CUR);
+ if (stat < 0) {
+ stat = 0;
+ }
+ return stat;
+
++bail_out:
++ unlock_last_jobs_list();
++ return 0;
+ }
+
+ void lock_last_jobs_list()
+@@ -331,6 +343,7 @@
+ last_job.end_time = time(NULL);
+ /* Keep list of last jobs, but not Console where JobId==0 */
+ if (last_job.JobId > 0) {
++ lock_last_jobs_list();
+ je = (struct s_last_job *)malloc(sizeof(struct s_last_job));
+ memcpy((char *)je, (char *)&last_job, sizeof(last_job));
+ if (!last_jobs) {
+@@ -342,6 +355,7 @@
+ last_jobs->remove(je);
+ free(je);
+ }
++ unlock_last_jobs_list();
+ }
+ break;
+ default:
goto bail_out;
}
pool_bytes -= ctx.value;
- Dmsg1(dbglevel, "Total migrate Job bytes=%s\n", edit_int64(ctx.value, ed1));
+ Dmsg1(dbglevel, "Total migrate Job bytes=%s\n", edit_int64_with_commas(ctx.value, ed1));
Dmsg2(dbglevel, "lowbytes=%s poolafter=%s\n",
- edit_int64(jcr->rpool->MigrationLowBytes, ed1),
- edit_int64(pool_bytes, ed2));
+ edit_int64_with_commas(jcr->rpool->MigrationLowBytes, ed1),
+ edit_int64_with_commas(pool_bytes, ed2));
if (pool_bytes <= (int64_t)jcr->rpool->MigrationLowBytes) {
Dmsg0(dbglevel, "We should be done.\n");
break;
*/
char *edit_uint64_with_commas(uint64_t val, char *buf)
{
- /*
- * Replacement for sprintf(buf, "%" llu, val)
- */
- char mbuf[50];
- mbuf[sizeof(mbuf)-1] = 0;
- int i = sizeof(mbuf)-2; /* edit backward */
- if (val == 0) {
- mbuf[i--] = '0';
- } else {
- while (val != 0) {
- mbuf[i--] = "0123456789"[val%10];
- val /= 10;
- }
- }
- bstrncpy(buf, &mbuf[i+1], 27);
+ edit_uint64(val, buf);
return add_commas(buf, buf);
}
return buf;
}
+/*
+ * Edit an integer number with commas. The supplied buffer
+ * must be at least 27 bytes long. The incoming number
+ * is always widened to 64 bits.
+ */
+char *edit_int64_with_commas(int64_t val, char *buf)
+{
+ edit_int64(val, buf);
+ return add_commas(buf, buf);
+}
/*
* Given a string "str", separate the numeric part into
void term_last_jobs_list()
{
if (last_jobs) {
+ lock_last_jobs_list();
while (!last_jobs->empty()) {
void *je = last_jobs->first();
last_jobs->remove(je);
}
delete last_jobs;
last_jobs = NULL;
+ unlock_last_jobs_list();
}
if (jcrs) {
delete jcrs;
{
struct s_last_job *je, job;
uint32_t num;
+ bool ok = true;
Dmsg1(100, "read_last_jobs seek to %d\n", (int)addr);
if (addr == 0 || lseek(fd, (boffset_t)addr, SEEK_SET) < 0) {
if (num > 4 * max_last_jobs) { /* sanity check */
return false;
}
+ lock_last_jobs_list();
for ( ; num; num--) {
if (read(fd, &job, sizeof(job)) != sizeof(job)) {
berrno be;
Pmsg1(000, "Read job entry. ERR=%s\n", be.bstrerror());
- return false;
+ ok = false;
+ break;
}
if (job.JobId > 0) {
je = (struct s_last_job *)malloc(sizeof(struct s_last_job));
}
}
}
- return true;
+ unlock_last_jobs_list();
+ return ok;
}
uint64_t write_last_jobs_list(int fd, uint64_t addr)
{
struct s_last_job *je;
uint32_t num;
+ ssize_t stat;
Dmsg1(100, "write_last_jobs seek to %d\n", (int)addr);
if (lseek(fd, (boffset_t)addr, SEEK_SET) < 0) {
return 0;
}
if (last_jobs) {
+ lock_last_jobs_list();
/* First record is number of entires */
num = last_jobs->size();
if (write(fd, &num, sizeof(num)) != sizeof(num)) {
berrno be;
Pmsg1(000, "Error writing num_items: ERR=%s\n", be.bstrerror());
- return 0;
+ goto bail_out;
}
foreach_dlist(je, last_jobs) {
if (write(fd, je, sizeof(struct s_last_job)) != sizeof(struct s_last_job)) {
berrno be;
Pmsg1(000, "Error writing job: ERR=%s\n", be.bstrerror());
- return 0;
+ goto bail_out;
}
}
+ unlock_last_jobs_list();
}
/* Return current address */
- ssize_t stat = lseek(fd, 0, SEEK_CUR);
+ stat = lseek(fd, 0, SEEK_CUR);
if (stat < 0) {
stat = 0;
}
return stat;
+bail_out:
+ unlock_last_jobs_list();
+ return 0;
}
void lock_last_jobs_list()
case JT_ADMIN:
/* Keep list of last jobs, but not Console where JobId==0 */
if (jcr->JobId > 0) {
+ lock_last_jobs_list();
num_jobs_run++;
je = (struct s_last_job *)malloc(sizeof(struct s_last_job));
memset(je, 0, sizeof(struct s_last_job)); /* zero in case unset fields */
last_jobs->remove(je);
free(je);
}
+ unlock_last_jobs_list();
}
break;
default:
char * add_commas (char *val, char *buf);
char * edit_uint64 (uint64_t val, char *buf);
char * edit_int64 (int64_t val, char *buf);
+char * edit_int64_with_commas (int64_t val, char *buf);
bool duration_to_utime (char *str, utime_t *value);
bool size_to_uint64(char *str, int str_len, uint64_t *rtn_value);
char *edit_utime (utime_t val, char *buf, int buf_len);
General:
24Sep08
+kes This code should fix the race condition that leads to a Director
+ crash at job end time when the job list is updated. This was reported
+ in bug #1162.
kes Add more plugin documentation and fix include of config.h for
Win32 build (I hope).
22Sep08