From: Kern Sibbald Date: Wed, 24 Sep 2008 16:13:15 +0000 (+0000) Subject: This code should fix the race condition that leads to a Director X-Git-Tag: Release-2.4.3~12 X-Git-Url: https://git.sur5r.net/?a=commitdiff_plain;h=d50b8f4134e7874658e57ba496502a08e7b912af;p=bacula%2Fbacula This code should fix the race condition that leads to a Director crash at job end time when the job list is updated. This was reported in bug #1162. git-svn-id: https://bacula.svn.sourceforge.net/svnroot/bacula/branches/Branch-2.4@7628 91ce42f0-d328-0410-95d8-f526ca767f89 --- diff --git a/bacula/patches/2.4.2-jobend-crash.patch b/bacula/patches/2.4.2-jobend-crash.patch new file mode 100644 index 0000000000..9cac2aea4e --- /dev/null +++ b/bacula/patches/2.4.2-jobend-crash.patch @@ -0,0 +1,119 @@ + + This patch should fix the race condition that leads to a Director + crash at job end time when the job list is updated. This was reported + in bug #1162. + + Apply this patch to Bacula version 2.4.2 (and earlier) with: + + cd + patch -p0 <2.4.2-jobend-crash.patch + ./configure + make + ... 
+ make install + + +Index: src/lib/jcr.c +=================================================================== +--- src/lib/jcr.c (revision 7566) ++++ src/lib/jcr.c (working copy) +@@ -110,6 +110,7 @@ + void term_last_jobs_list() + { + if (last_jobs) { ++ lock_last_jobs_list(); + while (!last_jobs->empty()) { + void *je = last_jobs->first(); + last_jobs->remove(je); +@@ -117,6 +118,7 @@ + } + delete last_jobs; + last_jobs = NULL; ++ unlock_last_jobs_list(); + } + if (jcrs) { + delete jcrs; +@@ -128,6 +130,7 @@ + { + struct s_last_job *je, job; + uint32_t num; ++ bool ok = true; + + Dmsg1(100, "read_last_jobs seek to %d\n", (int)addr); + if (addr == 0 || lseek(fd, (off_t)addr, SEEK_SET) < 0) { +@@ -140,11 +143,13 @@ + if (num > 4 * max_last_jobs) { /* sanity check */ + return false; + } ++ lock_last_jobs_list(); + for ( ; num; num--) { + if (read(fd, &job, sizeof(job)) != sizeof(job)) { + berrno be; + Pmsg1(000, "Read job entry. ERR=%s\n", be.bstrerror()); +- return false; ++ ok = false; ++ break; + } + if (job.JobId > 0) { + je = (struct s_last_job *)malloc(sizeof(struct s_last_job)); +@@ -160,7 +165,8 @@ + } + } + } +- return true; ++ unlock_last_jobs_list(); ++ return ok; + } + + uint64_t write_last_jobs_list(int fd, uint64_t addr) +@@ -173,20 +179,22 @@ + return 0; + } + if (last_jobs) { ++ lock_last_jobs_list(); + /* First record is number of entires */ + num = last_jobs->size(); + if (write(fd, &num, sizeof(num)) != sizeof(num)) { + berrno be; + Pmsg1(000, "Error writing num_items: ERR=%s\n", be.bstrerror()); +- return 0; ++ goto bail_out; + } + foreach_dlist(je, last_jobs) { + if (write(fd, je, sizeof(struct s_last_job)) != sizeof(struct s_last_job)) { + berrno be; + Pmsg1(000, "Error writing job: ERR=%s\n", be.bstrerror()); +- return 0; ++ goto bail_out; + } + } ++ unlock_last_jobs_list(); + } + /* Return current address */ + ssize_t stat = lseek(fd, 0, SEEK_CUR); +@@ -195,6 +203,9 @@ + } + return stat; + ++bail_out: ++ unlock_last_jobs_list(); ++ return 0; + 
} + + void lock_last_jobs_list() +@@ -331,6 +342,7 @@ + last_job.end_time = time(NULL); + /* Keep list of last jobs, but not Console where JobId==0 */ + if (last_job.JobId > 0) { ++ lock_last_jobs_list(); + je = (struct s_last_job *)malloc(sizeof(struct s_last_job)); + memcpy((char *)je, (char *)&last_job, sizeof(last_job)); + if (!last_jobs) { +@@ -342,6 +354,7 @@ + last_jobs->remove(je); + free(je); + } ++ unlock_last_jobs_list(); + } + break; + default: diff --git a/bacula/src/lib/jcr.c b/bacula/src/lib/jcr.c index cd406150e4..d68701436d 100644 --- a/bacula/src/lib/jcr.c +++ b/bacula/src/lib/jcr.c @@ -110,6 +110,7 @@ void init_last_jobs_list() void term_last_jobs_list() { if (last_jobs) { + lock_last_jobs_list(); while (!last_jobs->empty()) { void *je = last_jobs->first(); last_jobs->remove(je); @@ -117,6 +118,7 @@ void term_last_jobs_list() } delete last_jobs; last_jobs = NULL; + unlock_last_jobs_list(); } if (jcrs) { delete jcrs; @@ -128,6 +130,7 @@ bool read_last_jobs_list(int fd, uint64_t addr) { struct s_last_job *je, job; uint32_t num; + bool ok = true; Dmsg1(100, "read_last_jobs seek to %d\n", (int)addr); if (addr == 0 || lseek(fd, (off_t)addr, SEEK_SET) < 0) { @@ -140,11 +143,13 @@ bool read_last_jobs_list(int fd, uint64_t addr) if (num > 4 * max_last_jobs) { /* sanity check */ return false; } + lock_last_jobs_list(); for ( ; num; num--) { if (read(fd, &job, sizeof(job)) != sizeof(job)) { berrno be; Pmsg1(000, "Read job entry. 
ERR=%s\n", be.bstrerror()); - return false; + ok = false; + break; } if (job.JobId > 0) { je = (struct s_last_job *)malloc(sizeof(struct s_last_job)); @@ -160,7 +165,8 @@ bool read_last_jobs_list(int fd, uint64_t addr) } } } - return true; + unlock_last_jobs_list(); + return ok; } uint64_t write_last_jobs_list(int fd, uint64_t addr) @@ -173,20 +179,22 @@ uint64_t write_last_jobs_list(int fd, uint64_t addr) return 0; } if (last_jobs) { + lock_last_jobs_list(); /* First record is number of entires */ num = last_jobs->size(); if (write(fd, &num, sizeof(num)) != sizeof(num)) { berrno be; Pmsg1(000, "Error writing num_items: ERR=%s\n", be.bstrerror()); - return 0; + goto bail_out; } foreach_dlist(je, last_jobs) { if (write(fd, je, sizeof(struct s_last_job)) != sizeof(struct s_last_job)) { berrno be; Pmsg1(000, "Error writing job: ERR=%s\n", be.bstrerror()); - return 0; + goto bail_out; } } + unlock_last_jobs_list(); } /* Return current address */ ssize_t stat = lseek(fd, 0, SEEK_CUR); @@ -195,6 +203,9 @@ uint64_t write_last_jobs_list(int fd, uint64_t addr) } return stat; +bail_out: + unlock_last_jobs_list(); + return 0; } void lock_last_jobs_list() @@ -331,6 +342,7 @@ static void free_common_jcr(JCR *jcr) last_job.end_time = time(NULL); /* Keep list of last jobs, but not Console where JobId==0 */ if (last_job.JobId > 0) { + lock_last_jobs_list(); je = (struct s_last_job *)malloc(sizeof(struct s_last_job)); memcpy((char *)je, (char *)&last_job, sizeof(last_job)); if (!last_jobs) { @@ -342,6 +354,7 @@ static void free_common_jcr(JCR *jcr) last_jobs->remove(je); free(je); } + unlock_last_jobs_list(); } break; default: diff --git a/bacula/technotes-2.4 b/bacula/technotes-2.4 index 65809202a6..cf1cfb9c8a 100644 --- a/bacula/technotes-2.4 +++ b/bacula/technotes-2.4 @@ -1,6 +1,10 @@ Technical notes on version 2.4 General: +28Sep08 +kes This code should fix the race condition that leads to a Director + crash at job end time when the job list is updated. 
This was reported + in bug #1162. 20Sep08 kes Remove all double quotes from SQLite creating script and replace by single quotes as suggested by John Huttley.