From: Kern Sibbald Date: Wed, 24 Sep 2008 18:27:01 +0000 (+0000) Subject: This code should fix the race condition that leads to a Director X-Git-Tag: Release-7.0.0~4100 X-Git-Url: https://git.sur5r.net/?a=commitdiff_plain;h=6bad32d26efa9ee654db2ac1f8ccd533ba2b3168;p=bacula%2Fbacula This code should fix the race condition that leads to a Director crash at job end time when the job list is updated. This was reported in bug #1162. + enhance Migrate debug code git-svn-id: https://bacula.svn.sourceforge.net/svnroot/bacula/trunk@7632 91ce42f0-d328-0410-95d8-f526ca767f89 --- diff --git a/bacula/patches/2.4.2-jobend-crash.patch b/bacula/patches/2.4.2-jobend-crash.patch new file mode 100644 index 0000000000..82b577d2f3 --- /dev/null +++ b/bacula/patches/2.4.2-jobend-crash.patch @@ -0,0 +1,127 @@ + + This patch should fix the race condition that leads to a Director + crash at job end time when the job list is updated. This was reported + in bug #1162. + + Apply this patch to Bacula version 2.4.2 (and earlier) with: + + cd + patch -p0 <2.4.2-jobend-crash.patch + ./configure + make + ... + make install + + +Index: src/lib/jcr.c +=================================================================== +--- src/lib/jcr.c (revision 7627) ++++ src/lib/jcr.c (working copy) +@@ -110,6 +110,7 @@ + void term_last_jobs_list() + { + if (last_jobs) { ++ lock_last_jobs_list(); + while (!last_jobs->empty()) { + void *je = last_jobs->first(); + last_jobs->remove(je); +@@ -117,6 +118,7 @@ + } + delete last_jobs; + last_jobs = NULL; ++ unlock_last_jobs_list(); + } + if (jcrs) { + delete jcrs; +@@ -128,6 +130,7 @@ + { + struct s_last_job *je, job; + uint32_t num; ++ bool ok = true; + + Dmsg1(100, "read_last_jobs seek to %d\n", (int)addr); + if (addr == 0 || lseek(fd, (off_t)addr, SEEK_SET) < 0) { +@@ -140,11 +143,13 @@ + if (num > 4 * max_last_jobs) { /* sanity check */ + return false; + } ++ lock_last_jobs_list(); + for ( ; num; num--) { + if (read(fd, &job, sizeof(job)) != sizeof(job)) { + berrno be; + Pmsg1(000, "Read job entry. ERR=%s\n", be.bstrerror()); +- return false; ++ ok = false; ++ break; + } + if (job.JobId > 0) { + je = (struct s_last_job *)malloc(sizeof(struct s_last_job)); +@@ -160,41 +165,48 @@ + } + } + } +- return true; ++ unlock_last_jobs_list(); ++ return ok; + } + + uint64_t write_last_jobs_list(int fd, uint64_t addr) + { + struct s_last_job *je; + uint32_t num; ++ ssize_t stat; + + Dmsg1(100, "write_last_jobs seek to %d\n", (int)addr); + if (lseek(fd, (off_t)addr, SEEK_SET) < 0) { + return 0; + } + if (last_jobs) { ++ lock_last_jobs_list(); + /* First record is number of entires */ + num = last_jobs->size(); + if (write(fd, &num, sizeof(num)) != sizeof(num)) { + berrno be; + Pmsg1(000, "Error writing num_items: ERR=%s\n", be.bstrerror()); +- return 0; ++ goto bail_out; + } + foreach_dlist(je, last_jobs) { + if (write(fd, je, sizeof(struct s_last_job)) != sizeof(struct s_last_job)) { + berrno be; + Pmsg1(000, "Error writing job: ERR=%s\n", be.bstrerror()); +- return 0; ++ goto bail_out; + } + } ++ unlock_last_jobs_list(); + } + /* Return current address */ +- ssize_t stat = lseek(fd, 0, SEEK_CUR); ++ stat = lseek(fd, 0, SEEK_CUR); + if (stat < 0) { + stat = 0; + } + return stat; + ++bail_out: ++ unlock_last_jobs_list(); ++ return 0; + } + + void lock_last_jobs_list() +@@ -331,6 +343,7 @@ + last_job.end_time = time(NULL); + /* Keep list of last jobs, but not Console where JobId==0 */ + if (last_job.JobId > 0) { ++ lock_last_jobs_list(); + je = (struct s_last_job *)malloc(sizeof(struct s_last_job)); + memcpy((char *)je, (char *)&last_job, sizeof(last_job)); + if (!last_jobs) { +@@ -342,6 +355,7 @@ + last_jobs->remove(je); + free(je); + } ++ unlock_last_jobs_list(); + } + break; + default: diff --git a/bacula/src/dird/migrate.c b/bacula/src/dird/migrate.c index edc24b1f56..aca06de11e 100644 --- a/bacula/src/dird/migrate.c +++ b/bacula/src/dird/migrate.c @@ -753,10 +753,10 @@ static int get_job_to_migrate(JCR *jcr) goto bail_out; } pool_bytes -= ctx.value; - Dmsg1(dbglevel, "Total migrate Job bytes=%s\n", edit_int64(ctx.value, ed1)); + Dmsg1(dbglevel, "Total migrate Job bytes=%s\n", edit_int64_with_commas(ctx.value, ed1)); Dmsg2(dbglevel, "lowbytes=%s poolafter=%s\n", - edit_int64(jcr->rpool->MigrationLowBytes, ed1), - edit_int64(pool_bytes, ed2)); + edit_int64_with_commas(jcr->rpool->MigrationLowBytes, ed1), + edit_int64_with_commas(pool_bytes, ed2)); if (pool_bytes <= (int64_t)jcr->rpool->MigrationLowBytes) { Dmsg0(dbglevel, "We should be done.\n"); break; diff --git a/bacula/src/lib/edit.c b/bacula/src/lib/edit.c index ee435affb1..480253920f 100644 --- a/bacula/src/lib/edit.c +++ b/bacula/src/lib/edit.c @@ -91,21 +91,7 @@ int64_t str_to_int64(char *str) */ char *edit_uint64_with_commas(uint64_t val, char *buf) { - /* - * Replacement for sprintf(buf, "%" llu, val) - */ - char mbuf[50]; - mbuf[sizeof(mbuf)-1] = 0; - int i = sizeof(mbuf)-2; /* edit backward */ - if (val == 0) { - mbuf[i--] = '0'; - } else { - while (val != 0) { - mbuf[i--] = "0123456789"[val%10]; - val /= 10; - } - } - bstrncpy(buf, &mbuf[i+1], 27); + edit_uint64(val, buf); return add_commas(buf, buf); } @@ -195,6 +181,16 @@ char *edit_int64(int64_t val, char *buf) return buf; } +/* + * Edit an integer number with commas, the supplied buffer + * must be at least 27 bytes long. The incoming number + * is always widened to 64 bits. + */ +char *edit_int64_with_commas(int64_t val, char *buf) +{ + edit_int64(val, buf); + return add_commas(buf, buf); +} /* * Given a string "str", separate the numeric part into diff --git a/bacula/src/lib/jcr.c b/bacula/src/lib/jcr.c index 057df53f50..7b2231ded0 100644 --- a/bacula/src/lib/jcr.c +++ b/bacula/src/lib/jcr.c @@ -124,6 +124,7 @@ void init_last_jobs_list() void term_last_jobs_list() { if (last_jobs) { + lock_last_jobs_list(); while (!last_jobs->empty()) { void *je = last_jobs->first(); last_jobs->remove(je); @@ -131,6 +132,7 @@ void term_last_jobs_list() } delete last_jobs; last_jobs = NULL; + unlock_last_jobs_list(); } if (jcrs) { delete jcrs; @@ -142,6 +144,7 @@ bool read_last_jobs_list(int fd, uint64_t addr) { struct s_last_job *je, job; uint32_t num; + bool ok = true; Dmsg1(100, "read_last_jobs seek to %d\n", (int)addr); if (addr == 0 || lseek(fd, (boffset_t)addr, SEEK_SET) < 0) { @@ -154,11 +157,13 @@ bool read_last_jobs_list(int fd, uint64_t addr) if (num > 4 * max_last_jobs) { /* sanity check */ return false; } + lock_last_jobs_list(); for ( ; num; num--) { if (read(fd, &job, sizeof(job)) != sizeof(job)) { berrno be; Pmsg1(000, "Read job entry. ERR=%s\n", be.bstrerror()); - return false; + ok = false; + break; } if (job.JobId > 0) { je = (struct s_last_job *)malloc(sizeof(struct s_last_job)); @@ -174,41 +179,48 @@ bool read_last_jobs_list(int fd, uint64_t addr) } } } - return true; + unlock_last_jobs_list(); + return ok; } uint64_t write_last_jobs_list(int fd, uint64_t addr) { struct s_last_job *je; uint32_t num; + ssize_t stat; Dmsg1(100, "write_last_jobs seek to %d\n", (int)addr); if (lseek(fd, (boffset_t)addr, SEEK_SET) < 0) { return 0; } if (last_jobs) { + lock_last_jobs_list(); /* First record is number of entires */ num = last_jobs->size(); if (write(fd, &num, sizeof(num)) != sizeof(num)) { berrno be; Pmsg1(000, "Error writing num_items: ERR=%s\n", be.bstrerror()); - return 0; + goto bail_out; } foreach_dlist(je, last_jobs) { if (write(fd, je, sizeof(struct s_last_job)) != sizeof(struct s_last_job)) { berrno be; Pmsg1(000, "Error writing job: ERR=%s\n", be.bstrerror()); - return 0; + goto bail_out; } } + unlock_last_jobs_list(); } /* Return current address */ - ssize_t stat = lseek(fd, 0, SEEK_CUR); + stat = lseek(fd, 0, SEEK_CUR); if (stat < 0) { stat = 0; } return stat; +bail_out: + unlock_last_jobs_list(); + return 0; } void lock_last_jobs_list() @@ -486,6 +498,7 @@ void free_jcr(JCR *jcr) case JT_ADMIN: /* Keep list of last jobs, but not Console where JobId==0 */ if (jcr->JobId > 0) { + lock_last_jobs_list(); num_jobs_run++; je = (struct s_last_job *)malloc(sizeof(struct s_last_job)); memset(je, 0, sizeof(struct s_last_job)); /* zero in case unset fields */ @@ -511,6 +524,7 @@ void free_jcr(JCR *jcr) last_jobs->remove(je); free(je); } + unlock_last_jobs_list(); } break; default: diff --git a/bacula/src/lib/protos.h b/bacula/src/lib/protos.h index 3429779c68..804a15068e 100644 --- a/bacula/src/lib/protos.h +++ b/bacula/src/lib/protos.h @@ -181,6 +181,7 @@ char * edit_uint64_with_suffix (uint64_t val, char *buf); char * add_commas (char *val, char *buf); char * edit_uint64 (uint64_t val, char *buf); char * edit_int64 (int64_t val, char *buf); +char * edit_int64_with_commas (int64_t val, char *buf); bool duration_to_utime (char *str, utime_t *value); bool size_to_uint64(char *str, int str_len, uint64_t *rtn_value); char *edit_utime (utime_t val, char *buf, int buf_len); diff --git a/bacula/technotes-2.5 b/bacula/technotes-2.5 index 4c9ec3f11c..e1b3f659f8 100644 --- a/bacula/technotes-2.5 +++ b/bacula/technotes-2.5 @@ -19,6 +19,9 @@ remove reader/writer in FOPTS???? General: 24Sep08 +kes This code should fix the race condition that leads to a Director + crash at job end time when the job list is updated. This was reported + in bug #1162. kes Add more plugin documentation and fix include of config.h for Win32 build (I hope). 22Sep08