From da6919384ac7d556c742067e8fefea326710ccdf Mon Sep 17 00:00:00 2001 From: Kern Sibbald Date: Thu, 3 Jun 2010 14:20:03 +0200 Subject: [PATCH] Fix bug #1582 Restore from multiple storage daemons breaks subsequent backups --- bacula/src/dird/jobq.c | 35 +++++++++++++++++++++++--------- bacula/src/dird/protos.h | 4 ++++ bacula/src/dird/restore.c | 42 +++++++++++++++++++++++++++++++++++---- 3 files changed, 68 insertions(+), 13 deletions(-) diff --git a/bacula/src/dird/jobq.c b/bacula/src/dird/jobq.c index c324d45111..68febb4075 100644 --- a/bacula/src/dird/jobq.c +++ b/bacula/src/dird/jobq.c @@ -1,7 +1,7 @@ /* Bacula® - The Network Backup Solution - Copyright (C) 2003-2009 Free Software Foundation Europe e.V. + Copyright (C) 2003-2010 Free Software Foundation Europe e.V. The main author of Bacula is Kern Sibbald, with contributions from many others, a complete list can be found in the file AUTHORS. @@ -56,7 +56,6 @@ extern "C" void *sched_wait(void *arg); static int start_server(jobq_t *jq); static bool acquire_resources(JCR *jcr); static bool reschedule_job(JCR *jcr, jobq_t *jq, jobq_item_t *je); -static void dec_read_store(JCR *jcr); static void dec_write_store(JCR *jcr); /* @@ -710,7 +709,7 @@ static bool acquire_resources(JCR *jcr) * but we do not really have enough information here to * know if this is really a deadlock (it may be a dual drive * autochanger), and in principle, the SD reservation system - * should detect these deadlocks, so push the work off on is. + * should detect these deadlocks, so push the work off on it. 
*/ #ifdef xxx if (jcr->rstore && jcr->rstore == jcr->wstore) { /* possible deadlock */ @@ -723,11 +722,7 @@ static bool acquire_resources(JCR *jcr) #endif if (jcr->rstore) { Dmsg1(200, "Rstore=%s\n", jcr->rstore->name()); - if (jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) { - jcr->rstore->NumConcurrentReadJobs++; - jcr->rstore->NumConcurrentJobs++; - Dmsg1(200, "Inc rncj=%d\n", jcr->rstore->NumConcurrentJobs); - } else { + if (!inc_read_store(jcr)) { Dmsg1(200, "Fail rncj=%d\n", jcr->rstore->NumConcurrentJobs); set_jcr_job_status(jcr, JS_WaitStoreRes); return false; @@ -776,12 +771,34 @@ static bool acquire_resources(JCR *jcr) return true; } -static void dec_read_store(JCR *jcr) +static pthread_mutex_t rstore_mutex = PTHREAD_MUTEX_INITIALIZER; + +/* + * Note: inc_read_store() and dec_read_store() are + * called from select_rstore() in src/dird/restore.c + */ +bool inc_read_store(JCR *jcr) +{ + P(rstore_mutex); + if (jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) { + jcr->rstore->NumConcurrentReadJobs++; + jcr->rstore->NumConcurrentJobs++; + Dmsg1(200, "Inc rncj=%d\n", jcr->rstore->NumConcurrentJobs); + V(rstore_mutex); + return true; + } + V(rstore_mutex); + return false; +} + +void dec_read_store(JCR *jcr) { if (jcr->rstore) { + P(rstore_mutex); jcr->rstore->NumConcurrentReadJobs--; /* back out rstore */ jcr->rstore->NumConcurrentJobs--; /* back out rstore */ Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs); + V(rstore_mutex); ASSERT(jcr->rstore->NumConcurrentReadJobs >= 0); ASSERT(jcr->rstore->NumConcurrentJobs >= 0); } diff --git a/bacula/src/dird/protos.h b/bacula/src/dird/protos.h index 25882479ac..b7db3006a3 100644 --- a/bacula/src/dird/protos.h +++ b/bacula/src/dird/protos.h @@ -143,6 +143,10 @@ extern void cancel_storage_daemon_job(JCR *jcr); extern bool run_console_command(JCR *jcr, const char *cmd); extern void sd_msg_thread_send_signal(JCR *jcr, int sig); +/* jobq.c */ +extern bool inc_read_store(JCR 
*jcr); +extern void dec_read_store(JCR *jcr); + /* migration.c */ extern bool do_migration(JCR *jcr); extern bool do_migration_init(JCR *jcr); diff --git a/bacula/src/dird/restore.c b/bacula/src/dird/restore.c index 098dd4d7dc..e10487dd54 100644 --- a/bacula/src/dird/restore.c +++ b/bacula/src/dird/restore.c @@ -257,31 +257,63 @@ static bool send_bootstrap_file(JCR *jcr, BSOCK *sock, return true; } +#define MAX_TRIES 6 * 360 /* 6 hours */ + /** * Change the read storage resource for the current job. */ -static void select_rstore(JCR *jcr, bootstrap_info &info) +static bool select_rstore(JCR *jcr, bootstrap_info &info) { USTORE ustore; + int i; + if (!strcmp(jcr->rstore->name(), info.storage)) { - return; + return true; /* same SD nothing to change */ } if (!(ustore.store = (STORE *)GetResWithName(R_STORAGE,info.storage))) { Jmsg(jcr, M_FATAL, 0, _("Could not get storage resource '%s'.\n"), info.storage); set_jcr_job_status(jcr, JS_ErrorTerminated); - return; + return false; } + /* + * What does this do??????????? 
KES + */ if (jcr->store_bsock) { jcr->store_bsock->destroy(); jcr->store_bsock = NULL; } + /* + * release current read storage and get a new one + */ + dec_read_store(jcr); free_rstorage(jcr); set_rstorage(jcr, &ustore); + set_jcr_job_status(jcr, JS_WaitSD); + /* + * Wait for up to 6 hours to increment read storage counter + */ + for (i=0; i < MAX_TRIES; i++) { + /* try to get read storage counter incremented */ + if (inc_read_store(jcr)) { + set_jcr_job_status(jcr, JS_Running); + return true; + } + bmicrosleep(10, 0); /* sleep 10 secs */ + if (job_canceled(jcr)) { + free_rstorage(jcr); + return false; + } + } + /* Failed to inc_read_store() */ + free_rstorage(jcr); + Jmsg(jcr, M_FATAL, 0, + _("Could not acquire read storage lock for \"%s\""), info.storage); + return false; } /* @@ -325,7 +357,9 @@ bool restore_bootstrap(JCR *jcr) /* Read the bootstrap file */ while (!feof(info.bs)) { - select_rstore(jcr, info); + if (!select_rstore(jcr, info)) { + goto bail_out; + } /** * Open a message channel connection with the Storage -- 2.39.5