From 13383854a40f5698846fd1672068a4df4c849bff Mon Sep 17 00:00:00 2001 From: Kern Sibbald Date: Fri, 28 Sep 2007 22:01:16 +0000 Subject: [PATCH] es Fix FD->SD authorization failure, which was due to spurious wakeups from a pthread_cond_timedwait(). Simply check the predicate before continuing. This fixes bug #953 git-svn-id: https://bacula.svn.sourceforge.net/svnroot/bacula/branches/Branch-2.2@5686 91ce42f0-d328-0410-95d8-f526ca767f89 --- bacula/patches/2.2.4-sd-auth-fail.patch | 159 ++++++++++++++++++++++++ bacula/src/stored/job.c | 43 ++++--- bacula/technotes-2.1 | 4 + 3 files changed, 186 insertions(+), 20 deletions(-) create mode 100644 bacula/patches/2.2.4-sd-auth-fail.patch diff --git a/bacula/patches/2.2.4-sd-auth-fail.patch b/bacula/patches/2.2.4-sd-auth-fail.patch new file mode 100644 index 0000000000..2e506e7884 --- /dev/null +++ b/bacula/patches/2.2.4-sd-auth-fail.patch @@ -0,0 +1,159 @@ + + This patch applies to Bacula version 2.2.4 (possibly earlier 2.2.x versions) + and fixes a Storage daemon authentication problem with the FD. This fixes + bug #953. The patch also adds a bit of additional debug code and significantly + strengthens the SD session key. + + Apply it to 2.2.4 with: + + cd <bacula-source> + patch -p0 <2.2.4-sd-auth-fail.patch + make + ... 
+ make install + + +Index: src/stored/job.c +=================================================================== +--- src/stored/job.c (revision 5602) ++++ src/stored/job.c (working copy) +@@ -73,6 +73,7 @@ + { + int JobId; + char auth_key[100]; ++ char seed[100]; + BSOCK *dir = jcr->dir_bsock; + POOL_MEM job_name, client_name, job, fileset_name, fileset_md5; + int JobType, level, spool_attributes, no_attributes, spool_data; +@@ -91,7 +92,7 @@ + &write_part_after_job, &PreferMountedVols); + if (stat != 13) { + pm_strcpy(jcr->errmsg, dir->msg); +- bnet_fsend(dir, BAD_job, stat, jcr->errmsg); ++ dir->fsend(BAD_job, stat, jcr->errmsg); + Dmsg1(100, ">dird: %s", dir->msg); + set_jcr_job_status(jcr, JS_ErrorTerminated); + return false; +@@ -134,9 +135,10 @@ + /* + * Pass back an authorization key for the File daemon + */ +- make_session_key(auth_key, NULL, 1); +- bnet_fsend(dir, OKjob, jcr->VolSessionId, jcr->VolSessionTime, auth_key); +- Dmsg1(100, ">dird: %s", dir->msg); ++ bsnprintf(seed, sizeof(seed), "%p%d", jcr, JobId); ++ make_session_key(auth_key, seed, 1); ++ dir->fsend(OKjob, jcr->VolSessionId, jcr->VolSessionTime, auth_key); ++ Dmsg2(100, ">dird jid=%u: %s", (uint32_t)jcr->JobId, dir->msg); + jcr->sd_auth_key = bstrdup(auth_key); + memset(auth_key, 0, sizeof(auth_key)); + generate_daemon_event(jcr, "JobStart"); +@@ -169,17 +171,18 @@ + timeout.tv_nsec = tv.tv_usec * 1000; + timeout.tv_sec = tv.tv_sec + me->client_wait; + +- Dmsg2(100, "%s waiting %d sec for FD to contact SD\n", +- jcr->Job, (int)me->client_wait); ++ Dmsg3(050, "%s waiting %d sec for FD to contact SD key=%s\n", ++ jcr->Job, (int)me->client_wait, jcr->sd_auth_key); ++ + /* + * Wait for the File daemon to contact us to start the Job, + * when he does, we will be released, unless the 30 minutes + * expires. 
+ */ + P(mutex); +- for ( ; !job_canceled(jcr); ) { ++ while ( !jcr->authenticated && !job_canceled(jcr) ) { + errstat = pthread_cond_timedwait(&jcr->job_start_wait, &mutex, &timeout); +- if (errstat == 0 || errstat == ETIMEDOUT) { ++ if (errstat == ETIMEDOUT || errstat == EINVAL || errstat == EPERM) { + break; + } + } +@@ -195,7 +198,7 @@ + } + + /* +- * After receiving a connection (in job.c) if it is ++ * After receiving a connection (in dircmd.c) if it is + * from the File daemon, this routine is called. + */ + void handle_filed_connection(BSOCK *fd, char *job_name) +@@ -204,8 +207,8 @@ + + bmicrosleep(0, 50000); /* wait 50 millisecs */ + if (!(jcr=get_jcr_by_full_name(job_name))) { +- Jmsg1(NULL, M_FATAL, 0, _("Job name not found: %s\n"), job_name); +- Dmsg1(100, "Job name not found: %s\n", job_name); ++ Jmsg1(NULL, M_FATAL, 0, _("FD connect failed: Job name not found: %s\n"), job_name); ++ Dmsg1(3, "**** Job \"%s\" not found", job_name); + return; + } + +@@ -216,7 +219,7 @@ + + if (jcr->authenticated) { + Jmsg2(jcr, M_FATAL, 0, _("Hey!!!! 
JobId %u Job %s already authenticated.\n"), +- jcr->JobId, jcr->Job); ++ (uint32_t)jcr->JobId, jcr->Job); + free_jcr(jcr); + return; + } +@@ -229,7 +232,7 @@ + Jmsg(jcr, M_FATAL, 0, _("Unable to authenticate File daemon\n")); + } else { + jcr->authenticated = true; +- Dmsg1(110, "OK Authentication Job %s\n", jcr->Job); ++ Dmsg2(110, "OK Authentication jid=%u Job %s\n", (uint32_t)jcr->JobId, jcr->Job); + } + + if (!jcr->authenticated) { +@@ -274,9 +277,9 @@ + } + ok = dir_update_device(jcr, device->dev); + if (ok) { +- ok = bnet_fsend(dir, OK_query); ++ ok = dir->fsend(OK_query); + } else { +- bnet_fsend(dir, NO_query); ++ dir->fsend(NO_query); + } + return ok; + } +@@ -289,9 +292,9 @@ + } + ok = dir_update_changer(jcr, changer); + if (ok) { +- ok = bnet_fsend(dir, OK_query); ++ ok = dir->fsend(OK_query); + } else { +- bnet_fsend(dir, NO_query); ++ dir->fsend(NO_query); + } + return ok; + } +@@ -299,12 +302,12 @@ + /* If we get here, the device/autochanger was not found */ + unbash_spaces(dir->msg); + pm_strcpy(jcr->errmsg, dir->msg); +- bnet_fsend(dir, NO_device, dev_name.c_str()); ++ dir->fsend(NO_device, dev_name.c_str()); + Dmsg1(100, ">dird: %s\n", dir->msg); + } else { + unbash_spaces(dir->msg); + pm_strcpy(jcr->errmsg, dir->msg); +- bnet_fsend(dir, BAD_query, jcr->errmsg); ++ dir->fsend(BAD_query, jcr->errmsg); + Dmsg1(100, ">dird: %s\n", dir->msg); + } + +@@ -322,7 +325,7 @@ + { + Dmsg1(900, "stored_free_jcr JobId=%u\n", jcr->JobId); + if (jcr->file_bsock) { +- bnet_close(jcr->file_bsock); ++ jcr->file_bsock->close(); + jcr->file_bsock = NULL; + } + if (jcr->job_name) { diff --git a/bacula/src/stored/job.c b/bacula/src/stored/job.c index a0454fd97b..79b7aa58a5 100644 --- a/bacula/src/stored/job.c +++ b/bacula/src/stored/job.c @@ -73,6 +73,7 @@ bool job_cmd(JCR *jcr) { int JobId; char auth_key[100]; + char seed[100]; BSOCK *dir = jcr->dir_bsock; POOL_MEM job_name, client_name, job, fileset_name, fileset_md5; int JobType, level, spool_attributes, 
no_attributes, spool_data; @@ -91,7 +92,7 @@ bool job_cmd(JCR *jcr) &write_part_after_job, &PreferMountedVols); if (stat != 13) { pm_strcpy(jcr->errmsg, dir->msg); - bnet_fsend(dir, BAD_job, stat, jcr->errmsg); + dir->fsend(BAD_job, stat, jcr->errmsg); Dmsg1(100, ">dird: %s", dir->msg); set_jcr_job_status(jcr, JS_ErrorTerminated); return false; @@ -134,9 +135,10 @@ bool job_cmd(JCR *jcr) /* * Pass back an authorization key for the File daemon */ - make_session_key(auth_key, NULL, 1); - bnet_fsend(dir, OKjob, jcr->VolSessionId, jcr->VolSessionTime, auth_key); - Dmsg1(100, ">dird: %s", dir->msg); + bsnprintf(seed, sizeof(seed), "%p%d", jcr, JobId); + make_session_key(auth_key, seed, 1); + dir->fsend(OKjob, jcr->VolSessionId, jcr->VolSessionTime, auth_key); + Dmsg2(100, ">dird jid=%u: %s", (uint32_t)jcr->JobId, dir->msg); jcr->sd_auth_key = bstrdup(auth_key); memset(auth_key, 0, sizeof(auth_key)); generate_daemon_event(jcr, "JobStart"); @@ -169,17 +171,18 @@ bool run_cmd(JCR *jcr) timeout.tv_nsec = tv.tv_usec * 1000; timeout.tv_sec = tv.tv_sec + me->client_wait; - Dmsg2(100, "%s waiting %d sec for FD to contact SD\n", - jcr->Job, (int)me->client_wait); + Dmsg3(050, "%s waiting %d sec for FD to contact SD key=%s\n", + jcr->Job, (int)me->client_wait, jcr->sd_auth_key); + /* * Wait for the File daemon to contact us to start the Job, * when he does, we will be released, unless the 30 minutes * expires. */ P(mutex); - for ( ; !job_canceled(jcr); ) { + while ( !jcr->authenticated && !job_canceled(jcr) ) { errstat = pthread_cond_timedwait(&jcr->job_start_wait, &mutex, &timeout); - if (errstat == 0 || errstat == ETIMEDOUT) { + if (errstat == ETIMEDOUT || errstat == EINVAL || errstat == EPERM) { break; } } @@ -195,7 +198,7 @@ bool run_cmd(JCR *jcr) } /* - * After receiving a connection (in job.c) if it is + * After receiving a connection (in dircmd.c) if it is * from the File daemon, this routine is called. 
*/ void handle_filed_connection(BSOCK *fd, char *job_name) @@ -204,8 +207,8 @@ void handle_filed_connection(BSOCK *fd, char *job_name) bmicrosleep(0, 50000); /* wait 50 millisecs */ if (!(jcr=get_jcr_by_full_name(job_name))) { - Jmsg1(NULL, M_FATAL, 0, _("Job name not found: %s\n"), job_name); - Dmsg1(100, "Job name not found: %s\n", job_name); + Jmsg1(NULL, M_FATAL, 0, _("FD connect failed: Job name not found: %s\n"), job_name); + Dmsg1(3, "**** Job \"%s\" not found", job_name); return; } @@ -216,7 +219,7 @@ void handle_filed_connection(BSOCK *fd, char *job_name) if (jcr->authenticated) { Jmsg2(jcr, M_FATAL, 0, _("Hey!!!! JobId %u Job %s already authenticated.\n"), - jcr->JobId, jcr->Job); + (uint32_t)jcr->JobId, jcr->Job); free_jcr(jcr); return; } @@ -229,7 +232,7 @@ void handle_filed_connection(BSOCK *fd, char *job_name) Jmsg(jcr, M_FATAL, 0, _("Unable to authenticate File daemon\n")); } else { jcr->authenticated = true; - Dmsg1(110, "OK Authentication Job %s\n", jcr->Job); + Dmsg2(110, "OK Authentication jid=%u Job %s\n", (uint32_t)jcr->JobId, jcr->Job); } if (!jcr->authenticated) { @@ -274,9 +277,9 @@ bool query_cmd(JCR *jcr) } ok = dir_update_device(jcr, device->dev); if (ok) { - ok = bnet_fsend(dir, OK_query); + ok = dir->fsend(OK_query); } else { - bnet_fsend(dir, NO_query); + dir->fsend(NO_query); } return ok; } @@ -289,9 +292,9 @@ bool query_cmd(JCR *jcr) } ok = dir_update_changer(jcr, changer); if (ok) { - ok = bnet_fsend(dir, OK_query); + ok = dir->fsend(OK_query); } else { - bnet_fsend(dir, NO_query); + dir->fsend(NO_query); } return ok; } @@ -299,12 +302,12 @@ bool query_cmd(JCR *jcr) /* If we get here, the device/autochanger was not found */ unbash_spaces(dir->msg); pm_strcpy(jcr->errmsg, dir->msg); - bnet_fsend(dir, NO_device, dev_name.c_str()); + dir->fsend(NO_device, dev_name.c_str()); Dmsg1(100, ">dird: %s\n", dir->msg); } else { unbash_spaces(dir->msg); pm_strcpy(jcr->errmsg, dir->msg); - bnet_fsend(dir, BAD_query, jcr->errmsg); + 
dir->fsend(BAD_query, jcr->errmsg); Dmsg1(100, ">dird: %s\n", dir->msg); } @@ -322,7 +325,7 @@ void stored_free_jcr(JCR *jcr) { Dmsg1(900, "stored_free_jcr JobId=%u\n", jcr->JobId); if (jcr->file_bsock) { - bnet_close(jcr->file_bsock); + jcr->file_bsock->close(); jcr->file_bsock = NULL; } if (jcr->job_name) { diff --git a/bacula/technotes-2.1 b/bacula/technotes-2.1 index 05a0c6e2b2..1fcf3236c9 100644 --- a/bacula/technotes-2.1 +++ b/bacula/technotes-2.1 @@ -1,6 +1,10 @@ Technical notes on version 2.2 General: +27Sep07 +kes Fix FD->SD authorization failure, which was due to spurious + wakeups from a pthread_cond_timedwait(). Simply check the + predicate before continuing. This fixes bug #953 22Sep07 kes Add code to handle tray monitor separated from Win32 FD. kes Fix display of Win32 tray monitor after reboot. Fixes bug #952. -- 2.39.5