X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=bacula%2Fpatches%2Ftesting%2Fbug_1166_cancel_read.patch;h=f57ce91d477b73abcc22c31f7f0aa59efa2f7414;hb=4c606c98c24f2070e86ec5b99da76b65341fdb3a;hp=c25e1d017b686566d7781ba4ce6998094841f850;hpb=1d004c676c8f9884dc2c7fe8266b3660019f8fc4;p=bacula%2Fbacula diff --git a/bacula/patches/testing/bug_1166_cancel_read.patch b/bacula/patches/testing/bug_1166_cancel_read.patch index c25e1d017b..f57ce91d47 100644 --- a/bacula/patches/testing/bug_1166_cancel_read.patch +++ b/bacula/patches/testing/bug_1166_cancel_read.patch @@ -1,118 +1,128 @@ Index: src/dird/backup.c =================================================================== ---- src/dird/backup.c (revision 7725) -+++ src/dird/backup.c (working copy) -@@ -193,6 +193,7 @@ - Jmsg(jcr, M_INFO, 0, _("Start Backup JobId %s, Job=%s\n"), - edit_uint64(jcr->JobId, ed1), jcr->Job); +--- src/dird/backup.c (révision 7736) ++++ src/dird/backup.c (copie de travail) +@@ -326,24 +326,26 @@ + } + return false; -+ jcr->set_owner(); /* we are responsible for this job */ - set_jcr_job_status(jcr, JS_Running); - Dmsg2(100, "JobId=%d JobLevel=%c\n", jcr->jr.JobId, jcr->jr.JobLevel); - if (!db_update_job_start_record(jcr, jcr->db, &jcr->jr)) { -@@ -361,8 +362,10 @@ - if (timeout) { - tid = start_bsock_timer(fd, timeout); /* TODO: New timeout directive??? */ - } -+ Dmsg0(1, "======== Wait for a message\n"); - /* Wait for Client to terminate */ - while ((n = bget_dirmsg(fd)) >= 0) { -+ Dmsg1(1, "======= Get client message=%s\n", fd->msg); - if (!fd_ok && - (sscanf(fd->msg, EndJob, &jcr->FDJobStatus, &JobFiles, - &ReadBytes, &JobBytes, &Errors, &VSS, &Encrypt) == 7 || -Index: src/dird/job.c -=================================================================== ---- src/dird/job.c (revision 7731) -+++ src/dird/job.c (working copy) -@@ -390,6 +390,14 @@ - jobq_remove(&job_queue, jcr); /* attempt to remove it from queue */ - return true; - -+ case JS_Running: -+ if (jcr->get_JobType() == JT_BACKUP && jcr->file_bsock) { -+ /* When in JS_Running state, the main thread can wait for -+ * EndJob message from the Client. We send a signal to the job -+ * thread to cancel the read() -+ */ -+ jcr->send_signal_to_owner(TIMEOUT_SIGNAL); -+ } - default: - /* Cancel File daemon */ - if (jcr->file_bsock) { -Index: src/jcr.h -=================================================================== ---- src/jcr.h (revision 7725) -+++ src/jcr.h (working copy) -@@ -180,6 +180,8 @@ - bool is_job_canceled() {return job_canceled(this); }; - int32_t get_JobType() { return m_JobType; }; - int32_t get_JobLevel() { return m_JobLevel; }; -+ void set_owner(); -+ int send_signal_to_owner(int signal); - - const char *get_OperationName(); /* in lib/jcr.c */ - const char *get_ActionName(bool past); /* in lib/jcr.c */ -Index: src/lib/jcr.c -=================================================================== ---- src/lib/jcr.c (revision 7725) -+++ src/lib/jcr.c (working copy) -@@ -303,6 +303,31 @@ +-/* Come here only after starting SD thread */ ++/* Come here only after starting SD thread ++ * and we don't expect any EndJob message because the ++ * the client don't have recieve the "backup" command. ++ */ + bail_out: + set_jcr_job_status(jcr, JS_ErrorTerminated); +- Dmsg1(400, "wait for sd. use=%d\n", jcr->use_count()); +- /* Cancel SD */ +- wait_for_job_termination(jcr, FDConnectTimeout); +- Dmsg1(400, "after wait for sd. use=%d\n", jcr->use_count()); ++ Dmsg1(400, "wait for sd and fd. use=%d\n", jcr->use_count()); ++ /* Get status from SD and FD */ ++ wait_for_job_termination(jcr, false /* don't expect EndJob message*/); ++ Dmsg1(400, "after wait for sd and fd. use=%d\n", jcr->use_count()); + return false; } +- /* -+ * Update the my_thread_id variable with the current thread id -+ */ -+void JCR::set_owner() -+{ -+ Dmsg1(0, "JCR::set_owner(%p)\n", pthread_self()); -+ this->my_thread_id = pthread_self(); -+} -+ -+/* -+ * Send a signal to the JCR thread owner. (to break a system call) -+ * (Must have use set_owner() before) -+ */ -+int JCR::send_signal_to_owner(int sig) -+{ -+ int ret=0; -+ Dmsg2(0, "Sending TIMEOUT ?? to %p by %p\n", this->my_thread_id, pthread_self()); -+ -+ if (!pthread_equal(this->my_thread_id, pthread_self())) { -+ Dmsg2(0, "Sending TIMEOUT to %p by %p\n", this->my_thread_id, pthread_self()); -+ ret = pthread_kill(this->my_thread_id, sig); -+ } -+ return ret; -+} -+ -+/* - * Push a subroutine address into the job end callback stack + * Here we wait for the File daemon to signal termination, + * then we wait for the Storage daemon. When both + * are done, we return the job status. + * Also used by restore.c */ - void job_end_push(JCR *jcr, void job_end_cb(JCR *jcr,void *), void *ctx) -@@ -719,7 +744,7 @@ - bool set_waittime = false; - int oldJobStatus = jcr->JobStatus; +-int wait_for_job_termination(JCR *jcr, int timeout) ++int wait_for_job_termination(JCR *jcr, bool expect_EndJob) + { + int32_t n = 0; + BSOCK *fd = jcr->file_bsock; +@@ -353,40 +355,49 @@ + uint64_t JobBytes = 0; + int VSS = 0; + int Encrypt = 0; +- btimer_t *tid=NULL; +- + set_jcr_job_status(jcr, JS_Running); -- Dmsg2(800, "set_jcr_job_status(%s, %c)\n", jcr->Job, JobStatus); -+ Dmsg2(2, "set_jcr_job_status(%s, %c)\n", jcr->Job, JobStatus); - /* if wait state is new, we keep current time for watchdog MaxWaitTime */ - switch (JobStatus) { - case JS_WaitFD: -@@ -740,7 +765,7 @@ - * For a set of errors, ... keep the current status - * so it isn't lost. For all others, set it. - */ -- Dmsg3(300, "jid=%u OnEntry JobStatus=%c set=%c\n", (uint32_t)jcr->JobId, -+ Dmsg3(2, "jid=%u OnEntry JobStatus=%c set=%c\n", (uint32_t)jcr->JobId, - jcr->JobStatus, JobStatus); - switch (jcr->JobStatus) { - case JS_ErrorTerminated: -@@ -781,7 +806,7 @@ + if (fd) { +- if (timeout) { +- tid = start_bsock_timer(fd, timeout); /* TODO: New timeout directive??? */ +- } +- /* Wait for Client to terminate */ +- while ((n = bget_dirmsg(fd)) >= 0) { +- if (!fd_ok && +- (sscanf(fd->msg, EndJob, &jcr->FDJobStatus, &JobFiles, +- &ReadBytes, &JobBytes, &Errors, &VSS, &Encrypt) == 7 || +- sscanf(fd->msg, OldEndJob, &jcr->FDJobStatus, &JobFiles, +- &ReadBytes, &JobBytes, &Errors) == 5)) { +- fd_ok = true; +- set_jcr_job_status(jcr, jcr->FDJobStatus); +- Dmsg1(100, "FDStatus=%c\n", (char)jcr->JobStatus); +- } else { +- Jmsg(jcr, M_WARNING, 0, _("Unexpected Client Job message: %s\n"), +- fd->msg); ++ /* Wait for Client to terminate ++ * In some conditions, the client isn't able to send ++ * any messages and we should not wait for ages ++ */ ++ int OK=true; ++ int ret; ++ while (OK && expect_EndJob) { ++ ++ /* Even if the job is canceled, we let a chance to FD to ++ * send EndJob message ++ */ ++ if (job_canceled(jcr)) { ++ OK=false; + } +- if (job_canceled(jcr)) { +- break; ++ ++ /* wait for data few minutes */ ++ ret = fd->wait_data_intr(3*60, 0); ++ if (ret == 1) { /* get data */ ++ n = bget_dirmsg(fd); ++ if (n >= 0 && ++ (sscanf(fd->msg, EndJob, &jcr->FDJobStatus, &JobFiles, ++ &ReadBytes, &JobBytes, &Errors, &VSS, &Encrypt) == 7 || ++ sscanf(fd->msg, OldEndJob, &jcr->FDJobStatus, &JobFiles, ++ &ReadBytes, &JobBytes, &Errors) == 5)) { ++ fd_ok = true; ++ set_jcr_job_status(jcr, jcr->FDJobStatus); ++ OK=false; /* end of loop */ ++ } else { ++ Jmsg(jcr, M_WARNING, 0, _("Unexpected Client Job message: %s\n"), ++ fd->msg); ++ } ++ } /* else get timeout or network error */ ++ ++ if (is_bnet_error(fd)) { ++ Jmsg(jcr, M_FATAL, 0, _("Network error with FD during %s: ERR=%s\n"), ++ job_type_to_str(jcr->get_JobType()), fd->bstrerror()); ++ OK=false; + } } +- if (tid) { +- stop_bsock_timer(tid); +- } + +- if (is_bnet_error(fd)) { +- Jmsg(jcr, M_FATAL, 0, _("Network error with FD during %s: ERR=%s\n"), +- job_type_to_str(jcr->get_JobType()), fd->bstrerror()); +- } + fd->signal(BNET_TERMINATE); /* tell Client we are terminating */ } - if (oldJobStatus != jcr->JobStatus) { -- Dmsg3(200, "jid=%u leave set_old_job_status=%c new_set=%c\n", (uint32_t)jcr->JobId, -+ Dmsg3(2, "jid=%u leave set_old_job_status=%c new_set=%c\n", (uint32_t)jcr->JobId, - oldJobStatus, JobStatus); - // generate_plugin_event(jcr, bEventStatusChange, NULL); - } + +Index: src/dird/protos.h +=================================================================== +--- src/dird/protos.h (révision 7736) ++++ src/dird/protos.h (copie de travail) +@@ -52,7 +52,7 @@ + extern bool find_recycled_volume(JCR *jcr, bool InChanger, MEDIA_DBR *mr); + + /* backup.c */ +-extern int wait_for_job_termination(JCR *jcr, int timeout=0); ++extern int wait_for_job_termination(JCR *jcr, bool expect_EndJob=true); + extern bool do_backup_init(JCR *jcr); + extern bool do_backup(JCR *jcr); + extern void backup_cleanup(JCR *jcr, int TermCode);