2 This patch fixes a problem when canceling a job if the client loses
3 its connection while being backed up.
4 Apply the patch to version 2.4.3 (and previous versions) with:
7 patch -p0 <2.4.3-cancel-after-network-outage.patch
8 ./configure <your-options>
14 Index: src/dird/backup.c
15 ===================================================================
16 --- src/dird/backup.c (révision 7772)
17 +++ src/dird/backup.c (copie de travail)
22 -/* Come here only after starting SD thread */
23 +/* Come here only after starting SD thread
24 + * and we don't expect any EndJob message because
25 + * the client has not received the "backup" command.
28 set_jcr_job_status(jcr, JS_ErrorTerminated);
29 - Dmsg1(400, "wait for sd. use=%d\n", jcr->use_count());
31 - cancel_storage_daemon_job(jcr);
32 - wait_for_storage_daemon_termination(jcr);
33 - Dmsg1(400, "after wait for sd. use=%d\n", jcr->use_count());
34 + Dmsg1(400, "wait for sd and fd. use=%d\n", jcr->use_count());
35 + /* Get status from SD and FD */
36 + wait_for_job_termination(jcr, false /* don't expect EndJob message*/);
37 + Dmsg1(400, "after wait for sd and fd. use=%d\n", jcr->use_count());
42 * are done, we return the job status.
43 * Also used by restore.c
45 -int wait_for_job_termination(JCR *jcr)
46 +int wait_for_job_termination(JCR *jcr, bool expect_EndJob)
49 BSOCK *fd = jcr->file_bsock;
53 set_jcr_job_status(jcr, JS_Running);
54 - /* Wait for Client to terminate */
55 - while ((n = bget_dirmsg(fd)) >= 0) {
57 - (sscanf(fd->msg, EndJob, &jcr->FDJobStatus, &JobFiles,
58 - &ReadBytes, &JobBytes, &Errors, &VSS, &Encrypt) == 7 ||
59 - sscanf(fd->msg, OldEndJob, &jcr->FDJobStatus, &JobFiles,
60 - &ReadBytes, &JobBytes, &Errors) == 5)) {
62 - set_jcr_job_status(jcr, jcr->FDJobStatus);
63 - Dmsg1(100, "FDStatus=%c\n", (char)jcr->JobStatus);
65 - Jmsg(jcr, M_WARNING, 0, _("Unexpected Client Job message: %s\n"),
70 + /* Wait for Client to terminate
71 + * In some conditions, the client isn't able to send
72 + * any messages and we should not wait for ages
76 + while (OK && expect_EndJob) {
78 + /* Even if the job is canceled, we give the FD a chance to
79 + * send the EndJob message
81 + if (job_canceled(jcr)) {
85 + /* wait a few minutes for data */
86 + ret = fd->wait_data_intr(3*60, 0);
87 + if (ret == 1) { /* get data */
88 + n = bget_dirmsg(fd);
90 + (sscanf(fd->msg, EndJob, &jcr->FDJobStatus, &JobFiles,
91 + &ReadBytes, &JobBytes, &Errors, &VSS, &Encrypt) == 7 ||
92 + sscanf(fd->msg, OldEndJob, &jcr->FDJobStatus, &JobFiles,
93 + &ReadBytes, &JobBytes, &Errors) == 5)) {
95 + set_jcr_job_status(jcr, jcr->FDJobStatus);
96 + OK=false; /* end of loop */
98 + Jmsg(jcr, M_WARNING, 0, _("Unexpected Client Job message: %s\n"),
101 + } /* else get timeout or network error */
103 + if (is_bnet_error(fd)) {
104 + Jmsg(jcr, M_FATAL, 0, _("Network error with FD during %s: ERR=%s\n"),
105 + job_type_to_str(jcr->JobType), fd->bstrerror());
109 - if (job_canceled(jcr)) {
114 - if (is_bnet_error(fd)) {
115 - Jmsg(jcr, M_FATAL, 0, _("Network error with FD during %s: ERR=%s\n"),
116 - job_type_to_str(jcr->JobType), fd->bstrerror());
117 + fd->signal(BNET_TERMINATE); /* tell Client we are terminating */
119 - fd->signal(BNET_TERMINATE); /* tell Client we are terminating */
121 /* Force cancel in SD if failing */
122 if (job_canceled(jcr) || !fd_ok) {
123 Index: src/dird/protos.h
124 ===================================================================
125 --- src/dird/protos.h (révision 7772)
126 +++ src/dird/protos.h (copie de travail)
128 extern bool find_recycled_volume(JCR *jcr, bool InChanger, MEDIA_DBR *mr);
131 -extern int wait_for_job_termination(JCR *jcr);
132 +extern int wait_for_job_termination(JCR *jcr, bool expect_EndJob=true);
133 extern bool do_backup_init(JCR *jcr);
134 extern bool do_backup(JCR *jcr);
135 extern void backup_cleanup(JCR *jcr, int TermCode);