git.sur5r.net Git - bacula/bacula/blob - bacula/patches/2.4.0-reschedule.patch
ebl Backport of the btraceback script that keeps trace file on working dir
[bacula/bacula] / bacula / patches / 2.4.0-reschedule.patch
1
2  This patch should fix bug #1094, where jobs that wrote bytes and are then
3  rescheduled lose any Pool overrides that were specified in the Schedule Run
4  directives. It also simplifies incrementing/decrementing the resource lock counts.
5
6  Apply it to 2.4.0 with:
7
8  cd <bacula-source>
9  patch -p0 <2.4.0-reschedule.patch
10  ./configure <your options>
11  make
12  ...
13  make install
14
15
16 Index: src/dird/jobq.c
17 ===================================================================
18 --- src/dird/jobq.c     (revision 7178)
19 +++ src/dird/jobq.c     (working copy)
20 @@ -1,7 +1,7 @@
21  /*
22     Bacula® - The Network Backup Solution
23  
24 -   Copyright (C) 2003-2007 Free Software Foundation Europe e.V.
25 +   Copyright (C) 2003-2008 Free Software Foundation Europe e.V.
26  
27     The main author of Bacula is Kern Sibbald, with contributions from
28     many others, a complete list can be found in the file AUTHORS.
29 @@ -56,9 +56,10 @@
30  
31  static int  start_server(jobq_t *jq);
32  static bool acquire_resources(JCR *jcr);
33 +static bool reschedule_job(JCR *jcr, jobq_t *jq, jobq_item_t *je);
34 +static void dec_read_store(JCR *jcr);
35 +static void dec_write_store(JCR *jcr);
36  
37 -
38 -
39  /*
40   * Initialize a job queue
41   *
42 @@ -357,8 +358,8 @@
43     pthread_t id;
44  
45     /*
46 -    * if any threads are idle, wake one --                
47 -    *   actually we do a broadcast because on /lib/tls 
48 +    * if any threads are idle, wake one.
49 +    *   Actually we do a broadcast because on /lib/tls 
50      *   these signals seem to get lost from time to time.
51      */
52     if (jq->idle_workers > 0) {
53 @@ -477,87 +478,17 @@
54            *  put into the ready queue.
55            */
56           if (jcr->acquired_resource_locks) {
57 -            if (jcr->rstore) {
58 -               jcr->rstore->NumConcurrentJobs--;
59 -               Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
60 -            }
61 -            if (jcr->wstore) {
62 -               jcr->wstore->NumConcurrentJobs--;
63 -               Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
64 -            }
65 +            dec_read_store(jcr);
66 +            dec_write_store(jcr);
67              jcr->client->NumConcurrentJobs--;
68              jcr->job->NumConcurrentJobs--;
69              jcr->acquired_resource_locks = false;
70           }
71  
72 -         /*
73 -          * Reschedule the job if necessary and requested
74 -          */
75 -         if (jcr->job->RescheduleOnError &&
76 -             jcr->JobStatus != JS_Terminated &&
77 -             jcr->JobStatus != JS_Canceled &&
78 -             jcr->JobType == JT_BACKUP &&
79 -             (jcr->job->RescheduleTimes == 0 ||
80 -              jcr->reschedule_count < jcr->job->RescheduleTimes)) {
81 -             char dt[50], dt2[50];
82 +         if (reschedule_job(jcr, jq, je)) {
83 +            continue;              /* go look for more work */
84 +         }
85  
86 -             /*
87 -              * Reschedule this job by cleaning it up, but
88 -              *  reuse the same JobId if possible.
89 -              */
90 -            time_t now = time(NULL);
91 -            jcr->reschedule_count++;
92 -            jcr->sched_time = now + jcr->job->RescheduleInterval;
93 -            bstrftime(dt, sizeof(dt), now);
94 -            bstrftime(dt2, sizeof(dt2), jcr->sched_time);
95 -            Dmsg4(2300, "Rescheduled Job %s to re-run in %d seconds.(now=%u,then=%u)\n", jcr->Job,
96 -                  (int)jcr->job->RescheduleInterval, now, jcr->sched_time);
97 -            Jmsg(jcr, M_INFO, 0, _("Rescheduled Job %s at %s to re-run in %d seconds (%s).\n"),
98 -                 jcr->Job, dt, (int)jcr->job->RescheduleInterval, dt2);
99 -            dird_free_jcr_pointers(jcr);     /* partial cleanup old stuff */
100 -            jcr->JobStatus = -1;
101 -            set_jcr_job_status(jcr, JS_WaitStartTime);
102 -            jcr->SDJobStatus = 0;
103 -            if (jcr->JobBytes == 0) {
104 -               Dmsg2(2300, "Requeue job=%d use=%d\n", jcr->JobId, jcr->use_count());
105 -               V(jq->mutex);
106 -               jobq_add(jq, jcr);     /* queue the job to run again */
107 -               P(jq->mutex);
108 -               free_jcr(jcr);         /* release jcr */
109 -               free(je);              /* free the job entry */
110 -               continue;              /* look for another job to run */
111 -            }
112 -            /*
113 -             * Something was actually backed up, so we cannot reuse
114 -             *   the old JobId or there will be database record
115 -             *   conflicts.  We now create a new job, copying the
116 -             *   appropriate fields.
117 -             */           
118 -            JCR *njcr = new_jcr(sizeof(JCR), dird_free_jcr);
119 -            set_jcr_defaults(njcr, jcr->job);
120 -            njcr->reschedule_count = jcr->reschedule_count;
121 -            njcr->sched_time = jcr->sched_time;
122 -            njcr->JobLevel = jcr->JobLevel;
123 -            njcr->JobStatus = -1;
124 -            set_jcr_job_status(njcr, jcr->JobStatus);
125 -            if (jcr->rstore) {
126 -               copy_rstorage(njcr, jcr->rstorage, _("previous Job"));
127 -            } else {
128 -               free_rstorage(njcr);
129 -            }
130 -            if (jcr->wstore) {
131 -               copy_wstorage(njcr, jcr->wstorage, _("previous Job"));
132 -            } else {
133 -               free_wstorage(njcr);
134 -            }
135 -            njcr->messages = jcr->messages;
136 -            Dmsg0(2300, "Call to run new job\n");
137 -            V(jq->mutex);
138 -            run_job(njcr);            /* This creates a "new" job */
139 -            free_jcr(njcr);           /* release "new" jcr */
140 -            P(jq->mutex);
141 -            Dmsg0(2300, "Back from running new job.\n");
142 -         }
143           /* Clean up and release old jcr */
144           Dmsg2(2300, "====== Termination job=%d use_cnt=%d\n", jcr->JobId, jcr->use_count());
145           jcr->SDJobStatus = 0;
146 @@ -671,6 +602,91 @@
147  }
148  
149  /*
150 + * Returns true if cleanup done and we should look for more work
151 + */
152 +static bool reschedule_job(JCR *jcr, jobq_t *jq, jobq_item_t *je)
153 +{
154 +   /*
155 +    * Reschedule the job if necessary and requested
156 +    */
157 +   if (jcr->job->RescheduleOnError &&
158 +       jcr->JobStatus != JS_Terminated &&
159 +       jcr->JobStatus != JS_Canceled &&
160 +       jcr->JobType == JT_BACKUP &&
161 +       (jcr->job->RescheduleTimes == 0 ||
162 +        jcr->reschedule_count < jcr->job->RescheduleTimes)) {
163 +       char dt[50], dt2[50];
164 +
165 +       /*
166 +        * Reschedule this job by cleaning it up, but
167 +        *  reuse the same JobId if possible.
168 +        */
169 +      time_t now = time(NULL);
170 +      jcr->reschedule_count++;
171 +      jcr->sched_time = now + jcr->job->RescheduleInterval;
172 +      bstrftime(dt, sizeof(dt), now);
173 +      bstrftime(dt2, sizeof(dt2), jcr->sched_time);
174 +      Dmsg4(2300, "Rescheduled Job %s to re-run in %d seconds.(now=%u,then=%u)\n", jcr->Job,
175 +            (int)jcr->job->RescheduleInterval, now, jcr->sched_time);
176 +      Jmsg(jcr, M_INFO, 0, _("Rescheduled Job %s at %s to re-run in %d seconds (%s).\n"),
177 +           jcr->Job, dt, (int)jcr->job->RescheduleInterval, dt2);
178 +      dird_free_jcr_pointers(jcr);     /* partial cleanup old stuff */
179 +      jcr->JobStatus = -1;
180 +      set_jcr_job_status(jcr, JS_WaitStartTime);
181 +      jcr->SDJobStatus = 0;
182 +      if (jcr->JobBytes == 0) {
183 +         Dmsg2(2300, "Requeue job=%d use=%d\n", jcr->JobId, jcr->use_count());
184 +         V(jq->mutex);
185 +         jobq_add(jq, jcr);     /* queue the job to run again */
186 +         P(jq->mutex);
187 +         free_jcr(jcr);         /* release jcr */
188 +         free(je);              /* free the job entry */
189 +         return true;           /* we already cleaned up */
190 +      }
191 +      /*
192 +       * Something was actually backed up, so we cannot reuse
193 +       *   the old JobId or there will be database record
194 +       *   conflicts.  We now create a new job, copying the
195 +       *   appropriate fields.
196 +       */           
197 +      JCR *njcr = new_jcr(sizeof(JCR), dird_free_jcr);
198 +      set_jcr_defaults(njcr, jcr->job);
199 +      njcr->reschedule_count = jcr->reschedule_count;
200 +      njcr->sched_time = jcr->sched_time;
201 +      njcr->JobLevel = jcr->JobLevel;
202 +      njcr->pool = jcr->pool;
203 +      njcr->run_pool_override = jcr->run_pool_override;
204 +      njcr->full_pool = jcr->full_pool;
205 +      njcr->run_full_pool_override = jcr->run_full_pool_override;
206 +      njcr->inc_pool = jcr->inc_pool;
207 +      njcr->run_inc_pool_override = jcr->run_inc_pool_override;
208 +      njcr->diff_pool = jcr->diff_pool;
209 +      njcr->JobStatus = -1;
210 +      set_jcr_job_status(njcr, jcr->JobStatus);
211 +      if (jcr->rstore) {
212 +         copy_rstorage(njcr, jcr->rstorage, _("previous Job"));
213 +      } else {
214 +         free_rstorage(njcr);
215 +      }
216 +      if (jcr->wstore) {
217 +         copy_wstorage(njcr, jcr->wstorage, _("previous Job"));
218 +      } else {
219 +         free_wstorage(njcr);
220 +      }
221 +      njcr->messages = jcr->messages;
222 +      njcr->spool_data = jcr->spool_data;
223 +      njcr->write_part_after_job = jcr->write_part_after_job;
224 +      Dmsg0(2300, "Call to run new job\n");
225 +      V(jq->mutex);
226 +      run_job(njcr);            /* This creates a "new" job */
227 +      free_jcr(njcr);           /* release "new" jcr */
228 +      P(jq->mutex);
229 +      Dmsg0(2300, "Back from running new job.\n");
230 +   }
231 +   return false;
232 +}
233 +
234 +/*
235   * See if we can acquire all the necessary resources for the job (JCR)
236   *
237   *  Returns: true  if successful
238 @@ -681,11 +697,19 @@
239     bool skip_this_jcr = false;
240  
241     jcr->acquired_resource_locks = false;
242 +   if (jcr->rstore == jcr->wstore) {           /* deadlock */
243 +      Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n"
244 +         "    Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"), 
245 +         jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source);
246 +      set_jcr_job_status(jcr, JS_Canceled);
247 +      return false;
248 +   }
249     if (jcr->rstore) {
250        Dmsg1(200, "Rstore=%s\n", jcr->rstore->name());
251        if (jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) {
252 +//       jcr->rstore->NumConcurrentReadJobs++;
253           jcr->rstore->NumConcurrentJobs++;
254 -         Dmsg0(200, "Set rncj=1\n");
255 +         Dmsg1(200, "Inc rncj=%d\n", jcr->rstore->NumConcurrentJobs);
256        } else {
257           Dmsg1(200, "Fail rncj=%d\n", jcr->rstore->NumConcurrentJobs);
258           set_jcr_job_status(jcr, JS_WaitStoreRes);
259 @@ -695,25 +719,11 @@
260     
261     if (jcr->wstore) {
262        Dmsg1(200, "Wstore=%s\n", jcr->wstore->name());
263 -      if (jcr->rstore == jcr->wstore) {           /* deadlock */
264 -         jcr->rstore->NumConcurrentJobs--;        /* back out rstore */
265 -         Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n"
266 -            "    Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"), 
267 -            jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source);
268 -         set_jcr_job_status(jcr, JS_Canceled);
269 -         return false;
270 -      }
271 -      if (jcr->wstore->NumConcurrentJobs == 0 &&
272 -          jcr->wstore->NumConcurrentJobs < jcr->wstore->MaxConcurrentJobs) {
273 -         /* Simple case, first job */
274 -         jcr->wstore->NumConcurrentJobs = 1;
275 -         Dmsg0(200, "Set wncj=1\n");
276 -      } else if (jcr->wstore->NumConcurrentJobs < jcr->wstore->MaxConcurrentJobs) {
277 +      if (jcr->wstore->NumConcurrentJobs < jcr->wstore->MaxConcurrentJobs) {
278           jcr->wstore->NumConcurrentJobs++;
279           Dmsg1(200, "Inc wncj=%d\n", jcr->wstore->NumConcurrentJobs);
280        } else if (jcr->rstore) {
281 -         jcr->rstore->NumConcurrentJobs--;        /* back out rstore */
282 -         Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs);
283 +         dec_read_store(jcr);
284           skip_this_jcr = true;
285        } else {
286           Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs);
287 @@ -729,14 +739,8 @@
288        jcr->client->NumConcurrentJobs++;
289     } else {
290        /* Back out previous locks */
291 -      if (jcr->wstore) {
292 -         jcr->wstore->NumConcurrentJobs--;
293 -         Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
294 -      }
295 -      if (jcr->rstore) {
296 -         jcr->rstore->NumConcurrentJobs--;
297 -         Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
298 -      }
299 +      dec_write_store(jcr);
300 +      dec_read_store(jcr);
301        set_jcr_job_status(jcr, JS_WaitClientRes);
302        return false;
303     }
304 @@ -744,14 +748,8 @@
305        jcr->job->NumConcurrentJobs++;
306     } else {
307        /* Back out previous locks */
308 -      if (jcr->wstore) {
309 -         jcr->wstore->NumConcurrentJobs--;
310 -         Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
311 -      }
312 -      if (jcr->rstore) {
313 -         jcr->rstore->NumConcurrentJobs--;
314 -         Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
315 -      }
316 +      dec_write_store(jcr);
317 +      dec_read_store(jcr);
318        jcr->client->NumConcurrentJobs--;
319        set_jcr_job_status(jcr, JS_WaitJobRes);
320        return false;
321 @@ -760,3 +758,23 @@
322     jcr->acquired_resource_locks = true;
323     return true;
324  }
325 +
326 +static void dec_read_store(JCR *jcr)
327 +{
328 +   if (jcr->rstore) {
329 +//    jcr->rstore->NumConcurrentReadJobs--;    /* back out rstore */
330 +      jcr->rstore->NumConcurrentJobs--;        /* back out rstore */
331 +      Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
332 +//    ASSERT(jcr->rstore->NumConcurrentReadJobs >= 0);
333 +      ASSERT(jcr->rstore->NumConcurrentJobs >= 0);
334 +   }
335 +}
336 +
337 +static void dec_write_store(JCR *jcr)
338 +{
339 +   if (jcr->wstore) {
340 +      jcr->wstore->NumConcurrentJobs--;
341 +      Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
342 +      ASSERT(jcr->wstore->NumConcurrentJobs >= 0);
343 +   }
344 +}