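2 This patch should fix bug #1094, in which jobs that have written bytes and are
3 then rescheduled do not keep any Pool overrides that were specified in the
4 Schedule Run directives. The patch also simplifies the incrementing and decrementing of the resource lock counters (NumConcurrentJobs) by moving the storage decrements into shared helper functions.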
6 Apply it to 2.4.0 with:
9 patch -p0 <2.4.0-reschedule.patch
10 ./configure <your options>
16 Index: src/dird/jobq.c
17 ===================================================================
18 --- src/dird/jobq.c (revision 7178)
19 +++ src/dird/jobq.c (working copy)
22 Bacula® - The Network Backup Solution
24 - Copyright (C) 2003-2007 Free Software Foundation Europe e.V.
25 + Copyright (C) 2003-2008 Free Software Foundation Europe e.V.
27 The main author of Bacula is Kern Sibbald, with contributions from
28 many others, a complete list can be found in the file AUTHORS.
31 static int start_server(jobq_t *jq);
32 static bool acquire_resources(JCR *jcr);
33 +static bool reschedule_job(JCR *jcr, jobq_t *jq, jobq_item_t *je);
34 +static void dec_read_store(JCR *jcr);
35 +static void dec_write_store(JCR *jcr);
40 * Initialize a job queue
46 - * if any threads are idle, wake one --
47 - * actually we do a broadcast because on /lib/tls
48 + * if any threads are idle, wake one.
49 + * Actually we do a broadcast because on /lib/tls
50 * these signals seem to get lost from time to time.
52 if (jq->idle_workers > 0) {
54 * put into the ready queue.
56 if (jcr->acquired_resource_locks) {
58 - jcr->rstore->NumConcurrentJobs--;
59 - Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
62 - jcr->wstore->NumConcurrentJobs--;
63 - Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
65 + dec_read_store(jcr);
66 + dec_write_store(jcr);
67 jcr->client->NumConcurrentJobs--;
68 jcr->job->NumConcurrentJobs--;
69 jcr->acquired_resource_locks = false;
73 - * Reschedule the job if necessary and requested
75 - if (jcr->job->RescheduleOnError &&
76 - jcr->JobStatus != JS_Terminated &&
77 - jcr->JobStatus != JS_Canceled &&
78 - jcr->JobType == JT_BACKUP &&
79 - (jcr->job->RescheduleTimes == 0 ||
80 - jcr->reschedule_count < jcr->job->RescheduleTimes)) {
81 - char dt[50], dt2[50];
82 + if (reschedule_job(jcr, jq, je)) {
83 + continue; /* go look for more work */
87 - * Reschedule this job by cleaning it up, but
88 - * reuse the same JobId if possible.
90 - time_t now = time(NULL);
91 - jcr->reschedule_count++;
92 - jcr->sched_time = now + jcr->job->RescheduleInterval;
93 - bstrftime(dt, sizeof(dt), now);
94 - bstrftime(dt2, sizeof(dt2), jcr->sched_time);
95 - Dmsg4(2300, "Rescheduled Job %s to re-run in %d seconds.(now=%u,then=%u)\n", jcr->Job,
96 - (int)jcr->job->RescheduleInterval, now, jcr->sched_time);
97 - Jmsg(jcr, M_INFO, 0, _("Rescheduled Job %s at %s to re-run in %d seconds (%s).\n"),
98 - jcr->Job, dt, (int)jcr->job->RescheduleInterval, dt2);
99 - dird_free_jcr_pointers(jcr); /* partial cleanup old stuff */
100 - jcr->JobStatus = -1;
101 - set_jcr_job_status(jcr, JS_WaitStartTime);
102 - jcr->SDJobStatus = 0;
103 - if (jcr->JobBytes == 0) {
104 - Dmsg2(2300, "Requeue job=%d use=%d\n", jcr->JobId, jcr->use_count());
106 - jobq_add(jq, jcr); /* queue the job to run again */
108 - free_jcr(jcr); /* release jcr */
109 - free(je); /* free the job entry */
110 - continue; /* look for another job to run */
113 - * Something was actually backed up, so we cannot reuse
114 - * the old JobId or there will be database record
115 - * conflicts. We now create a new job, copying the
116 - * appropriate fields.
118 - JCR *njcr = new_jcr(sizeof(JCR), dird_free_jcr);
119 - set_jcr_defaults(njcr, jcr->job);
120 - njcr->reschedule_count = jcr->reschedule_count;
121 - njcr->sched_time = jcr->sched_time;
122 - njcr->JobLevel = jcr->JobLevel;
123 - njcr->JobStatus = -1;
124 - set_jcr_job_status(njcr, jcr->JobStatus);
126 - copy_rstorage(njcr, jcr->rstorage, _("previous Job"));
128 - free_rstorage(njcr);
131 - copy_wstorage(njcr, jcr->wstorage, _("previous Job"));
133 - free_wstorage(njcr);
135 - njcr->messages = jcr->messages;
136 - Dmsg0(2300, "Call to run new job\n");
138 - run_job(njcr); /* This creates a "new" job */
139 - free_jcr(njcr); /* release "new" jcr */
141 - Dmsg0(2300, "Back from running new job.\n");
143 /* Clean up and release old jcr */
144 Dmsg2(2300, "====== Termination job=%d use_cnt=%d\n", jcr->JobId, jcr->use_count());
145 jcr->SDJobStatus = 0;
150 + * Returns true if cleanup done and we should look for more work
152 +static bool reschedule_job(JCR *jcr, jobq_t *jq, jobq_item_t *je)
155 + * Reschedule the job if necessary and requested
157 + if (jcr->job->RescheduleOnError &&
158 + jcr->JobStatus != JS_Terminated &&
159 + jcr->JobStatus != JS_Canceled &&
160 + jcr->JobType == JT_BACKUP &&
161 + (jcr->job->RescheduleTimes == 0 ||
162 + jcr->reschedule_count < jcr->job->RescheduleTimes)) {
163 + char dt[50], dt2[50];
166 + * Reschedule this job by cleaning it up, but
167 + * reuse the same JobId if possible.
169 + time_t now = time(NULL);
170 + jcr->reschedule_count++;
171 + jcr->sched_time = now + jcr->job->RescheduleInterval;
172 + bstrftime(dt, sizeof(dt), now);
173 + bstrftime(dt2, sizeof(dt2), jcr->sched_time);
174 + Dmsg4(2300, "Rescheduled Job %s to re-run in %d seconds.(now=%u,then=%u)\n", jcr->Job,
175 + (int)jcr->job->RescheduleInterval, now, jcr->sched_time);
176 + Jmsg(jcr, M_INFO, 0, _("Rescheduled Job %s at %s to re-run in %d seconds (%s).\n"),
177 + jcr->Job, dt, (int)jcr->job->RescheduleInterval, dt2);
178 + dird_free_jcr_pointers(jcr); /* partial cleanup old stuff */
179 + jcr->JobStatus = -1;
180 + set_jcr_job_status(jcr, JS_WaitStartTime);
181 + jcr->SDJobStatus = 0;
182 + if (jcr->JobBytes == 0) {
183 + Dmsg2(2300, "Requeue job=%d use=%d\n", jcr->JobId, jcr->use_count());
185 + jobq_add(jq, jcr); /* queue the job to run again */
187 + free_jcr(jcr); /* release jcr */
188 + free(je); /* free the job entry */
189 + return true; /* we already cleaned up */
192 + * Something was actually backed up, so we cannot reuse
193 + * the old JobId or there will be database record
194 + * conflicts. We now create a new job, copying the
195 + * appropriate fields.
197 + JCR *njcr = new_jcr(sizeof(JCR), dird_free_jcr);
198 + set_jcr_defaults(njcr, jcr->job);
199 + njcr->reschedule_count = jcr->reschedule_count;
200 + njcr->sched_time = jcr->sched_time;
201 + njcr->JobLevel = jcr->JobLevel;
202 + njcr->pool = jcr->pool;
203 + njcr->run_pool_override = jcr->run_pool_override;
204 + njcr->full_pool = jcr->full_pool;
205 + njcr->run_full_pool_override = jcr->run_full_pool_override;
206 + njcr->inc_pool = jcr->inc_pool;
207 + njcr->run_inc_pool_override = jcr->run_inc_pool_override;
208 + njcr->diff_pool = jcr->diff_pool;
209 + njcr->JobStatus = -1;
210 + set_jcr_job_status(njcr, jcr->JobStatus);
212 + copy_rstorage(njcr, jcr->rstorage, _("previous Job"));
214 + free_rstorage(njcr);
217 + copy_wstorage(njcr, jcr->wstorage, _("previous Job"));
219 + free_wstorage(njcr);
221 + njcr->messages = jcr->messages;
222 + njcr->spool_data = jcr->spool_data;
223 + njcr->write_part_after_job = jcr->write_part_after_job;
224 + Dmsg0(2300, "Call to run new job\n");
226 + run_job(njcr); /* This creates a "new" job */
227 + free_jcr(njcr); /* release "new" jcr */
229 + Dmsg0(2300, "Back from running new job.\n");
235 * See if we can acquire all the necessary resources for the job (JCR)
237 * Returns: true if successful
238 @@ -681,11 +697,19 @@
239 bool skip_this_jcr = false;
241 jcr->acquired_resource_locks = false;
242 + if (jcr->rstore == jcr->wstore) { /* deadlock */
243 + Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n"
244 + " Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"),
245 + jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source);
246 + set_jcr_job_status(jcr, JS_Canceled);
250 Dmsg1(200, "Rstore=%s\n", jcr->rstore->name());
251 if (jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) {
252 +// jcr->rstore->NumConcurrentReadJobs++;
253 jcr->rstore->NumConcurrentJobs++;
254 - Dmsg0(200, "Set rncj=1\n");
255 + Dmsg1(200, "Inc rncj=%d\n", jcr->rstore->NumConcurrentJobs);
257 Dmsg1(200, "Fail rncj=%d\n", jcr->rstore->NumConcurrentJobs);
258 set_jcr_job_status(jcr, JS_WaitStoreRes);
259 @@ -695,25 +719,11 @@
262 Dmsg1(200, "Wstore=%s\n", jcr->wstore->name());
263 - if (jcr->rstore == jcr->wstore) { /* deadlock */
264 - jcr->rstore->NumConcurrentJobs--; /* back out rstore */
265 - Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n"
266 - " Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"),
267 - jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source);
268 - set_jcr_job_status(jcr, JS_Canceled);
271 - if (jcr->wstore->NumConcurrentJobs == 0 &&
272 - jcr->wstore->NumConcurrentJobs < jcr->wstore->MaxConcurrentJobs) {
273 - /* Simple case, first job */
274 - jcr->wstore->NumConcurrentJobs = 1;
275 - Dmsg0(200, "Set wncj=1\n");
276 - } else if (jcr->wstore->NumConcurrentJobs < jcr->wstore->MaxConcurrentJobs) {
277 + if (jcr->wstore->NumConcurrentJobs < jcr->wstore->MaxConcurrentJobs) {
278 jcr->wstore->NumConcurrentJobs++;
279 Dmsg1(200, "Inc wncj=%d\n", jcr->wstore->NumConcurrentJobs);
280 } else if (jcr->rstore) {
281 - jcr->rstore->NumConcurrentJobs--; /* back out rstore */
282 - Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs);
283 + dec_read_store(jcr);
284 skip_this_jcr = true;
286 Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs);
288 jcr->client->NumConcurrentJobs++;
290 /* Back out previous locks */
292 - jcr->wstore->NumConcurrentJobs--;
293 - Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
296 - jcr->rstore->NumConcurrentJobs--;
297 - Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
299 + dec_write_store(jcr);
300 + dec_read_store(jcr);
301 set_jcr_job_status(jcr, JS_WaitClientRes);
305 jcr->job->NumConcurrentJobs++;
307 /* Back out previous locks */
309 - jcr->wstore->NumConcurrentJobs--;
310 - Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
313 - jcr->rstore->NumConcurrentJobs--;
314 - Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
316 + dec_write_store(jcr);
317 + dec_read_store(jcr);
318 jcr->client->NumConcurrentJobs--;
319 set_jcr_job_status(jcr, JS_WaitJobRes);
322 jcr->acquired_resource_locks = true;
326 +static void dec_read_store(JCR *jcr)
329 +// jcr->rstore->NumConcurrentReadJobs--; /* back out rstore */
330 + jcr->rstore->NumConcurrentJobs--; /* back out rstore */
331 + Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
332 +// ASSERT(jcr->rstore->NumConcurrentReadJobs >= 0);
333 + ASSERT(jcr->rstore->NumConcurrentJobs >= 0);
337 +static void dec_write_store(JCR *jcr)
340 + jcr->wstore->NumConcurrentJobs--;
341 + Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
342 + ASSERT(jcr->wstore->NumConcurrentJobs >= 0);