2 Bacula® - The Network Backup Solution
4 Copyright (C) 2003-2008 Free Software Foundation Europe e.V.
6 The main author of Bacula is Kern Sibbald, with contributions from
7 many others, a complete list can be found in the file AUTHORS.
8 This program is Free Software; you can redistribute it and/or
9 modify it under the terms of version two of the GNU General Public
10 License as published by the Free Software Foundation and included
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Bacula® is a registered trademark of John Walker.
24 The licensor of Bacula is the Free Software Foundation Europe
25 (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
26 Switzerland, email:ftf@fsfeurope.org.
29 * Bacula job queue routines.
31 * This code consists of three queues, the waiting_jobs
32 * queue, where jobs are initially queued, the ready_jobs
33 * queue, where jobs are placed when all the resources are
34 * allocated and they can immediately be run, and the
35 * running queue where jobs are placed when they are
38 * Kern Sibbald, July MMIII
42 * This code was adapted from the Bacula workq, which was
43 * adapted from "Programming with POSIX Threads", by
53 /* Forward referenced functions */
/* Thread start routines: both are passed to pthread_create() in this file. */
54 extern "C" void *jobq_server(void *arg);
55 extern "C" void *sched_wait(void *arg);
/* File-local helpers. */
57 static int start_server(jobq_t *jq);
58 static bool acquire_resources(JCR *jcr);
59 static void dec_read_store(JCR *jcr);
60 static void dec_write_store(JCR *jcr);
63 * Initialize a job queue
65 * Returns: 0 on success
/*
 * jobq_init: prepare *jq for use.
 *   jq      - queue structure to initialize
 *   threads - stored as jq->max_workers (upper bound on worker threads)
 *   engine  - routine stored in jq->engine
 * Sets up the thread attribute object (detached state), the queue mutex and
 * the work condition variable, destroying what was already set up when a
 * later step fails. It then resets the worker counters, marks the queue
 * JOBQ_VALID and allocates the waiting, running and ready lists.
 * NOTE(review): the line numbering embedded in this listing has gaps; the
 * return statements and some short declarations are not visible here.
 */
68 int jobq_init(jobq_t *jq, int threads, void *(*engine)(void *arg))
/*
 * item stays NULL: it is only handed to the dlist constructor together with
 * the address of its link member (presumably so that dlist can work out
 * where the link lives inside a jobq_item_t - confirm in dlist).
 */
71 jobq_item_t *item = NULL;
73 if ((stat = pthread_attr_init(&jq->attr)) != 0) {
75 Jmsg1(NULL, M_ERROR, 0, _("pthread_attr_init: ERR=%s\n"), be.bstrerror(stat));
/*
 * Threads created with jq->attr are detached.
 * NOTE(review): unlike the other failure paths, no Jmsg1 call is visible for
 * this one.
 */
78 if ((stat = pthread_attr_setdetachstate(&jq->attr, PTHREAD_CREATE_DETACHED)) != 0) {
79 pthread_attr_destroy(&jq->attr);
82 if ((stat = pthread_mutex_init(&jq->mutex, NULL)) != 0) {
84 Jmsg1(NULL, M_ERROR, 0, _("pthread_mutex_init: ERR=%s\n"), be.bstrerror(stat));
85 pthread_attr_destroy(&jq->attr);
88 if ((stat = pthread_cond_init(&jq->work, NULL)) != 0) {
90 Jmsg1(NULL, M_ERROR, 0, _("pthread_cond_init: ERR=%s\n"), be.bstrerror(stat));
/* Undo the two earlier initializations, newest first */
91 pthread_mutex_destroy(&jq->mutex);
92 pthread_attr_destroy(&jq->attr);
96 jq->max_workers = threads; /* max threads to create */
97 jq->num_workers = 0; /* no threads yet */
98 jq->idle_workers = 0; /* no idle threads */
99 jq->engine = engine; /* routine to run */
/* From here on the other jobq_* entry points accept this queue */
100 jq->valid = JOBQ_VALID;
101 /* Initialize the job queues */
102 jq->waiting_jobs = New(dlist(item, &item->link));
103 jq->running_jobs = New(dlist(item, &item->link));
104 jq->ready_jobs = New(dlist(item, &item->link));
109 * Destroy the job queue
111 * Returns: 0 on success
/*
 * jobq_destroy: shut the queue down and release what jobq_init() set up.
 *   jq - queue to destroy; it is rejected unless jq->valid is JOBQ_VALID
 * With jq->mutex held, jq->valid is cleared, idle workers are woken with a
 * broadcast on jq->work, and the caller waits on jq->work until
 * jq->num_workers has dropped to zero. Afterwards the mutex, the condition
 * variable and the thread attributes are destroyed and the three job lists
 * are deleted.
 * Returns the first non-zero status among the three destroy calls, else the
 * status of the last one (see the final return).
 * NOTE(review): the early return statements are not visible in this listing.
 */
114 int jobq_destroy(jobq_t *jq)
116 int stat, stat1, stat2;
118 if (jq->valid != JOBQ_VALID) {
121 if ((stat = pthread_mutex_lock(&jq->mutex)) != 0) {
123 Jmsg1(NULL, M_ERROR, 0, _("pthread_mutex_lock: ERR=%s\n"), be.bstrerror(stat));
126 jq->valid = 0; /* prevent any more operations */
129 * If any threads are active, wake them
131 if (jq->num_workers > 0) {
133 if (jq->idle_workers) {
134 if ((stat = pthread_cond_broadcast(&jq->work)) != 0) {
136 Jmsg1(NULL, M_ERROR, 0, _("pthread_cond_broadcast: ERR=%s\n"), be.bstrerror(stat));
137 pthread_mutex_unlock(&jq->mutex);
/*
 * Sleep on jq->work until the worker count is zero; jobq_server broadcasts
 * on the same condition variable when it sees num_workers == 0 on its way out.
 */
141 while (jq->num_workers > 0) {
142 if ((stat = pthread_cond_wait(&jq->work, &jq->mutex)) != 0) {
144 Jmsg1(NULL, M_ERROR, 0, _("pthread_cond_wait: ERR=%s\n"), be.bstrerror(stat));
145 pthread_mutex_unlock(&jq->mutex);
150 if ((stat = pthread_mutex_unlock(&jq->mutex)) != 0) {
152 Jmsg1(NULL, M_ERROR, 0, _("pthread_mutex_unlock: ERR=%s\n"), be.bstrerror(stat));
/* All three destroy calls are always attempted; their statuses are merged below */
155 stat = pthread_mutex_destroy(&jq->mutex);
156 stat1 = pthread_cond_destroy(&jq->work);
157 stat2 = pthread_attr_destroy(&jq->attr);
158 delete jq->waiting_jobs;
159 delete jq->running_jobs;
160 delete jq->ready_jobs;
161 return (stat != 0 ? stat : (stat1 != 0 ? stat1 : stat2));
170 * Wait until schedule time arrives before starting. Normally
171 * this routine is only used for jobs started from the console
172 * for which the user explicitly specified a start time. Otherwise
173 * most jobs are put into the job queue only when their
174 * scheduled time arrives.
/*
 * sched_wait: thread routine started by jobq_add() for a job whose
 * sched_time is still in the future.
 *   arg - a wait_pkt carrying the JCR and the job queue
 * Sets the job status to JS_WaitStartTime, then sleeps with bmicrosleep()
 * until the scheduled time, checking job_canceled() after each sleep and
 * recomputing the remaining time. At the end it drops its JCR reference with
 * free_jcr().
 * NOTE(review): the loop header, the 30 second cap mentioned in the comment
 * below, the hand-over of the job after the wait and the disposal of arg are
 * on lines missing from this listing - confirm against the full file.
 */
177 void *sched_wait(void *arg)
179 JCR *jcr = ((wait_pkt *)arg)->jcr;
180 jobq_t *jq = ((wait_pkt *)arg)->jq;
182 Dmsg0(2300, "Enter sched_wait.\n");
/* Seconds left until the scheduled start time */
184 time_t wtime = jcr->sched_time - time(NULL);
185 set_jcr_job_status(jcr, JS_WaitStartTime);
186 /* Wait until scheduled time arrives */
188 Jmsg(jcr, M_INFO, 0, _("Job %s waiting %d seconds for scheduled start time.\n"),
191 /* Check every 30 seconds if canceled */
/*
 * NOTE(review): wtime is a time_t but is passed for a %d conversion here;
 * confirm this is safe where time_t is wider than int.
 */
193 Dmsg3(2300, "Waiting on sched time, jobid=%d secs=%d use=%d\n",
194 jcr->JobId, wtime, jcr->use_count());
198 bmicrosleep(wtime, 0);
199 if (job_canceled(jcr)) {
/* Recompute the remaining time from the clock after every sleep */
202 wtime = jcr->sched_time - time(NULL);
204 Dmsg1(200, "resched use=%d\n", jcr->use_count());
206 free_jcr(jcr); /* we are done with jcr */
207 Dmsg0(2300, "Exit sched_wait\n");
212 * Add a job to the queue
213 * jq is a queue that was created with jobq_init
/*
 * jobq_add: hand a job to the queue.
 *   jq  - queue that was set up with jobq_init()
 *   jcr - job to queue; its use count is incremented on behalf of the queue
 *
 * The termination condition variable of the job is initialized on first use.
 * A job that is not canceled and whose sched_time is still in the future is
 * passed to a detached sched_wait thread. Otherwise a queue entry is
 * allocated under jq->mutex: a canceled job goes to the head of the ready
 * queue, any other job is inserted into the waiting queue in front of the
 * first entry with a numerically larger JobPriority, or appended when there
 * is no such entry. start_server() is then called so that a worker looks at
 * the queue.
 *
 * When the queue entry cannot be allocated the mutex is released and the
 * JCR reference is dropped before returning.
 *
 * NOTE(review): the line numbering embedded in this listing has gaps; the
 * return statements and a few short assignments are not visible here.
 */
215 int jobq_add(jobq_t *jq, JCR *jcr)
218 jobq_item_t *item, *li;
219 bool inserted = false;
/* Seconds until the scheduled start; a value of zero or less means start now */
220 time_t wtime = jcr->sched_time - time(NULL);
224 if (!jcr->term_wait_inited) {
225 /* Initialize termination condition variable */
226 if ((stat = pthread_cond_init(&jcr->term_wait, NULL)) != 0) {
228 Jmsg1(jcr, M_FATAL, 0, _("Unable to init job cond variable: ERR=%s\n"), be.bstrerror(stat));
231 jcr->term_wait_inited = true;
234 Dmsg3(2300, "jobq_add jobid=%d jcr=0x%x use_count=%d\n", jcr->JobId, jcr, jcr->use_count());
235 if (jq->valid != JOBQ_VALID) {
236 Jmsg0(jcr, M_ERROR, 0, "Jobq_add queue not initialized.\n");
240 jcr->inc_use_count(); /* mark jcr in use by us */
241 Dmsg3(2300, "jobq_add jobid=%d jcr=0x%x use_count=%d\n", jcr->JobId, jcr, jcr->use_count());
242 if (!job_canceled(jcr) && wtime > 0) {
243 set_thread_concurrency(jq->max_workers + 2);
/* NOTE(review): the result of this malloc is used without a visible NULL check */
244 sched_pkt = (wait_pkt *)malloc(sizeof(wait_pkt));
245 sched_pkt->jcr = jcr;
247 stat = pthread_create(&id, &jq->attr, sched_wait, (void *)sched_pkt);
248 if (stat != 0) { /* thread not created */
/* Report the call by its real name, pthread_create */
250 Jmsg1(jcr, M_ERROR, 0, _("pthread_create: ERR=%s\n"), be.bstrerror(stat));
255 if ((stat = pthread_mutex_lock(&jq->mutex)) != 0) {
257 Jmsg1(jcr, M_ERROR, 0, _("pthread_mutex_lock: ERR=%s\n"), be.bstrerror(stat));
258 free_jcr(jcr); /* release jcr */
262 if ((item = (jobq_item_t *)malloc(sizeof(jobq_item_t))) == NULL) {
/* The queue mutex is held at this point; release it before bailing out */
pthread_mutex_unlock(&jq->mutex);
263 free_jcr(jcr); /* release jcr */
268 if (job_canceled(jcr)) {
269 /* Add job to ready queue so that it is canceled quickly */
270 jq->ready_jobs->prepend(item);
271 Dmsg1(2300, "Prepended job=%d to ready queue\n", jcr->JobId);
273 /* Add this job to the wait queue in priority sorted order */
274 foreach_dlist(li, jq->waiting_jobs) {
275 Dmsg2(2300, "waiting item jobid=%d priority=%d\n",
276 li->jcr->JobId, li->jcr->JobPriority);
277 if (li->jcr->JobPriority > jcr->JobPriority) {
278 jq->waiting_jobs->insert_before(item, li);
/* First value is the job being inserted, second the waiting job it precedes */
279 Dmsg2(2300, "insert_before jobid=%d before waiting job=%d\n",
280 jcr->JobId, li->jcr->JobId);
285 /* If no jobs in wait queue, append it */
287 jq->waiting_jobs->append(item);
288 Dmsg1(2300, "Appended item jobid=%d to waiting queue\n", jcr->JobId);
292 /* Ensure that at least one server looks at the queue. */
293 stat = start_server(jq);
295 pthread_mutex_unlock(&jq->mutex);
296 Dmsg0(2300, "Return jobq_add\n");
301 * Remove a job from the job queue. Used only by cancel_job().
302 * jq is a queue that was created with jobq_init
303 * jcr is the job whose entry is to be taken off the waiting queue
305 * Note, it is "removed" from the job queue.
306 * If you want to cancel it, you need to provide some external means
307 * of doing so (e.g. pthread_kill()).
/*
 * jobq_remove: take a job off the waiting queue.
 *   jq  - queue that was set up with jobq_init()
 *   jcr - job to look for
 * With jq->mutex held, the waiting queue is searched for the entry whose jcr
 * pointer equals the one given. When it is not there the mutex is released
 * and the function returns early. Otherwise the entry is not freed: it is
 * moved to the head of the ready queue and start_server() is called so that
 * a worker picks it up.
 * NOTE(review): the local declarations, the bookkeeping inside the search
 * loop and the return statements are missing from this listing.
 */
309 int jobq_remove(jobq_t *jq, JCR *jcr)
315 Dmsg2(2300, "jobq_remove jobid=%d jcr=0x%x\n", jcr->JobId, jcr);
316 if (jq->valid != JOBQ_VALID) {
320 if ((stat = pthread_mutex_lock(&jq->mutex)) != 0) {
322 Jmsg1(NULL, M_ERROR, 0, _("pthread_mutex_lock: ERR=%s\n"), be.bstrerror(stat));
/* Entries are matched by JCR pointer identity */
326 foreach_dlist(item, jq->waiting_jobs) {
327 if (jcr == item->jcr) {
/* Not-found path: give the mutex back before leaving */
333 pthread_mutex_unlock(&jq->mutex);
334 Dmsg2(2300, "jobq_remove jobid=%d jcr=0x%x not in wait queue\n", jcr->JobId, jcr);
338 /* Move item to be the first on the list */
339 jq->waiting_jobs->remove(item);
340 jq->ready_jobs->prepend(item);
341 Dmsg2(2300, "jobq_remove jobid=%d jcr=0x%x moved to ready queue\n", jcr->JobId, jcr);
343 stat = start_server(jq);
345 pthread_mutex_unlock(&jq->mutex);
346 Dmsg0(2300, "Return jobq_remove\n");
352 * Start the server thread if it isn't already running
/*
 * start_server: make sure some worker thread will look at the queue.
 *   jq - the job queue; every caller visible in this file holds jq->mutex
 * When workers are idle they are woken with a broadcast on jq->work.
 * Otherwise, while fewer than jq->max_workers exist, a new jobq_server
 * thread is created with jq->attr (set to detached in jobq_init). When
 * neither condition holds nothing is done.
 * NOTE(review): the local declarations and the return statements are on
 * lines missing from this listing.
 */
354 static int start_server(jobq_t *jq)
360 * if any threads are idle, wake one.
361 * Actually we do a broadcast because on /lib/tls
362 * these signals seem to get lost from time to time.
364 if (jq->idle_workers > 0) {
365 Dmsg0(2300, "Signal worker to wake up\n");
366 if ((stat = pthread_cond_broadcast(&jq->work)) != 0) {
/* Name the call that actually failed: a broadcast, not a signal */
368 Jmsg1(NULL, M_ERROR, 0, _("pthread_cond_broadcast: ERR=%s\n"), be.bstrerror(stat));
371 } else if (jq->num_workers < jq->max_workers) {
372 Dmsg0(2300, "Create worker thread\n");
373 /* No idle threads so create a new one */
374 set_thread_concurrency(jq->max_workers + 1);
375 if ((stat = pthread_create(&id, &jq->attr, jobq_server, (void *)jq)) != 0) {
377 Jmsg1(NULL, M_ERROR, 0, _("pthread_create: ERR=%s\n"), be.bstrerror(stat));
386 * This is the worker thread that serves the job queue.
387 * When all the resources are acquired for the job,
388 * it will call the user's engine.
/*
 * jobq_server: body of a worker thread (created by start_server()).
 *   arg - the jobq_t being served
 * The thread takes jq->mutex on entry. According to the comments in the
 * body the lock is dropped around the call to the user engine, and the
 * visible V/P pair drops it again around the final cleanup of a finished job.
 * Each pass of the main loop does the following:
 *   1. If the previous pass found no work and no quit was requested, wait up
 *      to 4 seconds on jq->work.
 *   2. Run every entry of the ready queue: move it to the running queue,
 *      call the engine, give back the resource counters taken by
 *      acquire_resources(), and reschedule a failed backup job when the job
 *      resource asks for it (same JCR when no bytes were written, a fresh
 *      JCR through run_job() otherwise).
 *   3. Walk the waiting queue and move every job of the selected priority
 *      whose resources can be acquired (or which was canceled) to the ready
 *      queue.
 *   4. Leave the loop when quit is requested and nothing is ready, or when
 *      the wait timed out and nothing is ready.
 * NOTE(review): the line numbering embedded in this listing has gaps; loop
 * headers, worker counter updates, several unlock/lock statements and the
 * return statements are not visible here.
 */
391 void *jobq_server(void *arg)
393 struct timespec timeout;
394 jobq_t *jq = (jobq_t *)arg;
395 jobq_item_t *je; /* job entry in queue */
397 bool timedout = false;
400 Dmsg0(2300, "Start jobq_server\n");
401 if ((stat = pthread_mutex_lock(&jq->mutex)) != 0) {
403 Jmsg1(NULL, M_ERROR, 0, _("pthread_mutex_lock: ERR=%s\n"), be.bstrerror(stat));
412 Dmsg0(2300, "Top of for loop\n");
/* Only sleep on the condition variable when the last pass found nothing to do */
413 if (!work && !jq->quit) {
414 gettimeofday(&tv, &tz);
/* pthread_cond_timedwait takes an absolute deadline: now plus 4 seconds */
416 timeout.tv_sec = tv.tv_sec + 4;
420 * Wait 4 seconds, then if no more work, exit
422 Dmsg0(2300, "pthread_cond_timedwait()\n");
423 stat = pthread_cond_timedwait(&jq->work, &jq->mutex, &timeout);
424 if (stat == ETIMEDOUT) {
425 Dmsg0(2300, "timedwait timedout.\n");
428 } else if (stat != 0) {
429 /* This shouldn't happen */
430 Dmsg0(2300, "This shouldn't happen\n");
432 pthread_mutex_unlock(&jq->mutex);
439 * If anything is in the ready queue, run it
441 Dmsg0(2300, "Checking ready queue.\n");
442 while (!jq->ready_jobs->empty() && !jq->quit) {
444 je = (jobq_item_t *)jq->ready_jobs->first();
446 jq->ready_jobs->remove(je);
/* More ready work remains: get another worker going before this one runs its job */
447 if (!jq->ready_jobs->empty()) {
448 Dmsg0(2300, "ready queue not empty start server\n");
449 if (start_server(jq) != 0) {
451 pthread_mutex_unlock(&jq->mutex);
455 jq->running_jobs->append(je);
457 Dmsg1(2300, "Took jobid=%d from ready and appended to run\n", jcr->JobId);
459 /* Release job queue lock */
462 /* Call user's routine here */
463 Dmsg2(2300, "Calling user engine for jobid=%d use=%d\n", jcr->JobId,
467 Dmsg2(2300, "Back from user engine jobid=%d use=%d.\n", jcr->JobId,
470 /* Reacquire job queue lock */
472 Dmsg0(200, "Done lock mutex after running job. Release locks.\n");
473 jq->running_jobs->remove(je);
475 * Release locks if acquired. Note, they will not have
476 * been acquired for jobs canceled before they were
477 * put into the ready queue.
479 if (jcr->acquired_resource_locks) {
481 dec_write_store(jcr);
482 jcr->client->NumConcurrentJobs--;
483 jcr->job->NumConcurrentJobs--;
484 jcr->acquired_resource_locks = false;
488 * Reschedule the job if necessary and requested
/*
 * All of these must hold: rescheduling is enabled on the job resource, the
 * job neither terminated normally nor was canceled, it is a backup job, and
 * the retry limit (0 meaning no limit) has not been reached.
 */
490 if (jcr->job->RescheduleOnError &&
491 jcr->JobStatus != JS_Terminated &&
492 jcr->JobStatus != JS_Canceled &&
493 jcr->JobType == JT_BACKUP &&
494 (jcr->job->RescheduleTimes == 0 ||
495 jcr->reschedule_count < jcr->job->RescheduleTimes)) {
496 char dt[50], dt2[50];
499 * Reschedule this job by cleaning it up, but
500 * reuse the same JobId if possible.
502 time_t now = time(NULL);
503 jcr->reschedule_count++;
504 jcr->sched_time = now + jcr->job->RescheduleInterval;
505 bstrftime(dt, sizeof(dt), now);
506 bstrftime(dt2, sizeof(dt2), jcr->sched_time);
/*
 * NOTE(review): now (a time_t) and jcr->sched_time are passed for %u
 * conversions; confirm the widths agree on the target platforms.
 */
507 Dmsg4(2300, "Rescheduled Job %s to re-run in %d seconds.(now=%u,then=%u)\n", jcr->Job,
508 (int)jcr->job->RescheduleInterval, now, jcr->sched_time);
509 Jmsg(jcr, M_INFO, 0, _("Rescheduled Job %s at %s to re-run in %d seconds (%s).\n"),
510 jcr->Job, dt, (int)jcr->job->RescheduleInterval, dt2);
511 dird_free_jcr_pointers(jcr); /* partial cleanup old stuff */
513 set_jcr_job_status(jcr, JS_WaitStartTime);
514 jcr->SDJobStatus = 0;
/* No bytes were written, so the same JCR is simply queued again */
515 if (jcr->JobBytes == 0) {
516 Dmsg2(2300, "Requeue job=%d use=%d\n", jcr->JobId, jcr->use_count());
518 jobq_add(jq, jcr); /* queue the job to run again */
520 free_jcr(jcr); /* release jcr */
521 free(je); /* free the job entry */
522 continue; /* look for another job to run */
525 * Something was actually backed up, so we cannot reuse
526 * the old JobId or there will be database record
527 * conflicts. We now create a new job, copying the
528 * appropriate fields.
530 JCR *njcr = new_jcr(sizeof(JCR), dird_free_jcr);
531 set_jcr_defaults(njcr, jcr->job);
532 njcr->reschedule_count = jcr->reschedule_count;
533 njcr->sched_time = jcr->sched_time;
534 njcr->JobLevel = jcr->JobLevel;
535 njcr->JobStatus = -1;
536 set_jcr_job_status(njcr, jcr->JobStatus);
538 copy_rstorage(njcr, jcr->rstorage, _("previous Job"));
543 copy_wstorage(njcr, jcr->wstorage, _("previous Job"));
547 njcr->messages = jcr->messages;
548 Dmsg0(2300, "Call to run new job\n");
550 run_job(njcr); /* This creates a "new" job */
551 free_jcr(njcr); /* release "new" jcr */
553 Dmsg0(2300, "Back from running new job.\n");
555 /* Clean up and release old jcr */
556 Dmsg2(2300, "====== Termination job=%d use_cnt=%d\n", jcr->JobId, jcr->use_count());
557 jcr->SDJobStatus = 0;
558 V(jq->mutex); /* release internal lock */
560 free(je); /* release job entry */
561 P(jq->mutex); /* reacquire job queue lock */
564 * If any job in the wait queue can be run,
565 * move it to the ready queue
567 Dmsg0(2300, "Done check ready, now check wait queue.\n");
568 if (!jq->waiting_jobs->empty() && !jq->quit) {
570 je = (jobq_item_t *)jq->waiting_jobs->first();
571 jobq_item_t *re = (jobq_item_t *)jq->running_jobs->first();
/*
 * One priority is selected per pass: that of the first running job when
 * there is one, otherwise that of the first waiting job (the if/else lines
 * around these assignments are missing from this listing).
 */
573 Priority = re->jcr->JobPriority;
574 Dmsg2(2300, "JobId %d is running. Look for pri=%d\n", re->jcr->JobId, Priority);
576 Priority = je->jcr->JobPriority;
577 Dmsg1(2300, "No job running. Look for Job pri=%d\n", Priority);
580 * Walk down the list of waiting jobs and attempt
581 * to acquire the resources it needs.
584 /* je is current job item on the queue, jn is the next one */
/* The successor is fetched first because je may be unlinked from the list below */
586 jobq_item_t *jn = (jobq_item_t *)jq->waiting_jobs->next(je);
588 Dmsg3(2300, "Examining Job=%d JobPri=%d want Pri=%d\n",
589 jcr->JobId, jcr->JobPriority, Priority);
591 /* Take only jobs of correct Priority */
592 if (jcr->JobPriority != Priority) {
593 set_jcr_job_status(jcr, JS_WaitPriority);
597 if (!acquire_resources(jcr)) {
598 /* If resource conflict, job is canceled */
599 if (!job_canceled(jcr)) {
600 je = jn; /* point to next waiting job */
606 * Got all locks, now remove it from wait queue and append it
607 * to the ready queue. Note, we may also get here if the
608 * job was canceled. Once it is "run", it will quickly
611 jq->waiting_jobs->remove(je);
612 jq->ready_jobs->append(je);
613 Dmsg1(2300, "moved JobId=%d from wait to ready queue\n", je->jcr->JobId);
614 je = jn; /* Point to next waiting job */
619 Dmsg0(2300, "Done checking wait queue.\n");
621 * If no more ready work and we are asked to quit, then do it
623 if (jq->ready_jobs->empty() && jq->quit) {
625 if (jq->num_workers == 0) {
626 Dmsg0(2300, "Wake up destroy routine\n");
627 /* Wake up destroy routine if he is waiting */
628 pthread_cond_broadcast(&jq->work);
632 Dmsg0(2300, "Check for work request\n");
634 * If no more work requests, and we waited long enough, quit
636 Dmsg2(2300, "timedout=%d read empty=%d\n", timedout,
637 jq->ready_jobs->empty());
638 if (jq->ready_jobs->empty() && timedout) {
639 Dmsg0(2300, "break big loop\n");
644 work = !jq->ready_jobs->empty() || !jq->waiting_jobs->empty();
/*
 * NOTE(review): the text that follows says the lock is released around the
 * 2 second sleep; the unlock and relock statements are not visible in this
 * listing - confirm they exist in the full file.
 */
647 * If a job is waiting on a Resource, don't consume all
648 * the CPU time looping looking for work, and even more
649 * important, release the lock so that a job that has
650 * terminated can give us the resource.
653 bmicrosleep(2, 0); /* pause for 2 seconds */
655 /* Recompute work as something may have changed in last 2 secs */
656 work = !jq->ready_jobs->empty() || !jq->waiting_jobs->empty();
658 Dmsg1(2300, "Loop again. work=%d\n", work);
659 } /* end of big for loop */
661 Dmsg0(200, "unlock mutex\n");
663 Dmsg0(2300, "End jobq_server\n");
668 * See if we can acquire all the necessary resources for the job (JCR)
670 * Returns: true if successful
671 * false if resource failure
/*
 * acquire_resources: try to reserve every counted resource the job needs.
 *   jcr - job being considered for the ready queue
 * Reservations are attempted in this order: a concurrency slot on the read
 * storage (rstore), on the write storage (wstore), on the client and on the
 * job resource. Each is limited by the MaxConcurrentJobs of that resource.
 * When a reservation fails the job status is set to the matching
 * JS_Wait...Res value and earlier reservations are backed out
 * (dec_write_store and the client counter are visible here; NOTE(review):
 * the matching dec_read_store calls are presumably on lines missing from
 * this listing).
 * A job that names the same non-NULL storage for reading and writing is
 * marked JS_Canceled with a fatal message.
 * On success jcr->acquired_resource_locks is set to true.
 * Returns: true if successful, false if resource failure (the return
 * statements themselves are not visible in this listing).
 */
673 static bool acquire_resources(JCR *jcr)
675 bool skip_this_jcr = false;
677 jcr->acquired_resource_locks = false;
/*
 * rstore may be NULL (it is tested for that further down). Two NULL
 * pointers compare equal, and that must not be reported as a read/write
 * clash because the message below dereferences both stores.
 */
678 if (jcr->rstore && jcr->rstore == jcr->wstore) { /* deadlock */
679 Jmsg(jcr, M_FATAL, 0, _("Job canceled. Attempt to read and write same device.\n"
680 " Read storage \"%s\" (From %s) -- Write storage \"%s\" (From %s)\n"),
681 jcr->rstore->name(), jcr->rstore_source, jcr->wstore->name(), jcr->wstore_source);
682 set_jcr_job_status(jcr, JS_Canceled);
686 Dmsg1(200, "Rstore=%s\n", jcr->rstore->name());
/* A read job counts against both the read counter and the overall counter */
687 if (jcr->rstore->NumConcurrentJobs < jcr->rstore->MaxConcurrentJobs) {
688 jcr->rstore->NumConcurrentReadJobs++;
689 jcr->rstore->NumConcurrentJobs++;
690 Dmsg1(200, "Inc rncj=%d\n", jcr->rstore->NumConcurrentJobs);
692 Dmsg1(200, "Fail rncj=%d\n", jcr->rstore->NumConcurrentJobs);
693 set_jcr_job_status(jcr, JS_WaitStoreRes);
699 Dmsg1(200, "Wstore=%s\n", jcr->wstore->name());
700 if (jcr->wstore->NumConcurrentJobs < jcr->wstore->MaxConcurrentJobs) {
701 jcr->wstore->NumConcurrentJobs++;
702 Dmsg1(200, "Inc wncj=%d\n", jcr->wstore->NumConcurrentJobs);
703 } else if (jcr->rstore) {
705 skip_this_jcr = true;
707 Dmsg1(200, "Fail wncj=%d\n", jcr->wstore->NumConcurrentJobs);
708 skip_this_jcr = true;
712 set_jcr_job_status(jcr, JS_WaitStoreRes);
716 if (jcr->client->NumConcurrentJobs < jcr->client->MaxConcurrentJobs) {
717 jcr->client->NumConcurrentJobs++;
719 /* Back out previous locks */
720 dec_write_store(jcr);
722 set_jcr_job_status(jcr, JS_WaitClientRes);
725 if (jcr->job->NumConcurrentJobs < jcr->job->MaxConcurrentJobs) {
726 jcr->job->NumConcurrentJobs++;
728 /* Back out previous locks */
729 dec_write_store(jcr);
731 jcr->client->NumConcurrentJobs--;
732 set_jcr_job_status(jcr, JS_WaitJobRes);
/* Tells jobq_server that the counters must be given back after the job ran */
736 jcr->acquired_resource_locks = true;
/*
 * dec_read_store: give back the read-storage slot taken in
 * acquire_resources() by decrementing both NumConcurrentReadJobs and
 * NumConcurrentJobs of jcr->rstore, then assert that neither went negative.
 * NOTE(review): any guard for a NULL jcr->rstore is not visible in this
 * listing.
 */
740 static void dec_read_store(JCR *jcr)
743 jcr->rstore->NumConcurrentReadJobs--; /* back out rstore */
744 jcr->rstore->NumConcurrentJobs--; /* back out rstore */
/* Report the read store counter that was just decremented (label matches "Inc rncj") */
745 Dmsg1(200, "Dec rncj=%d\n", jcr->rstore->NumConcurrentJobs);
746 ASSERT(jcr->rstore->NumConcurrentReadJobs >= 0);
747 ASSERT(jcr->rstore->NumConcurrentJobs >= 0);
/*
 * dec_write_store: give back the write-storage slot by decrementing
 * jcr->wstore->NumConcurrentJobs, then assert that the count did not go
 * negative.
 * NOTE(review): any guard for a NULL jcr->wstore is not visible in this
 * listing.
 */
751 static void dec_write_store(JCR *jcr)
754 jcr->wstore->NumConcurrentJobs--;
755 Dmsg1(200, "Dec wncj=%d\n", jcr->wstore->NumConcurrentJobs);
756 ASSERT(jcr->wstore->NumConcurrentJobs >= 0);