3 * Bacula Director Job processing routines
5 * Kern Sibbald, October MM
10 Copyright (C) 2000-2003 Kern Sibbald and John Walker
12 This program is free software; you can redistribute it and/or
13 modify it under the terms of the GNU General Public License as
14 published by the Free Software Foundation; either version 2 of
15 the License, or (at your option) any later version.
17 This program is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public
23 License along with this program; if not, write to the Free
24 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/*
 * File-scope declarations for the job-processing code: prototypes of the
 * static helpers defined later in this file, prototypes of routines that
 * are declared extern here, and the shared state used by the
 * resource-lock routines.
 */
32 /* Forward referenced subroutines */
33 static void *job_thread(void *arg);
34 static int acquire_resource_locks(JCR *jcr);
36 static void backoff_resource_locks(JCR *jcr, int count);
37 static void release_resource_locks(JCR *jcr);
40 /* Exported subroutines */
43 /* Imported subroutines */
44 extern void term_scheduler();
45 extern void term_ua_server();
46 extern int do_backup(JCR *jcr);
47 extern int do_admin(JCR *jcr);
48 extern int do_restore(JCR *jcr);
49 extern int do_verify(JCR *jcr);
/*
 * job_lock: semaphore created with max_workers in init_job_server(). It
 * is the last lock tried (sem_trylock) in acquire_resource_locks() and is
 * released in release_resource_locks().
 */
52 static semlock_t job_lock;
/*
 * mutex: passed to pthread_cond_wait() together with resource_wait in
 * acquire_resource_locks(). Note that create_unique_job_name() declares
 * its own function-local static named "mutex", which hides this one
 * inside that function.
 */
53 static pthread_mutex_t mutex;
/*
 * resource_wait: condition variable waited on in
 * acquire_resource_locks() and broadcast by backoff_resource_locks() and
 * release_resource_locks().
 */
54 static pthread_cond_t resource_wait;
55 static int waiting = 0; /* count of waiting threads */
/*
 * Initialise the shared state used by the job routines in this file.
 *
 *   max_workers  passed to sem_init() as the count for job_lock and as
 *                the second argument of jobq_init() for job_queue, whose
 *                third argument is job_thread().
 *
 * The resource mutex and the resource_wait condition variable are
 * created with default (NULL) attributes.
 *
 * Each call's non-zero status is reported through Emsg1() with M_ABORT,
 * using strerror() of that status for the message text.
 *
 * NOTE(review): lines are missing from this listing inside the function,
 * so the declaration of "stat" and anything that may surround the
 * jobq_init() call (for example conditional compilation) is not visible
 * here -- confirm against the complete file.
 */
62 void init_job_server(int max_workers)
/* Global limit on concurrently running jobs. */
66 if ((stat = sem_init(&job_lock, max_workers)) != 0) {
67 Emsg1(M_ABORT, 0, _("Could not init job lock: ERR=%s\n"), strerror(stat));
69 if ((stat = pthread_mutex_init(&mutex, NULL)) != 0) {
70 Emsg1(M_ABORT, 0, _("Could not init resource mutex: ERR=%s\n"), strerror(stat));
72 if ((stat = pthread_cond_init(&resource_wait, NULL)) != 0) {
73 Emsg1(M_ABORT, 0, _("Could not init resource wait: ERR=%s\n"), strerror(stat));
/* Job queue whose worker routine is job_thread(). */
78 if ((stat = jobq_init(&job_queue, max_workers, job_thread)) != 0) {
79 Emsg1(M_ABORT, 0, _("Could not init job queue: ERR=%s\n"), strerror(stat));
87 * Run a job -- typically called by the scheduler, but may also
88 * be called by the UA (Console program).
/*
 * Prepare a job for execution and hand it over to job_thread().
 *
 *   jcr  job control record. Reads jcr->job, jcr->messages, jcr->catalog,
 *        jcr->sched_time, jcr->JobType and jcr->JobLevel; fills in
 *        jcr->Job and jcr->start_time (via create_unique_job_name()),
 *        jcr->jr, jcr->term_wait, jcr->db and jcr->JobId.
 *
 * Visible steps, in order:
 *   1. Message handling is initialised, a unique job name is created and
 *      the status is set to JS_Created.
 *   2. The catalog job record jcr->jr is filled from the JCR.
 *   3. jcr->term_wait is initialised; failure => fatal Jmsg and status
 *      JS_ErrorTerminated.
 *   4. The catalog database is initialised and opened; failure => fatal
 *      Jmsg and status JS_ErrorTerminated.
 *   5. The job record is created in the catalog; failure => fatal Jmsg
 *      and status JS_ErrorTerminated. On success the new JobId is copied
 *      to jcr->JobId and asserted to be > 0.
 *   6. The job is handed to job_thread().
 *
 * NOTE(review): lines are missing from this listing. What follows each
 * failure branch (cleanup / return) is not visible.
 * NOTE(review): both a direct pthread_create() of job_thread() and a
 * jobq_add() are visible below. Presumably only one of them is compiled
 * in -- confirm, since running both would start the job twice.
 */
91 void run_job(JCR *jcr)
98 sm_check(__FILE__, __LINE__, True);
99 init_msg(jcr, jcr->messages);
100 create_unique_job_name(jcr, jcr->job->hdr.name);
101 set_jcr_job_status(jcr, JS_Created);
/* Seed the catalog job record from the JCR. */
102 jcr->jr.SchedTime = jcr->sched_time;
103 jcr->jr.StartTime = jcr->start_time;
104 jcr->jr.Type = jcr->JobType;
105 jcr->jr.Level = jcr->JobLevel;
106 jcr->jr.JobStatus = jcr->JobStatus;
107 bstrncpy(jcr->jr.Name, jcr->job->hdr.name, sizeof(jcr->jr.Name));
108 bstrncpy(jcr->jr.Job, jcr->Job, sizeof(jcr->jr.Job));
110 /* Initialize termination condition variable */
111 if ((errstat = pthread_cond_init(&jcr->term_wait, NULL)) != 0) {
112 Jmsg1(jcr, M_FATAL, 0, _("Unable to init job cond variable: ERR=%s\n"), strerror(errstat));
113 set_jcr_job_status(jcr, JS_ErrorTerminated);
121 Dmsg0(50, "Open database\n");
122 jcr->db=db_init_database(jcr, jcr->catalog->db_name, jcr->catalog->db_user,
123 jcr->catalog->db_password, jcr->catalog->db_address,
124 jcr->catalog->db_port, jcr->catalog->db_socket);
/* This branch is taken both when init returned NULL and when open failed. */
125 if (!jcr->db || !db_open_database(jcr, jcr->db)) {
126 Jmsg(jcr, M_FATAL, 0, _("Could not open database \"%s\".\n"),
127 jcr->catalog->db_name);
/*
 * NOTE(review): jcr->db may be NULL in this branch; any guard around the
 * db_strerror() call is not visible in this listing -- confirm.
 */
129 Jmsg(jcr, M_FATAL, 0, "%s", db_strerror(jcr->db));
131 set_jcr_job_status(jcr, JS_ErrorTerminated);
135 Dmsg0(50, "DB opened\n");
/* Refresh the status in the record before it is written to the catalog. */
140 jcr->jr.JobStatus = jcr->JobStatus;
141 if (!db_create_job_record(jcr, jcr->db, &jcr->jr)) {
142 Jmsg(jcr, M_FATAL, 0, "%s", db_strerror(jcr->db));
143 set_jcr_job_status(jcr, JS_ErrorTerminated);
147 jcr->JobId = jcr->jr.JobId;
148 ASSERT(jcr->jr.JobId > 0);
150 Dmsg4(50, "Created job record JobId=%d Name=%s Type=%c Level=%c\n",
151 jcr->JobId, jcr->Job, jcr->jr.Type, jcr->jr.Level);
152 Dmsg0(200, "Add jrc to work queue\n");
/* Variant 1: start job_thread() directly in a new thread. */
155 if ((stat = pthread_create(&tid, NULL, job_thread, (void *)jcr)) != 0) {
156 Emsg1(M_ABORT, 0, _("Unable to create job thread: ERR=%s\n"), strerror(stat));
160 /* Queue the job to be run */
161 if ((stat = jobq_add(&job_queue, jcr)) != 0) {
162 Emsg1(M_ABORT, 0, _("Could not add job queue: ERR=%s\n"), strerror(stat));
166 Dmsg0(100, "Done run_job()\n");
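/*
 * This is the engine that runs one job. It is started for a job either
 * directly through pthread_create() in run_job() or as the routine
 * registered with jobq_init() when the job is pulled from the job queue.
 * At this point, we are running in our own thread.
 */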
/*
 * Thread body that executes one job.
 *
 *   arg  the job's JCR, passed as void *.
 *
 * Visible phases:
 *   - The thread detaches itself.
 *   - acquire_resource_locks() is called; a zero return sets the status
 *     to JS_Canceled.
 *   - The real start time is recorded and the status becomes JS_Running.
 *   - If the job is already canceled only the end record is updated. If
 *     MaxStartDelay is non-zero and smaller than start_time - sched_time,
 *     a fatal message is issued, the status becomes JS_Canceled and the
 *     end record is updated.
 *   - If RunBeforeJob is set, the edited command is run through
 *     open_bpipe(), each output line is logged as M_INFO, and the status
 *     from close_bpipe() is reported as fatal (JS_FatalError) according
 *     to the message text when it is non-zero.
 *   - The job is dispatched on jcr->JobType; an unknown type is reported
 *     with Pmsg1().
 *   - RunAfterJob is handled the same way as RunBeforeJob.
 *   - Resource locks are released and the reschedule-on-error rules are
 *     applied (see below).
 *   - The catalog database is closed.
 *
 * NOTE(review): lines are missing from this listing (loop header, case
 * labels, the calls made for each job type, branch exits and the return
 * statement), so the exact control flow must be confirmed against the
 * complete file.
 */
174 static void *job_thread(void *arg)
176 JCR *jcr = (JCR *)arg;
178 pthread_detach(pthread_self());
179 sm_check(__FILE__, __LINE__, True);
/* A zero return from acquire_resource_locks() is treated as cancellation. */
182 if (!acquire_resource_locks(jcr)) {
183 set_jcr_job_status(jcr, JS_Canceled);
186 Dmsg0(200, "=====Start Job=========\n");
187 jcr->start_time = time(NULL); /* set the real start time */
188 set_jcr_job_status(jcr, JS_Running);
190 if (job_canceled(jcr)) {
191 update_job_end_record(jcr);
/* MaxStartDelay == 0 disables the start-delay check. */
192 } else if (jcr->job->MaxStartDelay != 0 && jcr->job->MaxStartDelay <
193 (utime_t)(jcr->start_time - jcr->sched_time)) {
194 Jmsg(jcr, M_FATAL, 0, _("Job canceled because max start delay time exceeded.\n"));
195 set_jcr_job_status(jcr, JS_Canceled);
196 update_job_end_record(jcr);
200 if (jcr->job->RunBeforeJob) {
201 POOLMEM *before = get_pool_memory(PM_FNAME);
204 char line[MAXSTRING];
206 before = edit_job_codes(jcr, before, jcr->job->RunBeforeJob, "");
207 bpipe = open_bpipe(before, 0, "r");
/* The command string is no longer needed once the pipe has been opened. */
208 free_pool_memory(before);
/*
 * NOTE(review): bpipe is dereferenced without a visible NULL check --
 * confirm that open_bpipe() cannot return NULL, or add a check.
 */
209 while (fgets(line, sizeof(line), bpipe->rfd)) {
210 Jmsg(jcr, M_INFO, 0, _("RunBefore: %s"), line);
212 status = close_bpipe(bpipe);
214 Jmsg(jcr, M_FATAL, 0, _("RunBeforeJob returned non-zero status=%d\n"),
216 set_jcr_job_status(jcr, JS_FatalError);
217 update_job_end_record(jcr);
/* Dispatch on the job type; the case labels are not visible in this listing. */
221 switch (jcr->JobType) {
224 if (jcr->JobStatus == JS_Terminated) {
230 if (jcr->JobStatus == JS_Terminated) {
236 if (jcr->JobStatus == JS_Terminated) {
242 if (jcr->JobStatus == JS_Terminated) {
247 Pmsg1(0, "Unimplemented job type: %d\n", jcr->JobType);
250 if (jcr->job->RunAfterJob) {
251 POOLMEM *after = get_pool_memory(PM_FNAME);
254 char line[MAXSTRING];
256 after = edit_job_codes(jcr, after, jcr->job->RunAfterJob, "");
257 bpipe = open_bpipe(after, 0, "r");
258 free_pool_memory(after);
/* NOTE(review): same unchecked open_bpipe() result as for RunBeforeJob. */
259 while (fgets(line, sizeof(line), bpipe->rfd)) {
260 Jmsg(jcr, M_INFO, 0, _("RunAfter: %s"), line);
262 status = close_bpipe(bpipe);
264 Jmsg(jcr, M_FATAL, 0, _("RunAfterJob returned non-zero status=%d\n"),
266 set_jcr_job_status(jcr, JS_FatalError);
267 update_job_end_record(jcr);
273 release_resource_locks(jcr);
/*
 * Reschedule only when the job asks for it, did not end as
 * JS_Terminated or JS_Canceled, and the retry budget
 * (RescheduleTimes) is not yet used up.
 */
274 if (jcr->job->RescheduleOnError &&
275 jcr->JobStatus != JS_Terminated &&
276 jcr->JobStatus != JS_Canceled &&
277 jcr->job->RescheduleTimes > 0 &&
278 jcr->reschedule_count < jcr->job->RescheduleTimes) {
281 * Reschedule this job by cleaning it up, but
282 * reuse the same JobId if possible.
284 jcr->reschedule_count++;
285 jcr->sched_time = time(NULL) + jcr->job->RescheduleInterval;
286 Dmsg2(100, "Rescheduled Job %s to re-run in %d seconds.\n", jcr->Job,
287 (int)jcr->job->RescheduleInterval);
288 jcr->JobStatus = JS_Created; /* force new status */
289 dird_free_jcr(jcr); /* partial cleanup old stuff */
/* No bytes were written, so the same JCR (and JobId) is run again. */
290 if (jcr->JobBytes == 0) {
291 continue; /* reschedule the job */
294 * Something was actually backed up, so we cannot reuse
295 * the old JobId or there will be database record
296 * conflicts. We now create a new job, copying the
297 * appropriate fields.
299 JCR *njcr = new_jcr(sizeof(JCR), dird_free_jcr);
300 set_jcr_defaults(njcr, jcr->job);
/* Carry over the settings of the failed run into the new JCR. */
301 njcr->reschedule_count = jcr->reschedule_count;
302 njcr->JobLevel = jcr->JobLevel;
303 njcr->JobStatus = jcr->JobStatus;
304 njcr->pool = jcr->pool;
305 njcr->store = jcr->store;
306 njcr->messages = jcr->messages;
315 Dmsg0(200, "Close DB\n");
316 db_close_database(jcr, jcr->db);
321 Dmsg0(50, "======== End Job ==========\n");
322 sm_check(__FILE__, __LINE__, True);
327 * Acquire the resources needed. These locks limit the
328 * number of jobs by each resource. We have limits on
329 * Jobs, Clients, Storage, and total jobs.
/*
 * Wait for the scheduled start time and then obtain every resource lock
 * the job needs.
 *
 *   jcr  job control record. Reads jcr->sched_time and the semaphores
 *        of jcr->job, jcr->client and jcr->store; sets
 *        jcr->acquired_resource_locks to true on success.
 *
 * Returns an int; the caller in job_thread() treats zero as "not
 * acquired" and marks the job canceled. The return statements
 * themselves are not visible in this listing.
 *
 * Visible steps:
 *   1. While the scheduled time lies in the future the status is
 *      JS_WaitStartTime and the thread sleeps with bmicrosleep(),
 *      checking job_canceled() after each sleep.
 *   2. The Storage, Client and Job semaphores are created on first use,
 *      each with the resource's MaxConcurrentJobs.
 *   3. The Job semaphore is taken with sem_lock(); the Client, Storage
 *      and global job_lock semaphores are then tried with
 *      sem_trylock(). When a trylock fails, backoff_resource_locks() is
 *      called with the number of locks already held (1, 2 or 3).
 *   4. After a back-off the thread checks for cancellation and waits on
 *      resource_wait before trying again.
 *
 * NOTE(review): lines are missing from this listing (loop headers, the
 * tests that separate "busy" from a real error, lock/unlock of the
 * mutex, returns) -- confirm details against the complete file.
 */
331 static int acquire_resource_locks(JCR *jcr)
334 time_t now = time(NULL);
335 time_t wtime = jcr->sched_time - now;
337 /* Wait until scheduled time arrives */
338 if (wtime > 0 && verbose) {
339 Jmsg(jcr, M_INFO, 0, _("Job %s waiting %d seconds for scheduled start time.\n"),
341 set_jcr_job_status(jcr, JS_WaitStartTime);
343 /* Check every 30 seconds if canceled */
/*
 * NOTE(review): wtime is a time_t but is printed with %d below; where
 * time_t is wider than int the argument does not match the conversion
 * -- consider an explicit (int) cast.
 */
345 Dmsg2(100, "Waiting on sched time, jobid=%d secs=%d\n", jcr->JobId, wtime);
349 bmicrosleep(wtime, 0);
350 if (job_canceled(jcr)) {
/* Recompute the remaining wait after each sleep. */
353 wtime = jcr->sched_time - time(NULL);
361 /* Initialize semaphores */
/*
 * NOTE(review): these are check-then-init sequences on shared resource
 * records; any locking around them is not visible in this listing --
 * confirm that two job threads cannot initialise the same semaphore.
 */
362 if (jcr->store->sem.valid != SEMLOCK_VALID) {
363 if ((stat = sem_init(&jcr->store->sem, jcr->store->MaxConcurrentJobs)) != 0) {
364 Emsg1(M_ABORT, 0, _("Could not init Storage semaphore: ERR=%s\n"), strerror(stat));
367 if (jcr->client->sem.valid != SEMLOCK_VALID) {
368 if ((stat = sem_init(&jcr->client->sem, jcr->client->MaxConcurrentJobs)) != 0) {
369 Emsg1(M_ABORT, 0, _("Could not init Client semaphore: ERR=%s\n"), strerror(stat));
372 if (jcr->job->sem.valid != SEMLOCK_VALID) {
373 if ((stat = sem_init(&jcr->job->sem, jcr->job->MaxConcurrentJobs)) != 0) {
374 Emsg1(M_ABORT, 0, _("Could not init Job semaphore: ERR=%s\n"), strerror(stat));
379 /* Acquire semaphore */
/* The Job semaphore is the only one taken with sem_lock() rather than sem_trylock(). */
380 set_jcr_job_status(jcr, JS_WaitJobRes);
381 if ((stat = sem_lock(&jcr->job->sem)) != 0) {
382 Emsg1(M_ABORT, 0, _("Could not acquire Job max jobs lock: ERR=%s\n"), strerror(stat));
384 set_jcr_job_status(jcr, JS_WaitClientRes);
385 if ((stat = sem_trylock(&jcr->client->sem)) != 0) {
/* One lock (Job) is held at this point. */
387 backoff_resource_locks(jcr, 1);
390 Emsg1(M_ABORT, 0, _("Could not acquire Client max jobs lock: ERR=%s\n"), strerror(stat));
393 set_jcr_job_status(jcr, JS_WaitStoreRes);
394 if ((stat = sem_trylock(&jcr->store->sem)) != 0) {
/* Two locks (Job, Client) are held at this point. */
396 backoff_resource_locks(jcr, 2);
399 Emsg1(M_ABORT, 0, _("Could not acquire Storage max jobs lock: ERR=%s\n"), strerror(stat));
402 set_jcr_job_status(jcr, JS_WaitMaxJobs);
403 if ((stat = sem_trylock(&job_lock)) != 0) {
/* Three locks (Job, Client, Storage) are held at this point. */
405 backoff_resource_locks(jcr, 3);
408 Emsg1(M_ABORT, 0, _("Could not acquire max jobs lock: ERR=%s\n"), strerror(stat));
414 if (job_canceled(jcr)) {
419 * Wait for a resource to be released either by backoff or
420 * by a job terminating.
423 pthread_cond_wait(&resource_wait, &mutex);
/* Recorded so that release_resource_locks() knows there is something to release. */
428 jcr->acquired_resource_locks = true;
435 * We could not get all the resource locks because
436 * too many jobs are running, so release any locks
437 * we did acquire, giving others a chance to use them
/*
 * Give back the locks taken so far after a later lock could not be
 * obtained.
 *
 *   jcr    job control record whose store, client and job semaphores
 *          are unlocked.
 *   count  number of locks already held, as passed by
 *          acquire_resource_locks(): 1 (Job), 2 (Job + Client) or
 *          3 (Job + Client + Storage).
 *
 * The unlocks appear in the order Storage, Client, Job with deliberate
 * fall-through between them, followed by a broadcast on resource_wait.
 *
 * NOTE(review): the selection on "count" (presumably a switch with one
 * label per unlock) and any condition guarding the broadcast are not
 * visible in this listing -- confirm.
 */
440 static void backoff_resource_locks(JCR *jcr, int count)
445 sem_unlock(&jcr->store->sem);
446 /* Fall through wanted */
448 sem_unlock(&jcr->client->sem);
449 /* Fall through wanted */
451 sem_unlock(&jcr->job->sem);
455 * Since we released a lock, if there are any threads
456 * waiting, wake them up so that they can try again.
459 pthread_cond_broadcast(&resource_wait);
466 * This is called at the end of the job to release
467 * any resource limits on the number of jobs. If
468 * there are any other jobs waiting, we wake them
469 * up so that they can try again.
/*
 * Release all four locks taken by acquire_resource_locks() at the end
 * of a job.
 *
 *   jcr  job control record. Nothing is done when
 *        jcr->acquired_resource_locks is false; otherwise the flag is
 *        cleared after the locks have been released.
 *
 * The Storage, Client and Job semaphores and the global job_lock are
 * unlocked, then resource_wait is broadcast.
 *
 * NOTE(review): any mutex handling or condition around the broadcast is
 * not visible in this listing -- confirm.
 */
472 static void release_resource_locks(JCR *jcr)
474 if (!jcr->acquired_resource_locks) {
475 return; /* Job canceled, no locks acquired */
478 sem_unlock(&jcr->store->sem);
479 sem_unlock(&jcr->client->sem);
480 sem_unlock(&jcr->job->sem);
481 sem_unlock(&job_lock);
483 pthread_cond_broadcast(&resource_wait);
/* Makes a repeated call a no-op. */
485 jcr->acquired_resource_locks = false;
491 * Get or create a Client record for this Job
/*
 * Create the catalog Client record for this job through
 * db_create_client_record() and remember its id.
 *
 *   jcr  job control record. Reads jcr->client and jcr->db; writes
 *        jcr->client_name, jcr->client_uname and jcr->jr.ClientId.
 *
 * A local client record "cr" is zeroed and filled with the client's
 * name, AutoPrune flag and File/Job retention values before the
 * database call. If the call fails a fatal Jmsg with the database
 * error text is issued.
 *
 * Returns an int.
 * NOTE(review): the return statements and the declaration of "cr" are
 * not visible in this listing, so the success/failure values must be
 * confirmed against the complete file.
 */
493 int get_or_create_client_record(JCR *jcr)
497 memset(&cr, 0, sizeof(cr));
498 bstrncpy(cr.Name, jcr->client->hdr.name, sizeof(cr.Name));
499 cr.AutoPrune = jcr->client->AutoPrune;
500 cr.FileRetention = jcr->client->FileRetention;
501 cr.JobRetention = jcr->client->JobRetention;
/* Allocate the pool buffer for the client name only once. */
502 if (!jcr->client_name) {
503 jcr->client_name = get_pool_memory(PM_NAME);
505 pm_strcpy(&jcr->client_name, jcr->client->hdr.name);
506 if (!db_create_client_record(jcr, jcr->db, &cr)) {
507 Jmsg(jcr, M_FATAL, 0, _("Could not create Client record. ERR=%s\n"),
508 db_strerror(jcr->db));
/* The database call filled in cr.ClientId. */
511 jcr->jr.ClientId = cr.ClientId;
513 if (!jcr->client_uname) {
514 jcr->client_uname = get_pool_memory(PM_NAME);
516 pm_strcpy(&jcr->client_uname, cr.Uname);
518 Dmsg2(100, "Created Client %s record %d\n", jcr->client->hdr.name,
525 * Write status and such in DB
/*
 * Copy the job's final state into its catalog record and write it with
 * db_update_job_end_record().
 *
 *   jcr  job control record. jcr->jr is updated from jcr->JobId,
 *        JobStatus, JobFiles, JobBytes, VolSessionId and
 *        VolSessionTime; jcr->end_time is set from jcr->jr.EndTime.
 *
 * The end time is stamped with the current time only when it is still
 * zero, so a value set earlier is kept. A failed database update is
 * reported as M_WARNING with the database error text.
 */
527 void update_job_end_record(JCR *jcr)
529 if (jcr->jr.EndTime == 0) {
530 jcr->jr.EndTime = time(NULL);
532 jcr->end_time = jcr->jr.EndTime;
533 jcr->jr.JobId = jcr->JobId;
534 jcr->jr.JobStatus = jcr->JobStatus;
535 jcr->jr.JobFiles = jcr->JobFiles;
536 jcr->jr.JobBytes = jcr->JobBytes;
537 jcr->jr.VolSessionId = jcr->VolSessionId;
538 jcr->jr.VolSessionTime = jcr->VolSessionTime;
539 if (!db_update_job_end_record(jcr, jcr->db, &jcr->jr)) {
540 Jmsg(jcr, M_WARNING, 0, _("Error updating job record. %s"),
541 db_strerror(jcr->db));
546 * Takes base_name and appends (unique) current
547 * date and time to form unique job name.
549 * Returns: unique job name in jcr->Job
550 * date/time in jcr->start_time
/*
 * Build a job name of the form "<base_name>.<YYYY-MM-DD_HH.MM.SS>" in
 * jcr->Job and store the time used for it in jcr->start_time.
 *
 *   jcr        job control record; jcr->Job and jcr->start_time are
 *              written.
 *   base_name  name that receives the date/time suffix; it is copied
 *              into a local buffer and is not modified.
 *
 * Uniqueness comes from the time stamp: while holding a static mutex
 * the routine sleeps in 0.5 second steps for as long as "now" equals
 * the start time handed out last, then records the new value.
 *
 * NOTE(review): the static "mutex" declared here hides the file-scope
 * variable of the same name inside this function.
 * NOTE(review): the assignments to "now" and the body of the final loop
 * are not visible in this listing.
 */
552 void create_unique_job_name(JCR *jcr, char *base_name)
554 /* Job start mutex */
555 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
556 static time_t last_start_time = 0;
559 char dt[MAX_TIME_LENGTH];
560 char name[MAX_NAME_LENGTH];
563 /* Guarantee unique start time -- maximum one per second, and
564 * thus unique Job Name
566 P(mutex); /* lock creation of jobs */
568 while (now == last_start_time) {
569 bmicrosleep(0, 500000);
572 last_start_time = now;
573 V(mutex); /* allow creation of jobs */
574 jcr->start_time = now;
575 /* Form Unique JobName */
576 localtime_r(&now, &tm);
577 /* Use only characters that are permitted in Windows filenames */
/* This format yields a 19-character time stamp. */
578 strftime(dt, sizeof(dt), "%Y-%m-%d_%H.%M.%S", &tm);
579 bstrncpy(name, base_name, sizeof(name));
/* Cut the base name so that "." plus the time stamp can follow it. */
580 name[sizeof(name)-22] = 0; /* truncate if too long */
581 bsnprintf(jcr->Job, sizeof(jcr->Job), "%s.%s", name, dt); /* add date & time */
582 /* Convert spaces into underscores */
583 for (p=jcr->Job; *p; p++) {
591 * Free the Job Control Record if no one is still using it.
592 * Called from main free_jcr() routine in src/lib/jcr.c so
593 * that we can do our Director specific cleanup of the jcr.
/*
 * Director-specific cleanup of a JCR.
 *
 *   jcr  job control record whose Director-owned members are released.
 *
 * Visible actions: the Storage daemon auth key and the restore
 * bootstrap string are released with free(); the File and Storage
 * daemon sockets are closed with bnet_close(); fname, stime and
 * client_uname are returned with free_pool_memory(). The members that
 * are visibly reset are set to NULL afterwards.
 *
 * Besides being registered as the free routine for new JCRs, this
 * function is also called directly by job_thread() as a partial cleanup
 * before a job is rescheduled.
 *
 * NOTE(review): lines are missing from this listing. Guards around the
 * fname and stime frees, and possibly further members, are not visible
 * -- confirm against the complete file.
 */
595 void dird_free_jcr(JCR *jcr)
597 Dmsg0(200, "Start dird free_jcr\n");
599 if (jcr->sd_auth_key) {
600 free(jcr->sd_auth_key);
601 jcr->sd_auth_key = NULL;
607 if (jcr->file_bsock) {
608 Dmsg0(200, "Close File bsock\n");
609 bnet_close(jcr->file_bsock);
610 jcr->file_bsock = NULL;
612 if (jcr->store_bsock) {
613 Dmsg0(200, "Close Store bsock\n");
614 bnet_close(jcr->store_bsock);
615 jcr->store_bsock = NULL;
618 Dmsg0(200, "Free JCR fname\n");
619 free_pool_memory(jcr->fname);
623 Dmsg0(200, "Free JCR stime\n");
624 free_pool_memory(jcr->stime);
627 if (jcr->RestoreBootstrap) {
628 free(jcr->RestoreBootstrap);
629 jcr->RestoreBootstrap = NULL;
631 if (jcr->client_uname) {
632 free_pool_memory(jcr->client_uname);
633 jcr->client_uname = NULL;
635 Dmsg0(200, "End dird free_jcr\n");
639 * Set some defaults in the JCR necessary to
640 * run. These items are pulled from the job
641 * definition as defaults, but can be overridden
642 * later either by the Run record in the Schedule resource,
643 * or by the Console program.
/*
 * Fill a JCR with the defaults taken from a Job resource.
 *
 *   jcr  job control record to fill.
 *   job  Job resource supplying type, level, priority, storage, client,
 *        pool, fileset, messages and the restore bootstrap file; the
 *        catalog is taken from the job's client.
 *
 * jcr->client_name is allocated from pool memory on first use and set
 * to the client's name. When the resulting JobLevel is 0, a default
 * level is chosen according to JobType.
 *
 * NOTE(review): lines are missing from this listing and the end of the
 * function is not visible; in particular the case labels that select
 * between L_VERIFY_CATALOG, L_INCREMENTAL and L_FULL cannot be seen.
 */
645 void set_jcr_defaults(JCR *jcr, JOB *job)
648 jcr->JobType = job->JobType;
649 jcr->JobLevel = job->level;
650 jcr->JobPriority = job->Priority;
651 jcr->store = job->storage;
652 jcr->client = job->client;
653 if (!jcr->client_name) {
654 jcr->client_name = get_pool_memory(PM_NAME);
656 pm_strcpy(&jcr->client_name, jcr->client->hdr.name);
657 jcr->pool = job->pool;
658 jcr->catalog = job->client->catalog;
659 jcr->fileset = job->fileset;
660 jcr->messages = job->messages;
/*
 * NOTE(review): in the visible lines the old bootstrap string is freed
 * but the pointer is not reset. If job->RestoreBootstrap is NULL the
 * JCR would keep a dangling pointer that dird_free_jcr() frees again
 * -- confirm whether a "jcr->RestoreBootstrap = NULL;" is needed here.
 */
661 if (jcr->RestoreBootstrap) {
662 free(jcr->RestoreBootstrap);
664 /* This can be overridden by Console program */
665 if (job->RestoreBootstrap) {
666 jcr->RestoreBootstrap = bstrdup(job->RestoreBootstrap);
668 /* If no default level given, set one */
669 if (jcr->JobLevel == 0) {
670 switch (jcr->JobType) {
672 jcr->JobLevel = L_VERIFY_CATALOG;
675 jcr->JobLevel = L_INCREMENTAL;
679 jcr->JobLevel = L_FULL;